Coverage for python/lsst/daf/butler/registry/queries/_query.py: 14%
204 statements
coverage.py v6.5.0, created at 2023-03-01 02:25 -0800
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ()

from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from contextlib import contextmanager
from typing import Any, cast, final

from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
)
from ..wildcards import CollectionWildcard
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader


@final
class Query:
    """A general-purpose representation of a registry query.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        The dimensions that span the query and are used to join its relations
        together.
    backend : `QueryBackend`
        Backend object used to create the query and new ones derived from it.
    context : `QueryContext`
        Context manager that holds relation engines and database connections
        for the query.
    relation : `Relation`
        The relation tree representation of the query as a series of
        operations on tables.
    governor_constraints : `Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]
        Constraints on governor dimensions encoded in this query's relation.
        This is a mapping from governor dimension name to sets of values that
        dimension may take.
    is_deferred : `bool`
        If `True`, modifier methods that return a related `Query` object
        should not immediately execute the new query.
    has_record_columns : `bool` or `DimensionElement`
        Whether this query's relation already includes columns for all or
        some dimension element records: `True` means all elements in
        ``dimensions`` either have records present in ``record_caches`` or
        all columns present in ``relation``, while a specific
        `DimensionElement` means that element does.
    record_caches : `Mapping` [ `DimensionElement`, `Mapping`
            [ `DataCoordinate`, `DimensionRecord` ] ], optional
        Cached dimension record values, organized first by dimension element
        and then by data ID.

    Notes
    -----
    Iterating over a `Query` yields mappings from `ColumnTag` to the
    associated value for each row. The `iter_data_ids`, `iter_dataset_refs`,
    and `iter_dimension_records` methods can be used to instead iterate over
    various butler primitives derived from these rows.

    Iterating over a `Query` may or may not execute database queries again
    each time, depending on the state of its relation tree - see `Query.run`
    for details.

    Query is immutable; all methods that might appear to modify it in place
    actually return a new object (though many attributes will be shared).

    Query is currently (still) an internal-to-Registry object, with only the
    "QueryResults" classes that are backed by it directly exposed to users.
    It has been designed with the intent that it will eventually play a
    larger role, either as the main query result object in a redesigned query
    interface, or a "power user" result option that accompanies simpler
    replacements for the current "QueryResults" objects.
    """

    def __init__(
        self,
        dimensions: DimensionGraph,
        backend: QueryBackend[QueryContext],
        context: QueryContext,
        relation: Relation,
        governor_constraints: Mapping[str, Set[str]],
        is_deferred: bool,
        has_record_columns: bool | DimensionElement,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ):
        self._dimensions = dimensions
        self._backend = backend
        self._context = context
        self._relation = relation
        self._governor_constraints = governor_constraints
        self._is_deferred = is_deferred
        self._has_record_columns = has_record_columns
        self._record_caches = record_caches if record_caches is not None else {}

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions that span the query and are used to join its
        relations together (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def relation(self) -> Relation:
        """The relation tree representation of the query as a series of
        operations on tables (`Relation`).
        """
        return self._relation

    @property
    def has_record_columns(self) -> bool | DimensionElement:
        """Whether this query's relation already includes columns for all or
        some dimension element records (`bool` or `DimensionElement`).
        """
        return self._has_record_columns

    @property
    def backend(self) -> QueryBackend[QueryContext]:
        """Backend object used to create the query and new ones derived from
        it (`QueryBackend`).
        """
        return self._backend

    @contextmanager
    def open_context(self) -> Iterator[None]:
        """Return a context manager that ensures a database connection is
        established and temporary tables and cursors have a defined lifetime.

        Returns
        -------
        context : `contextlib.AbstractContextManager`
            Context manager with no return value.
        """
        if self._context.is_open:
            yield
        else:
            with self._context:
                yield
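
    # Sketch (editorial addition): ``open_context`` can be nested safely,
    # because it only enters ``self._context`` when it is not already open,
    # so helper code can wrap its own work without worrying about callers.
    #
    #     with query.open_context():
    #         with query.open_context():  # no-op re-entry, same connection
    #             rows = list(query)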

    def __str__(self) -> str:
        return str(self._relation)

    def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
        return iter(self._context.fetch_iterable(self._relation))

    def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
        """Return an iterator that converts result rows to data IDs.

        Parameters
        ----------
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
            Iterator that yields data IDs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        return (reader.read(row) for row in self)
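
    # Sketch (editorial addition, hypothetical dimension names): requesting
    # data IDs for a subset of the query's dimensions.
    #
    #     detector_ids = list(
    #         query.iter_data_ids(query.dimensions.universe.extract(["instrument", "detector"]))
    #     )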

    def iter_dataset_refs(
        self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
    ) -> Iterator[DatasetRef]:
        """Return an iterator that converts result rows to dataset references.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        components : `~collections.abc.Sequence` [ `None` or `str` ]
            Which component dataset types to construct refs for from each row
            representing a parent; `None` for the parent itself.

        Returns
        -------
        refs : `~collections.abc.Iterator` [ `DatasetRef` ]
            Iterator that yields (resolved) dataset references.
        """
        reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            parent_ref = reader.read(row)
            for component in components:
                if component is None:
                    yield parent_ref
                else:
                    yield parent_ref.makeComponentRef(component)
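
    # Sketch (editorial addition): yielding refs for a parent dataset type
    # and one of its components. ``raw_type`` is a hypothetical `DatasetType`
    # whose dimensions are satisfied by this query.
    #
    #     for ref in query.iter_dataset_refs(raw_type, components=(None, "wcs")):
    #         print(ref)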

    def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
        """Return an iterator that converts result rows to dimension records.

        Parameters
        ----------
        element : `DimensionElement`, optional
            Dimension element whose records will be returned. If not
            provided, `has_record_columns` must be a `DimensionElement`
            instance.

        Returns
        -------
        records : `~collections.abc.Iterator` [ `DimensionRecord` ]
            Iterator that yields dimension records.
        """
        if element is None:
            match self._has_record_columns:
                case True | False:
                    raise ValueError("No default dimension element in query; 'element' must be given.")
                case only_element_with_records:
                    element = only_element_with_records
        if (cache := self._record_caches.get(element)) is not None:
            return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
        else:
            reader = DimensionRecordReader(element)
            if not (reader.columns_required <= self.relation.columns):
                raise ColumnError(
                    f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                    f"for records of element {element.name}."
                )
            return (reader.read(row) for row in self)
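
    # Sketch (editorial addition): when the query was built with record
    # columns for a single element (see `with_record_columns`), ``element``
    # may be omitted; otherwise pass it explicitly. ``exposure_element`` is
    # hypothetical.
    #
    #     for record in query.iter_dimension_records(exposure_element):
    #         print(record)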

    def run(self) -> Query:
        """Execute the query and hold its results in memory.

        Returns
        -------
        executed : `Query`
            New query that holds the query results.

        Notes
        -----
        Iterating over the results of a query that has been `run` will always
        iterate over an existing container, while iterating over a query that
        has not been run will result in executing at least some of the query
        each time.

        Running a query also sets its `is_deferred` flag to `False`, which
        will cause new queries constructed by its methods to be run
        immediately, unless ``defer=True`` is passed to the factory method.
        After a query has been run, factory methods will also tend to prefer
        to apply new operations (e.g. `with_only_columns`, `sliced`,
        `sorted`) via Python code acting on the existing container rather
        than going back to SQL, which can be less efficient overall than
        applying operations to a deferred query and executing them all only
        at the end.

        Running a query is represented in terms of relations by adding a
        `~lsst.daf.relation.Materialization` marker relation in the iteration
        engine and then processing the relation tree; this attaches the
        container of rows to that new relation to short-circuit any future
        processing of the tree and lock changes to the tree upstream of it.
        This is very different from the SQL-engine
        `~lsst.daf.relation.Materialization` added to the tree by the
        `materialize` method from a user perspective, though it has a similar
        representation in the relation tree.
        """
        relation = (
            # Make a new relation that definitely ends in the iteration
            # engine (this does nothing if it already does).
            self.relation.transferred_to(self._context.iteration_engine)
            # Make the new relation save its rows to an in-memory Python
            # collection in relation.payload when processed.
            .materialized(name_prefix="run")
        )
        # Actually process the relation, simplifying out trivial relations,
        # executing any SQL queries, and saving results to relation.payload.
        # We discard the simplified relation that's returned, because we want
        # the new query to have any extra diagnostic information contained in
        # the original.
        self._context.process(relation)
        return self._copy(relation, False)
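
    # Sketch (editorial addition): executing once and then iterating the
    # in-memory rows repeatedly without re-running SQL.
    #
    #     executed = query.run()
    #     n = sum(1 for _ in executed)  # iterates the cached container
    #     m = sum(1 for _ in executed)  # same rows, no new database query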

    def materialized(self, defer_postprocessing: bool = True) -> Query:
        """Materialize the results of this query in its context's preferred
        engine.

        Usually this means inserting the results into a temporary table in a
        database.

        Parameters
        ----------
        defer_postprocessing : `bool`, optional
            If `True`, do not execute operations that occur in the context's
            `QueryContext.iteration_engine` up front; instead insert and
            execute a materialization upstream of them (e.g. via a SQL
            ``INSERT INTO ... SELECT`` statement, with no fetching to the
            client) and execute the postprocessing operations when iterating
            over the query results. If `False`, and iteration-engine
            postprocessing operations exist, run the full query, execute them
            now, and upload the results.
            If the relation is already in the preferred engine, this option
            is ignored and the materialization will not involve fetching rows
            to the iteration engine at all. If the relation has already been
            materialized in the iteration engine (i.e. via `run`), then this
            option is again ignored and an upload of the existing rows will
            be performed.

        Returns
        -------
        materialized : `Query`
            Modified query with the same row-and-column content with a
            materialization in ``self.context.preferred_engine``.
        """
        if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
            relation, stripped = self._context.strip_postprocessing(self._relation)
            if relation.engine == self._context.preferred_engine:
                # We got all the way to the engine we want to materialize in.
                # Apply that operation to the tree, process it (which
                # actually creates a temporary table and populates it), and
                # then reapply the stripped operations.
                relation = relation.materialized()
                self._context.process(relation)
                for operation in stripped:
                    relation = operation.apply(
                        relation, transfer=True, preferred_engine=self._context.iteration_engine
                    )
                return self._copy(relation, True)
        # Either defer_postprocessing=False, or attempting to strip off unary
        # operations until we got to the preferred engine didn't work,
        # because this tree doesn't actually involve the preferred engine.
        # So we just transfer to the preferred engine first, and then
        # materialize, process, and return.
        relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
        self._context.process(relation)
        return self._copy(relation, True)
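
    # Sketch (editorial addition, hypothetical names): materializing into
    # the preferred engine (typically a database temporary table) before
    # joining in datasets, so the upstream query is only executed once on
    # the server side.
    #
    #     with query.open_context():
    #         base = query.materialized()
    #         result = base.find_datasets(raw_type, collections)
    #         refs = list(result.iter_dataset_refs(raw_type))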

    def projected(
        self,
        dimensions: Iterable[Dimension | str] | None = None,
        unique: bool = True,
        columns: Iterable[ColumnTag] | None = None,
        defer: bool | None = None,
        drop_postprocessing: bool = False,
        keep_record_columns: bool = True,
    ) -> Query:
        """Return a modified `Query` with a subset of this one's columns.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ],
                optional
            Dimensions to include in the new query. Will be expanded to
            include all required and implied dependencies. Must be a subset
            of ``self.dimensions``. If not provided, ``self.dimensions`` is
            used.
        unique : `bool`, optional
            If `True` (default) deduplicate rows after dropping columns.
        columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
            Additional dataset or dimension record columns to include in the
            query. Dimension key columns added here are ignored unless they
            extend beyond the key columns implied by the ``dimensions``
            argument (which is an error).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making
            ``self`` is used (this option is "sticky").
        drop_postprocessing : `bool`, optional
            Drop any iteration-engine operations that depend on columns that
            are being removed (e.g. region-overlap tests when region columns
            are being dropped), making it more likely that projection and
            deduplication could be performed in the preferred engine, where
            they may be more efficient.
        keep_record_columns : `bool`, optional
            If `True` (default) and this query `has_record_columns`,
            implicitly add any of those to ``columns`` whose dimension
            element is in the given ``dimensions``.

        Returns
        -------
        query : `Query`
            New query with the requested columns only, optionally
            deduplicated.

        Notes
        -----
        Dataset columns are dropped from the new query unless passed via the
        ``columns`` argument. All other columns are by default preserved.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if the columns to include in the new query are not all
            present in the current query.
        """
        if dimensions is None:
            dimensions = set(self._dimensions)
        else:
            dimensions = set(dimensions)
            if columns is not None:
                dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
        dimensions = self._dimensions.universe.extract(dimensions)
        if columns is None:
            columns = set()
        else:
            columns = set(columns)
        columns.update(DimensionKeyColumnTag.generate(dimensions.names))
        if keep_record_columns:
            if self._has_record_columns is True:
                for element in dimensions.elements:
                    if element not in self._record_caches:
                        columns.update(element.RecordClass.fields.columns)
            elif self._has_record_columns in dimensions.elements:
                element = cast(DimensionElement, self._has_record_columns)
                columns.update(element.RecordClass.fields.columns)
        if drop_postprocessing:
            relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
            # Dropping postprocessing Calculations could cause other columns
            # we had otherwise intended to keep to be dropped as well.
            columns &= relation.columns
        else:
            relation = self._relation
        relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
        if unique:
            relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer, dimensions=dimensions)
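
    # Sketch (editorial addition, hypothetical dimension names): projecting
    # down to a subset of dimensions with deduplication.
    #
    #     visits_only = query.projected(dimensions=["instrument", "visit"], unique=True)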

    def with_record_columns(
        self, dimension_element: DimensionElement | None = None, defer: bool | None = None
    ) -> Query:
        """Return a modified `Query` with additional dimension record columns
        and/or caches.

        Parameters
        ----------
        dimension_element : `DimensionElement`, optional
            Single element to add record columns for, or `None` (default) to
            add them for all elements in `dimensions`.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making
            ``self`` is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested record columns either in the
            relation or (when possible) available via record caching.

        Notes
        -----
        Adding dimension record columns is fundamentally different from
        adding new dimension key columns or dataset columns, because it is
        purely an addition of columns, not rows - we can always join in a
        dimension element table (if it has not already been included) on
        keys already present in the current relation, confident that there
        is exactly one row in the dimension element table for each row in
        the current relation.
        """
        if self._has_record_columns is True or self._has_record_columns == dimension_element:
            return self
        record_caches = dict(self._record_caches)
        columns_required: set[ColumnTag] = set()
        for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
            if element in record_caches:
                continue
            if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
                record_caches[element] = cache
            else:
                columns_required.update(element.RecordClass.fields.columns.keys())
        # Modify the relation we have to remove any projections that dropped
        # columns we now want, as long as the relation's behavior is
        # otherwise unchanged.
        columns_required -= self._relation.columns
        relation, columns_found = self._context.restore_columns(self._relation, columns_required)
        columns_required.difference_update(columns_found)
        if columns_required:
            relation = self._backend.make_dimension_relation(
                self._dimensions,
                columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this operation
                # does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
            )
        return self._chain(
            relation,
            defer=defer,
            has_record_columns=True if dimension_element is None else dimension_element,
            record_caches=record_caches,
        )
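
    # Sketch (editorial addition): ensuring record columns (or caches) are
    # available before expanding data IDs, so `iter_data_ids` can make use
    # of dimension records when they are present.
    #
    #     expanded = query.with_record_columns()
    #     data_ids = list(expanded.iter_data_ids())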

    def find_datasets(
        self,
        dataset_type: DatasetType,
        collections: Any,
        *,
        find_first: bool = True,
        columns: Set[str] = frozenset(("dataset_id", "run")),
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that includes a search for datasets of
        the given type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to search for. May not be a component.
        collections
            Collection search path or pattern. Must be a single collection
            name or ordered sequence if ``find_first=True``. See
            :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default) search collections in order until the first
            match for each data ID is found. If `False`, return all matches
            in all collections.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include in the new query. Options include:

            - ``dataset_id``: the unique identifier of the dataset. The type
              is implementation-dependent. Never nullable. Included by
              default.

            - ``ingest_date``: the date and time the dataset was added to
              the data repository.

            - ``run``: the foreign key column to the `~CollectionType.RUN`
              collection holding the dataset (not necessarily the collection
              name). The type is dependent on the collection manager
              implementation. Included by default.

            - ``collection``: the foreign key column to the collection in
              which the dataset was actually found in this search. The type
              is dependent on the collection manager implementation. This
              may differ from ``run`` if the dataset is present in a matching
              `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
              collection, which means the same dataset may also appear
              multiple times in the query results.

            - ``timespan``: the validity range for datasets found in a
              `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
              collection types.

            The default columns (``dataset_id`` and ``run``) are sufficient
            to enable `iter_dataset_refs`, which also takes care of
            translating the internal ``RUN`` collection key into its public
            name.

            Setting this to an empty set while passing ``find_first=False``
            will return a query that is constrained by dataset existence in
            some matching collection but does not actually return which
            datasets existed.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making
            ``self`` is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested dataset columns, constrained by the
            existence of datasets of this type in the given collection.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if a dataset search is already present in this query and
            this is a find-first search.
        ValueError
            Raised if the given dataset type's dimensions are not a subset
            of the current query's dimensions.
        """
        if find_first and DatasetColumnTag.filter_from(self._relation.columns):
            raise ColumnError(
                "Cannot search for datasets with find_first=True "
                "on a query that already includes dataset columns."
            )
        #
        # TODO: it'd be nice to do a QueryContext.restore_columns call here
        # or similar, to look for dataset-constraint joins already present in
        # the relation and expand them to include dataset-result columns as
        # well, instead of doing a possibly-redundant join here. But that
        # would require pushing relation usage down further into
        # DatasetStorageManager.make_relation, so that it doesn't need to be
        # given the columns, and then giving the relation system the ability
        # to simplify-away redundant joins when they only provide columns
        # that aren't ultimately used. The right time to look into that is
        # probably when investigating whether the base QueryBackend should
        # be responsible for producing an "abstract" relation tree of some
        # sort, with the subclasses only responsible for filling it in with
        # payloads (and possibly replacing some leaves with new sub-trees)
        # when "processed" (or in some other "prepare" step).
        #
        # This is a low priority for three reasons:
        # - there's some chance the database's query optimizer will simplify
        #   away these redundant joins;
        # - at present, the main use of this code path is in QG generation,
        #   where we materialize the initial data ID query into a temp table
        #   and hence can't go back and "recover" those dataset columns
        #   anyway;
        #
        if not (dataset_type.dimensions <= self._dimensions):
            raise ValueError(
                "Cannot find datasets from a query unless the dataset type's dimensions "
                f"({dataset_type.dimensions}, for {dataset_type.name}) are a subset of the query's "
                f"({self._dimensions})."
            )
        columns = set(columns)
        columns.add("dataset_id")
        collections = CollectionWildcard.from_expression(collections)
        if find_first:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            dataset_type,
            collections,
            governor_constraints=self._governor_constraints,
            allow_calibration_collections=False,  # TODO
            rejections=rejections,
        )
        if not collection_records:
            relation = self._relation.join(
                self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
            )
        elif find_first:
            relation = self._backend.make_dataset_search_relation(
                dataset_type, collection_records, columns, self._context, join_to=self._relation
            )
        else:
            dataset_relation = self._backend.make_dataset_query_relation(
                dataset_type, collection_records, columns, self._context
            )
            relation = self.relation.join(dataset_relation)
        return self._chain(relation, defer=defer)
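
    # Sketch (editorial addition, hypothetical dataset type and collection
    # names): a find-first dataset search followed by ref iteration.
    #
    #     with query.open_context():
    #         result = query.find_datasets(raw_type, ["HSC/defaults"], find_first=True)
    #         refs = list(result.iter_dataset_refs(raw_type))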

    def sliced(
        self,
        start: int = 0,
        stop: int | None = None,
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that takes a slice of this one's rows.

        Parameters
        ----------
        start : `int`, optional
            First index to include, inclusive.
        stop : `int` or `None`, optional
            One past the last index to include (i.e. exclusive).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making
            ``self`` is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested slice.

        Notes
        -----
        This operation must be implemented in the iteration engine if there
        are postprocessing operations, which may be much less efficient than
        performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
        in SQL).

        Since query row order is usually arbitrary, it usually makes sense
        to call `sorted` before calling `sliced` to make the results
        deterministic. This is not checked because there are some contexts
        where getting an arbitrary subset of the results of a given size
        still makes sense.
        """
        return self._chain(self._relation[start:stop], defer)
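
    # Sketch (editorial addition): taking the first ten rows; sorting first
    # keeps the subset deterministic, as the notes above recommend.
    # ``order_by`` is an assumed, already-built sequence of `SortTerm`
    # objects.
    #
    #     first_ten = query.sorted(order_by).sliced(0, 10)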

    def sorted(
        self,
        order_by: Iterable[SortTerm],
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that sorts this one's rows.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `SortTerm` ]
            Expressions to sort by.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making
            ``self`` is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested sorting.

        Notes
        -----
        The ``order_by`` expression can include references to dimension
        record columns that were not present in the original relation; this
        is similar to calling `with_record_columns` for those columns first
        (but in this case column requests cannot be satisfied by record
        caches). All other columns referenced must be present in the query
        already.
        """
        op = Sort(tuple(order_by))
        columns_required = set(op.columns_required)
        columns_required.difference_update(self._relation.columns)
        if columns_required:
            relation, columns_found = self._context.restore_columns(self._relation, columns_required)
            columns_required.difference_update(columns_found)
            if columns_required:
                try:
                    relation = self._backend.make_dimension_relation(
                        self._dimensions,
                        columns_required,
                        self._context,
                        initial_relation=relation,
                        # Don't permit joins to use any columns beyond those
                        # in the original relation, as that would change what
                        # this operation does.
                        initial_join_max_columns=frozenset(self._relation.columns),
                        governor_constraints=self._governor_constraints,
                    )
                except ColumnError as err:
                    raise ColumnError(
                        "Cannot sort by columns that were not included in the original query or "
                        "fully resolved by its dimensions."
                    ) from err
        else:
            relation = self._relation
        relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer)
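
    # Sketch (editorial addition): ``order_by`` is assumed to be a sequence
    # of `lsst.daf.relation.SortTerm` objects built elsewhere (e.g. by the
    # registry's ORDER BY expression handling); it is not constructed here.
    #
    #     ordered = query.sorted(order_by, defer=True)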

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in this query.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If
            `False`, returning an upper bound is permitted if it can be done
            much more efficiently, e.g. by running a SQL ``SELECT COUNT(*)``
            query but ignoring client-side filtering that would otherwise
            take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them. If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch
            its rows into a new query (or passing ``exact=False``). Ignored
            if ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the query, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        if self._relation.min_rows == self._relation.max_rows:
            return self._relation.max_rows
        return self._context.count(self._relation, exact=exact, discard=discard)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this query has any result rows at all.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any_rows : `bool`
            Whether the query has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        if self._relation.min_rows > 0:
            return True
        if self._relation.max_rows == 0:
            return False
        if execute:
            return self._context.any(self._relation, execute=execute, exact=exact)
        elif not exact:
            return True
        raise TypeError("Cannot obtain exact results without executing the query.")

    def explain_no_results(self, execute: bool = True) -> list[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT
            1``) of aspects of the query to more precisely determine where
            rows were filtered out.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.
        """
        # First try without actually executing any queries.
        diagnostics = Diagnostics.run(self._relation)
        if diagnostics.is_doomed:
            return diagnostics.messages
        if execute:
            # Try again, running LIMIT 1 queries as we walk back down the
            # tree to look for relations with no rows:
            diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
            if diagnostics.is_doomed:
                return diagnostics.messages
        return []
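
    # Sketch (editorial addition): cheap existence and size checks, with
    # diagnostics when a query is doomed to return nothing.
    #
    #     if not query.any(exact=False):
    #         for message in query.explain_no_results():
    #             print(message)
    #     upper_bound = query.count(exact=False)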

    def _copy(
        self,
        relation: Relation,
        is_deferred: bool,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified copy of this query with some attributes
        replaced.

        See class docs for parameter documentation; the only difference here
        is that the defaults are the values ``self`` was constructed with.
        """
        return Query(
            dimensions=self._dimensions if dimensions is None else dimensions,
            backend=self._backend,
            context=self._context,
            relation=relation,
            governor_constraints=(
                governor_constraints if governor_constraints is not None else self._governor_constraints
            ),
            is_deferred=is_deferred,
            has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
            record_caches=self._record_caches if record_caches is None else record_caches,
        )

    def _chain(
        self,
        relation: Relation,
        defer: bool | None,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified query with a new relation while handling the
        ubiquitous ``defer`` parameter's logic.

        Parameters
        ----------
        relation : `Relation`
            Relation for the new query.
        defer : `bool`
            If `False`, run the new query immediately. If `True`, do not. If
            `None`, the ``defer`` option passed when making ``self`` is used
            (this option is "sticky").
        dimensions : `DimensionGraph`, optional
            See class docs.
        governor_constraints : `Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ], optional
            See class docs.
        has_record_columns : `bool` or `DimensionElement`, optional
            See class docs.
        record_caches : `Mapping` [ `DimensionElement`, `Mapping`
                [ `DataCoordinate`, `DimensionRecord` ] ], optional
            See class docs.

        Returns
        -------
        chained : `Query`
            Modified query, or ``self`` if no modifications were actually
            requested.
        """
        if defer is None:
            defer = self._is_deferred
        if (
            relation is self._relation
            and dimensions is None
            and defer == self._is_deferred
            and record_caches is None
            and has_record_columns is None
            and governor_constraints is None
        ):
            return self
        result = self._copy(
            relation,
            is_deferred=True,
            governor_constraints=governor_constraints,
            dimensions=dimensions,
            has_record_columns=has_record_columns,
            record_caches=record_caches,
        )
        if not defer:
            result = result.run()
        return result