Coverage for python/lsst/daf/butler/registry/queries/_query.py: 14%
258 statements
coverage.py v7.5.1, created at 2024-05-07 02:46 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ()
31import itertools
32from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
33from contextlib import contextmanager
34from typing import Any, cast, final
36from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm
38from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
39from ..._dataset_ref import DatasetRef
40from ..._dataset_type import DatasetType
41from ...dimensions import (
42 DataCoordinate,
43 DimensionElement,
44 DimensionGroup,
45 DimensionRecord,
46 DimensionRecordSet,
47)
48from .._collection_type import CollectionType
49from ..wildcards import CollectionWildcard
50from ._query_backend import QueryBackend
51from ._query_context import QueryContext
52from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader
55@final
56class Query:
57 """A general-purpose representation of a registry query.
59 Parameters
60 ----------
61 dimensions : `DimensionGroup`
62 The dimensions that span the query and are used to join its relations
63 together.
64 backend : `QueryBackend`
65 Backend object used to create the query and new ones derived from it.
66 context : `QueryContext`
67 Context manager that holds relation engines and database connections
68 for the query.
69 relation : `Relation`
70 The relation tree representation of the query as a series of operations
71 on tables.
72 governor_constraints : `~collections.abc.Mapping` [ `str`, \
73 `~collections.abc.Set` [ `str` ] ]
74 Constraints on governor dimensions encoded in this query's relation.
75 This is a mapping from governor dimension name to sets of values that
76 dimension may take.
77 is_deferred : `bool`
78 If `True`, modifier methods that return a related `Query` object should
79 not immediately execute the new query.
80 has_record_columns : `bool` or `DimensionElement`
81 Whether this query's relation already includes columns for all or some
82 dimension element records: `True` means all elements in ``dimensions``
83 either have records available in ``record_caches`` or have all of their
84 record columns present in ``relation``, while a specific
85 `DimensionElement` means that only that element's records are available.
86 record_caches : `~collections.abc.Mapping` [ `str`, \
87 `DimensionRecordSet` ], optional
88 Cached dimension record values.
90 Notes
91 -----
92 Iterating over a `Query` yields mappings from `ColumnTag` to the associated
93 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
94 `iter_dimension_records` methods can be used to instead iterate over
95 various butler primitives derived from these rows.
97 Iterating over a `Query` may or may not execute database queries again each
98 time, depending on the state of its relation tree - see `Query.run` for
99 details.
101 Query is immutable; all methods that might appear to modify it in place
102 actually return a new object (though many attributes will be shared).
104 Query is currently (still) an internal-to-Registry object, with only the
105 "QueryResults" classes that are backed by it directly exposed to users. It
106 has been designed with the intent that it will eventually play a larger
107 role, either as the main query result object in a redesigned query
108 interface, or a "power user" result option that accompanies simpler
109 replacements for the current "QueryResults" objects.
110 """
112 def __init__(
113 self,
114 dimensions: DimensionGroup,
115 backend: QueryBackend[QueryContext],
116 context: QueryContext,
117 relation: Relation,
118 governor_constraints: Mapping[str, Set[str]],
119 is_deferred: bool,
120 has_record_columns: bool | DimensionElement,
121 record_caches: Mapping[str, DimensionRecordSet] | None = None,
122 ):
123 self._dimensions = dimensions
124 self._backend = backend
125 self._context = context
126 self._relation = relation
127 self._governor_constraints = governor_constraints
128 self._is_deferred = is_deferred
129 self._has_record_columns = has_record_columns
130 self._record_caches = record_caches if record_caches is not None else {}
132 @property
133 def dimensions(self) -> DimensionGroup:
134 """The dimensions that span the query and are used to join its
135 relations together (`DimensionGroup`).
136 """
137 return self._dimensions
139 @property
140 def relation(self) -> Relation:
141 """The relation tree representation of the query as a series of
142 operations on tables (`Relation`).
143 """
144 return self._relation
146 @property
147 def has_record_columns(self) -> bool | DimensionElement:
148 """Whether this query's relation already includes columns for all or
149 some dimension element records (`bool` or `DimensionElement`).
150 """
151 return self._has_record_columns
153 @property
154 def backend(self) -> QueryBackend[QueryContext]:
155 """Backend object used to create the query and new ones derived from it
156 (`QueryBackend`).
157 """
158 return self._backend
160 @contextmanager
161 def open_context(self) -> Iterator[None]:
162 """Return a context manager that ensures a database connection is
163 established, temporary tables and cursors have a defined lifetime,
164 and client-side caching is turned on.
166 Returns
167 -------
168 context : `contextlib.AbstractContextManager`
169 Context manager with no return value.
170 """
171 with self._backend.caching_context():
172 if self._context.is_open:
173 yield
174 else:
175 with self._context:
176 yield
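# Illustrative usage sketch (not part of the original module): holding one
# database connection and caching context open while consuming rows from an
# already-constructed ``Query`` instance ``query``:
#
#     with query.open_context():
#         for row in query:
#             ...  # each row is a Mapping[ColumnTag, Any]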
178 def __str__(self) -> str:
179 return str(self._relation)
181 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
182 return iter(self._context.fetch_iterable(self._relation))
184 def iter_data_ids(self, dimensions: DimensionGroup | None = None) -> Iterator[DataCoordinate]:
185 """Return an iterator that converts result rows to data IDs.
187 Parameters
188 ----------
189 dimensions : `DimensionGroup`, optional
190 Dimensions of the data IDs to return. If not provided,
191 ``self.dimensions`` is used.
193 Returns
194 -------
195 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
196 Iterator that yields data IDs.
197 """
198 if dimensions is None:
199 dimensions = self._dimensions
200 reader = DataCoordinateReader.make(
201 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
202 )
203 if not (reader.columns_required <= self.relation.columns):
204 raise ColumnError(
205 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
206 f"for data IDs with dimensions {dimensions}."
207 )
208 with self.backend.caching_context():
209 for row in self:
210 yield reader.read(row)
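# Illustrative usage sketch (not part of the original module); ``query`` is an
# existing Query and the dimension names are placeholders:
#
#     with query.open_context():
#         for data_id in query.iter_data_ids():
#             print(data_id)
#         # Restrict to a dimension subset (must be a subset of
#         # query.dimensions):
#         subset = query.dimensions.universe.conform(["instrument", "visit"])
#         visit_ids = list(query.iter_data_ids(subset))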
212 def iter_dataset_refs(
213 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
214 ) -> Iterator[DatasetRef]:
215 """Return an iterator that converts result rows to dataset references.
217 Parameters
218 ----------
219 dataset_type : `DatasetType`
220 The parent dataset type to yield references for.
221 components : `~collections.abc.Sequence` [ `None` or `str` ]
222 Which component dataset types to construct refs for from each row
223 representing a parent; `None` for the parent itself.
225 Returns
226 -------
227 refs : `~collections.abc.Iterator` [ `DatasetRef` ]
228 Iterator that yields (resolved) dataset references.
229 """
230 reader = DatasetRefReader(
231 dataset_type,
232 translate_collection=self._backend.get_collection_name,
233 records=self._has_record_columns is True,
234 record_caches=self._record_caches,
235 )
236 if not (reader.columns_required <= self.relation.columns):
237 raise ColumnError(
238 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
239 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
240 )
241 with self.backend.caching_context():
242 for row in self:
243 parent_ref = reader.read(row)
244 for component in components:
245 if component is None:
246 yield parent_ref
247 else:
248 yield parent_ref.makeComponentRef(component)
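# Illustrative usage sketch (not part of the original module); assumes a
# hypothetical parent DatasetType ``flat_type`` and collection name, with a
# dataset search already joined in via ``find_datasets``:
#
#     dataset_query = query.find_datasets(flat_type, ["some/collection"])
#     with dataset_query.open_context():
#         refs = list(dataset_query.iter_dataset_refs(flat_type))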
250 def iter_data_ids_and_dataset_refs(
251 self, dataset_type: DatasetType, dimensions: DimensionGroup | None = None
252 ) -> Iterator[tuple[DataCoordinate, DatasetRef]]:
253 """Iterate over pairs of data IDs and dataset refs.
255 This permits the data ID dimensions to differ from the dataset
256 dimensions.
258 Parameters
259 ----------
260 dataset_type : `DatasetType`
261 The parent dataset type to yield references for.
262 dimensions : `DimensionGroup`, optional
263 Dimensions of the data IDs to return. If not provided,
264 ``self.dimensions`` is used.
266 Returns
267 -------
268 pairs : `~collections.abc.Iterator` [ `tuple` [ `DataCoordinate`,
269 `DatasetRef` ] ]
270 An iterator over (data ID, dataset reference) pairs.
271 """
272 if dimensions is None:
273 dimensions = self._dimensions
274 data_id_reader = DataCoordinateReader.make(
275 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
276 )
277 dataset_reader = DatasetRefReader(
278 dataset_type,
279 translate_collection=self._backend.get_collection_name,
280 records=self._has_record_columns is True,
281 record_caches=self._record_caches,
282 )
283 if not (data_id_reader.columns_required <= self.relation.columns):
284 raise ColumnError(
285 f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} "
286 f"for data IDs with dimensions {dimensions}."
287 )
288 if not (dataset_reader.columns_required <= self.relation.columns):
289 raise ColumnError(
290 f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} "
291 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
292 )
293 with self.backend.caching_context():
294 for row in self:
295 yield (data_id_reader.read(row), dataset_reader.read(row))
297 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
298 """Return an iterator that converts result rows to dimension records.
300 Parameters
301 ----------
302 element : `DimensionElement`, optional
303 Dimension element whose records will be returned. If not provided,
304 `has_record_columns` must be a `DimensionElement` instance.
306 Returns
307 -------
308 records : `~collections.abc.Iterator` [ `DimensionRecord` ]
309 Iterator that yields dimension records.
310 """
311 if element is None:
312 match self._has_record_columns:
313 case True | False:
314 raise ValueError("No default dimension element in query; 'element' must be given.")
315 case only_element_with_records:
316 element = only_element_with_records
317 if (cache := self._record_caches.get(element.name)) is not None:
318 for data_id in self.iter_data_ids(element.minimal_group):
319 yield cache.find(data_id)
320 else:
321 reader = DimensionRecordReader(element)
322 if not (reader.columns_required <= self.relation.columns):
323 raise ColumnError(
324 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
325 f"for records of element {element.name}."
326 )
327 with self._backend.caching_context():
328 for row in self:
329 yield reader.read(row)
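# Illustrative usage sketch (not part of the original module); the element
# name "visit" is a placeholder:
#
#     record_query = query.with_record_columns("visit")
#     with record_query.open_context():
#         for record in record_query.iter_dimension_records():
#             print(record)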
331 def run(self) -> Query:
332 """Execute the query and hold its results in memory.
334 Returns
335 -------
336 executed : `Query`
337 New query that holds the query results.
339 Notes
340 -----
341 Iterating over the results of a query that has been `run` will always
342 iterate over an existing container, while iterating over a query that
343 has not been run will result in executing at least some of the query
344 each time.
346 Running a query also sets its `is_deferred` flag to `False`, which will
347 cause new queries constructed by its methods to be run immediately,
348 unless ``defer=True`` is passed to the factory method. After a query
349 has been run, factory methods will also tend to prefer to apply new
350 operations (e.g. `with_only_columns`, `sliced`, `sorted`) via Python
351 code acting on the existing container rather than going back to SQL,
352 which can be less efficient overall than applying operations to a
353 deferred query and executing them all only at the end.
355 Running a query is represented in terms of relations by adding a
356 `~lsst.daf.relation.Materialization` marker relation in the iteration
357 engine and then processing the relation tree; this attaches the
358 container of rows to that new relation to short-circuit any future
359 processing of the tree and lock changes to the tree upstream of it.
360 From a user perspective this is very different from the SQL-engine
361 `~lsst.daf.relation.Materialization` added to the tree by the
362 `materialized` method, though it has a similar representation in the
363 relation tree.
364 """
365 relation = (
366 # Make a new relation that definitely ends in the iteration engine
367 # (this does nothing if it already does).
368 self.relation.transferred_to(self._context.iteration_engine)
369 # Make the new relation save its rows to an in-memory Python
370 # collection in relation.payload when processed.
371 .materialized(name_prefix="run")
372 )
373 # Actually process the relation, simplifying out trivial relations,
374 # executing any SQL queries, and saving results to relation.payload.
375 # We discard the simplified relation that's returned, because we want
376 # the new query to have any extra diagnostic information contained in
377 # the original.
378 self._context.process(relation)
379 return self._copy(relation, False)
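# Illustrative usage sketch (not part of the original module): execute a
# deferred query once, then reuse the in-memory rows without re-querying the
# database:
#
#     executed = query.run()
#     n_rows = executed.count()
#     data_ids = list(executed.iter_data_ids())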
381 def materialized(self, defer_postprocessing: bool = True) -> Query:
382 """Materialize the results of this query in its context's preferred
383 engine.
385 Usually this means inserting the results into a temporary table in a
386 database.
388 Parameters
389 ----------
390 defer_postprocessing : `bool`, optional
391 If `True`, do not execute operations that occur in the context's
392 `QueryContext.iteration_engine` up front; instead insert and
393 execute a materialization upstream of them (e.g. via a SQL
394 ``INSERT INTO ... SELECT`` statement, with no fetching to the
395 client) and execute the postprocessing operations when iterating
396 over the query results. If `False`, and iteration-engine
397 postprocessing operations exist, run the full query, execute them
398 now, and upload the results.
399 If the relation is already in the preferred engine, this option
400 is ignored and the materialization will not involve fetching rows
401 to the iteration engine at all. If the relation has already been
402 materialized in the iteration engine (i.e. via `run`), then this
403 option is again ignored and an upload of the existing rows will
404 be performed.
406 Returns
407 -------
408 materialized : `Query`
409 Modified query with the same row-and-column content and a
410 materialization in ``self.context.preferred_engine``.
411 """
412 if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
413 relation, stripped = self._context.strip_postprocessing(self._relation)
414 if relation.engine == self._context.preferred_engine:
415 # We got all the way to the engine we want to materialize in.
416 # Apply that operation to the tree, process it (which actually
417 # creates a temporary table and populates it), and then reapply
418 # the stripped operations.
419 relation = relation.materialized()
420 self._context.process(relation)
421 for operation in stripped:
422 relation = operation.apply(
423 relation, transfer=True, preferred_engine=self._context.iteration_engine
424 )
425 return self._copy(relation, True)
426 # Either defer_postprocessing=False, or attempting to strip off unary
427 # operations until we got to the preferred engine didn't work, because
428 # this tree doesn't actually involve the preferred engine. So we just
429 # transfer to the preferred engine first, and then materialize,
430 # process, and return.
431 relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
432 self._context.process(relation)
433 return self._copy(relation, True)
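# Illustrative usage sketch (not part of the original module): materialize
# into the preferred engine (usually a temporary table) so several derived
# queries can reuse it; ``flat_type`` and the collection name are placeholders:
#
#     with query.open_context():
#         materialized = query.materialized()
#         refs = list(
#             materialized.find_datasets(flat_type, ["some/collection"])
#             .iter_dataset_refs(flat_type)
#         )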
435 def projected(
436 self,
437 dimensions: DimensionGroup | Iterable[str] | None = None,
438 unique: bool = True,
439 columns: Iterable[ColumnTag] | None = None,
440 defer: bool | None = None,
441 drop_postprocessing: bool = False,
442 keep_record_columns: bool = True,
443 ) -> Query:
444 """Return a modified `Query` with a subset of this one's columns.
446 Parameters
447 ----------
448 dimensions : `DimensionGroup` or `~collections.abc.Iterable` [ `str` ], \
449 optional
450 Dimensions to include in the new query. Will be expanded to
451 include all required and implied dependencies. Must be a subset of
452 ``self.dimensions``. If not provided, ``self.dimensions`` is used.
453 unique : `bool`, optional
454 If `True` (default) deduplicate rows after dropping columns.
455 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
456 Additional dataset or dimension record columns to include in the
457 query. Dimension key columns added here are ignored unless they
458 extend beyond the key columns implied by the ``dimensions``
459 argument, in which case an error is raised.
460 defer : `bool`, optional
461 If `False`, run the new query immediately. If `True`, do not. If
462 `None` (default), the ``defer`` option passed when making ``self``
463 is used (this option is "sticky").
464 drop_postprocessing : `bool`, optional
465 If `True`, drop any iteration-engine operations that depend on columns that
466 are being removed (e.g. region-overlap tests when region columns
467 are being dropped), making it more likely that projection and
468 deduplication could be performed in the preferred engine, where
469 they may be more efficient.
470 keep_record_columns : `bool`, optional
471 If `True` (default) and this query `has_record_columns`, implicitly
472 add any of those to ``columns`` whose dimension element is in the
473 given ``dimensions``.
475 Returns
476 -------
477 query : `Query`
478 New query with the requested columns only, optionally deduplicated.
480 Notes
481 -----
482 Dataset columns are dropped from the new query unless passed via the
483 ``columns`` argument. All other columns are by default preserved.
485 Raises
486 ------
487 lsst.daf.relation.ColumnError
488 Raised if the columns to include in the new query are not all
489 present in the current query.
490 """
491 match dimensions:
492 case None:
493 dimensions = set(self._dimensions.names)
494 case DimensionGroup():
495 dimensions = set(dimensions.names)
496 case iterable:
497 dimensions = set(iterable)
498 if columns is not None:
499 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
500 dimensions = self._dimensions.universe.conform(dimensions)
501 if columns is None:
502 columns = set()
503 else:
504 columns = set(columns)
505 columns.update(DimensionKeyColumnTag.generate(dimensions.names))
506 if keep_record_columns:
507 if self._has_record_columns is True:
508 for element_name in dimensions.elements:
509 if element_name not in self._record_caches:
510 columns.update(self.dimensions.universe[element_name].RecordClass.fields.columns)
511 elif self._has_record_columns in dimensions.elements:
512 element = cast(DimensionElement, self._has_record_columns)
513 columns.update(element.RecordClass.fields.columns)
514 if drop_postprocessing:
515 relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
516 # Dropping postprocessing Calculations could cause other columns
517 # we had otherwise intended to keep to be dropped as well.
518 columns &= relation.columns
519 else:
520 relation = self._relation
521 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
522 if unique:
523 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
524 return self._chain(relation, defer, dimensions=dimensions)
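# Illustrative usage sketch (not part of the original module); the dimension
# names are placeholders:
#
#     narrowed = query.projected(["instrument", "visit"], unique=True)
#     with narrowed.open_context():
#         visit_data_ids = list(narrowed.iter_data_ids())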
526 def with_record_columns(self, dimension_element: str | None = None, defer: bool | None = None) -> Query:
527 """Return a modified `Query` with additional dimension record columns
528 and/or caches.
530 Parameters
531 ----------
532 dimension_element : `str`, optional
533 Name of a single dimension element to add record columns for, or
534 `None` (the default) to add them for all elements in `dimensions`.
535 defer : `bool`, optional
536 If `False`, run the new query immediately. If `True`, do not. If
537 `None` (default), the ``defer`` option passed when making ``self``
538 is used (this option is "sticky").
540 Returns
541 -------
542 query : `Query`
543 New query with the requested record columns either in the relation
544 or (when possible) available via record caching.
546 Notes
547 -----
548 Adding dimension record columns is fundamentally different from adding
549 new dimension key columns or dataset columns, because it is purely an
550 addition of columns, not rows - we can always join in a dimension
551 element table (if it has not already been included) on keys already
552 present in the current relation, confident that there is exactly one
553 row in the dimension element table for each row in the current
554 relation.
555 """
556 if self._has_record_columns is True or self._has_record_columns == dimension_element:
557 return self
558 record_caches = dict(self._record_caches)
559 columns_required: set[ColumnTag] = set()
560 for element_name in self.dimensions.elements if dimension_element is None else [dimension_element]:
561 element = self.dimensions.universe[element_name]
562 if element_name in record_caches:
563 continue
564 if (cache := self._backend.get_dimension_record_cache(element_name)) is not None:
565 record_caches[element_name] = cache
566 else:
567 columns_required.update(element.RecordClass.fields.columns.keys())
568 # Modify the relation we have to remove any projections that dropped
569 columns we now want, as long as the relation's behavior is otherwise
570 # unchanged.
571 columns_required -= self._relation.columns
572 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
573 columns_required.difference_update(columns_found)
574 if columns_required:
575 relation = self._backend.make_dimension_relation(
576 self._dimensions,
577 columns_required,
578 self._context,
579 initial_relation=relation,
580 # Don't permit joins to use any columns beyond those in the
581 # original relation, as that would change what this operation
582 # does.
583 initial_join_max_columns=frozenset(self._relation.columns),
584 governor_constraints=self._governor_constraints,
585 )
586 return self._chain(
587 relation,
588 defer=defer,
589 has_record_columns=(
590 True if dimension_element is None else self.dimensions.universe[dimension_element]
591 ),
592 record_caches=record_caches,
593 )
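# Illustrative usage sketch (not part of the original module): expand a query
# so that its data IDs carry dimension records, whether from record caches or
# from joined-in dimension element tables:
#
#     expanded = query.with_record_columns()
#     with expanded.open_context():
#         for data_id in expanded.iter_data_ids():
#             ...  # data IDs now include dimension records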
595 def find_datasets(
596 self,
597 dataset_type: DatasetType,
598 collections: Any,
599 *,
600 find_first: bool = True,
601 columns: Set[str] = frozenset(("dataset_id", "run")),
602 defer: bool | None = None,
603 ) -> Query:
604 """Return a modified `Query` that includes a search for datasets of the
605 given type.
607 Parameters
608 ----------
609 dataset_type : `DatasetType`
610 Dataset type to search for. May not be a component.
611 collections : `~typing.Any`
612 Collection search path or pattern. Must be a single collection
613 name or an ordered sequence of names if ``find_first=True``. See
614 :ref:`daf_butler_collection_expressions` for more information.
615 find_first : `bool`, optional
616 If `True` (default) search collections in order until the first
617 match for each data ID is found. If `False`, return all matches in
618 all collections.
619 columns : `~collections.abc.Set` [ `str` ]
620 Dataset columns to include in the new query. Options include
622 - ``dataset_id``: the unique identifier of the dataset. The type
623 is implementation-dependent. Never nullable. Included by
624 default.
625 - ``ingest_date``: the date and time the dataset was added to the
626 data repository.
627 - ``run``: the foreign key column to the `~CollectionType.RUN`
628 collection holding the dataset (not necessarily the collection
629 name). The type is dependent on the collection manager
630 implementation. Included by default.
631 - ``collection``: the foreign key column to the collection in
632 which the dataset was actually found in this search. The type is
633 dependent on the collection manager implementation. This may
634 differ from ``run`` if the dataset is present in a matching
635 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
636 collection, which means the same dataset may also appear multiple
637 times in the query results.
638 - ``timespan``: the validity range for datasets found in a
639 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
640 collection types.
642 The default columns (``dataset_id`` and ``run``) are sufficient to
643 enable `iter_dataset_refs`, which also takes care of translating
644 the internal ``RUN`` collection key into its public name.
646 Setting this to an empty set while passing ``find_first=False``
647 will return a query that is constrained by dataset existence in
648 some matching collection but does not actually return which
649 datasets existed.
650 defer : `bool`, optional
651 If `False`, run the new query immediately. If `True`, do not. If
652 `None` (default), the ``defer`` option passed when making ``self``
653 is used (this option is "sticky").
655 Returns
656 -------
657 query : `Query`
658 New query with the requested dataset columns, constrained by the
659 existence of datasets of this type in the given collection.
661 Raises
662 ------
663 lsst.daf.relation.ColumnError
664 Raised if a dataset search is already present in this query and
665 this is a find-first search.
666 """
667 if find_first and DatasetColumnTag.filter_from(self._relation.columns):
668 raise ColumnError(
669 "Cannot search for datasets with find_first=True "
670 "on a query that already includes dataset columns."
671 )
672 #
673 # TODO: it'd be nice to do a QueryContext.restore_columns call here or
674 # similar, to look for dataset-constraint joins already present in the
675 # relation and expand them to include dataset-result columns as well,
676 # instead of doing a possibly-redundant join here. But that would
677 # require pushing relation usage down further into
678 # DatasetStorageManager.make_relation, so that it doesn't need to be
679 # given the columns, and then giving the relation system the ability to
680 # simplify-away redundant joins when they only provide columns that
681 # aren't ultimately used. The right time to look into that is probably
682 # when investigating whether the base QueryBackend should be
683 # responsible for producing an "abstract" relation tree of some sort,
684 # with the subclasses only responsible for filling it in with payloads
685 (and possibly replacing some leaves with new sub-trees) when
686 # "processed" (or in some other "prepare" step).
687 #
688 # This is a low priority for three reasons:
689 # - there's some chance the database's query optimizer will simplify
690 # away these redundant joins;
691 # - at present, the main use of this code path is in QG generation,
692 # where we materialize the initial data ID query into a temp table
693 # and hence can't go back and "recover" those dataset columns anyway;
694 #
695 collections = CollectionWildcard.from_expression(collections)
696 if find_first:
697 collections.require_ordered()
698 rejections: list[str] = []
699 collection_records = self._backend.resolve_dataset_collections(
700 dataset_type,
701 collections,
702 governor_constraints=self._governor_constraints,
703 allow_calibration_collections=True,
704 rejections=rejections,
705 )
706 # If the dataset type has dimensions not in the current query, or we
707 # need a temporal join for a calibration collection, either restore
708 # those columns or join them in.
709 full_dimensions = dataset_type.dimensions.as_group().union(self._dimensions)
710 relation = self._relation
711 record_caches = self._record_caches
712 base_columns_required: set[ColumnTag] = {
713 DimensionKeyColumnTag(name) for name in full_dimensions.names
714 }
715 spatial_joins: list[tuple[str, str]] = []
716 if not (dataset_type.dimensions <= self._dimensions):
717 if self._has_record_columns is True:
718 # This query is for expanded data IDs, so if we add new
719 # dimensions to the query we need to be able to get records for
720 # the new dimensions.
721 record_caches = dict(self._record_caches)
722 for element_name in full_dimensions.elements:
723 element = full_dimensions.universe[element_name]
724 if element_name in record_caches:
725 continue
726 if (cache := self._backend.get_dimension_record_cache(element_name)) is not None:
727 record_caches[element_name] = cache
728 else:
729 base_columns_required.update(element.RecordClass.fields.columns.keys())
730 # See if we need spatial joins between the current query and the
731 # dataset type's dimensions. The logic here is for multiple
732 # spatial joins in general, but in practice it'll be exceedingly
733 # rare for there to be more than one. We start by figuring out
734 # which spatial "families" (observations vs. skymaps, skypix
735 # systems) are present on only one side and not the other.
736 lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial
737 rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial
738 # Now we iterate over the Cartesian product of those, so e.g.
739 # if the query has {tract, patch, visit} and the dataset type
740 # has {htm7} dimensions, the iterations of this loop
741 # correspond to: (skymap, htm), (observations, htm).
742 for lhs_spatial_family, rhs_spatial_family in itertools.product(
743 lhs_spatial_families, rhs_spatial_families
744 ):
745 # For each pair we add a join between the most-precise element
746 # present in each family (e.g. patch beats tract).
747 spatial_joins.append(
748 (
749 lhs_spatial_family.choose(
750 full_dimensions.elements.names, self.dimensions.universe
751 ).name,
752 rhs_spatial_family.choose(
753 full_dimensions.elements.names, self.dimensions.universe
754 ).name,
755 )
756 )
757 # Set up any temporal join between the query dimensions and CALIBRATION
758 # collection's validity ranges.
759 temporal_join_on: set[ColumnTag] = set()
760 if any(r.type is CollectionType.CALIBRATION for r in collection_records):
761 for family in self._dimensions.temporal:
762 endpoint = family.choose(self._dimensions.elements.names, self.dimensions.universe)
763 temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan"))
764 base_columns_required.update(temporal_join_on)
765 # Note which of the many kinds of potentially-missing columns we have
766 # and add the rest.
767 base_columns_required.difference_update(relation.columns)
768 if base_columns_required:
769 relation = self._backend.make_dimension_relation(
770 full_dimensions,
771 base_columns_required,
772 self._context,
773 initial_relation=relation,
774 # Don't permit joins to use any columns beyond those in the
775 # original relation, as that would change what this
776 # operation does.
777 initial_join_max_columns=frozenset(self._relation.columns),
778 governor_constraints=self._governor_constraints,
779 spatial_joins=spatial_joins,
780 )
781 # Finally we can join in the dataset search itself.
782 columns = set(columns)
783 columns.add("dataset_id")
784 if not collection_records:
785 relation = relation.join(
786 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
787 )
788 elif find_first:
789 relation = self._backend.make_dataset_search_relation(
790 dataset_type,
791 collection_records,
792 columns,
793 self._context,
794 join_to=relation,
795 temporal_join_on=temporal_join_on,
796 )
797 else:
798 relation = self._backend.make_dataset_query_relation(
799 dataset_type,
800 collection_records,
801 columns,
802 self._context,
803 join_to=relation,
804 temporal_join_on=temporal_join_on,
805 )
806 return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer)
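# Illustrative usage sketch (not part of the original module): a find-first
# dataset search over an ordered collection path; ``flat_type`` and the
# collection names are placeholders:
#
#     with_datasets = query.find_datasets(
#         flat_type, ["run/a", "run/b"], find_first=True
#     )
#     with with_datasets.open_context():
#         refs = list(with_datasets.iter_dataset_refs(flat_type))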
808 def sliced(
809 self,
810 start: int = 0,
811 stop: int | None = None,
812 defer: bool | None = None,
813 ) -> Query:
814 """Return a modified `Query` with that takes a slice of this one's
815 rows.
817 Parameters
818 ----------
819 start : `int`, optional
820 First index to include (inclusive).
821 stop : `int` or `None`, optional
822 One past the last index to include (i.e. exclusive).
823 defer : `bool`, optional
824 If `False`, run the new query immediately. If `True`, do not. If
825 `None` (default), the ``defer`` option passed when making ``self``
826 is used (this option is "sticky").
828 Returns
829 -------
830 query : `Query`
831 New query with the requested slice.
833 Notes
834 -----
835 This operation must be implemented in the iteration engine if there are
836 postprocessing operations, which may be much less efficient than
837 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
838 in SQL).
840 Since query row order is usually arbitrary, it usually makes sense to
841 call `sorted` before calling `sliced` to make the results
842 deterministic. This is not checked because there are some contexts
843 where getting an arbitrary subset of the results of a given size
844 still makes sense.
845 """
846 return self._chain(self._relation[start:stop], defer)
848 def sorted(
849 self,
850 order_by: Iterable[SortTerm],
851 defer: bool | None = None,
852 ) -> Query:
853 """Return a modified `Query` that sorts this one's rows.
855 Parameters
856 ----------
857 order_by : `~collections.abc.Iterable` [ `SortTerm` ]
858 Expressions to sort by.
859 defer : `bool`, optional
860 If `False`, run the new query immediately. If `True`, do not. If
861 `None` (default), the ``defer`` option passed when making ``self``
862 is used (this option is "sticky").
864 Returns
865 -------
866 query : `Query`
867 New query with the requested sorting.
869 Notes
870 -----
871 The ``order_by`` expression can include references to dimension record
872 columns that were not present in the original relation; this is
873 similar to calling `with_record_columns` for those columns first (but
874 in this case column requests cannot be satisfied by record caches).
875 All other columns referenced must be present in the query already.
876 """
877 op = Sort(tuple(order_by))
878 columns_required = set(op.columns_required)
879 columns_required.difference_update(self._relation.columns)
880 if columns_required:
881 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
882 columns_required.difference_update(columns_found)
883 if columns_required:
884 try:
885 relation = self._backend.make_dimension_relation(
886 self._dimensions,
887 columns_required,
888 self._context,
889 initial_relation=relation,
890 # Don't permit joins to use any columns beyond those in
891 # the original relation, as that would change what this
892 # operation does.
893 initial_join_max_columns=frozenset(self._relation.columns),
894 governor_constraints=self._governor_constraints,
895 )
896 except ColumnError as err:
897 raise ColumnError(
898 "Cannot sort by columns that were not included in the original query or "
899 "fully resolved by its dimensions."
900 ) from err
901 else:
902 relation = self._relation
903 relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
904 return self._chain(relation, defer)
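# Illustrative usage sketch (not part of the original module): sort before
# slicing so the "first N rows" are deterministic; ``terms`` is a sequence of
# `lsst.daf.relation.SortTerm` objects built elsewhere:
#
#     first_ten = query.sorted(terms).sliced(0, 10)
#     with first_ten.open_context():
#         rows = list(first_ten)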
906 def count(self, *, exact: bool = True, discard: bool = False) -> int:
907 """Count the number of rows in this query.
909 Parameters
910 ----------
911 exact : `bool`, optional
912 If `True` (default), return the exact number of rows. If `False`,
913 returning an upper bound is permitted if it can be done much more
914 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
915 ignoring client-side filtering that would otherwise take place.
916 discard : `bool`, optional
917 If `True`, compute the exact count even if it would require running
918 the full query and then throwing away the result rows after
919 counting them. If `False`, an error is raised in that case, as the user would
920 usually be better off executing the query first to fetch its rows
921 into a new query (or passing ``exact=False``). Ignored if
922 ``exact=False``.
924 Returns
925 -------
926 n_rows : `int`
927 Number of rows in the query, or an upper bound. This includes
928 duplicates, if there are any.
930 Raises
931 ------
932 RuntimeError
933 Raised if an exact count was requested and could not be obtained
934 without fetching and discarding rows.
935 """
936 if self._relation.min_rows == self._relation.max_rows:
937 return self._relation.max_rows
938 return self._context.count(self._relation, exact=exact, discard=discard)
940 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
941 """Check whether this query has any result rows at all.
943 Parameters
944 ----------
945 execute : `bool`, optional
946 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
947 determined prior to execution that the query would return no rows.
948 exact : `bool`, optional
949 If `True`, run the full query and perform post-query filtering if
950 needed, until at least one result row is found. If `False`, the
951 returned result does not account for post-query filtering, and
952 hence may be `True` even when all result rows would be filtered
953 out.
955 Returns
956 -------
957 any_rows : `bool`
958 Whether the query has any rows, or if it may have any rows if
959 ``exact=False``.
961 Raises
962 ------
963 RuntimeError
964 Raised if an exact check was requested and could not be obtained
965 without executing the query.
966 """
967 if self._relation.min_rows > 0:
968 return True
969 if self._relation.max_rows == 0:
970 return False
971 if execute:
972 return self._context.any(self._relation, execute=execute, exact=exact)
973 elif not exact:
974 return True
975 raise TypeError("Cannot obtain exact results without executing the query.")
977 def explain_no_results(self, execute: bool = True) -> list[str]:
978 """Return human-readable messages that may help explain why the query
979 yields no results.
981 Parameters
982 ----------
983 execute : `bool`, optional
984 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
985 of aspects of the query to more precisely determine where rows were
986 filtered out.
988 Returns
989 -------
990 messages : `~collections.abc.Iterable` [ `str` ]
991 String messages that describe reasons the query might not yield any
992 results.
993 """
994 # First try without actually executing any queries.
995 diagnostics = Diagnostics.run(self._relation)
996 if diagnostics.is_doomed:
997 return diagnostics.messages
998 if execute:
999 # Try again, running LIMIT 1 queries as we walk back down the tree
1000 # to look for relations with no rows:
1001 diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
1002 if diagnostics.is_doomed:
1003 return diagnostics.messages
1004 return []
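# Illustrative usage sketch (not part of the original module): cheap emptiness
# and size checks, with diagnostic messages when the query turns out to be
# doomed:
#
#     with query.open_context():
#         if not query.any(exact=False):
#             for message in query.explain_no_results():
#                 print(message)
#         else:
#             print(query.count(exact=False))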
1006 def _copy(
1007 self,
1008 relation: Relation,
1009 is_deferred: bool,
1010 dimensions: DimensionGroup | None = None,
1011 governor_constraints: Mapping[str, Set[str]] | None = None,
1012 has_record_columns: bool | DimensionElement | None = None,
1013 record_caches: Mapping[str, DimensionRecordSet] | None = None,
1014 ) -> Query:
1015 """Return a modified copy of this query with some attributes replaced.
1017 See class docs for parameter documentation; the only difference here
1018 is that the defaults are the values ``self`` was constructed with.
1019 """
1020 return Query(
1021 dimensions=self._dimensions if dimensions is None else dimensions,
1022 backend=self._backend,
1023 context=self._context,
1024 relation=relation,
1025 governor_constraints=(
1026 governor_constraints if governor_constraints is not None else self._governor_constraints
1027 ),
1028 is_deferred=is_deferred,
1029 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
1030 record_caches=self._record_caches if record_caches is None else record_caches,
1031 )
1033 def _chain(
1034 self,
1035 relation: Relation,
1036 defer: bool | None,
1037 dimensions: DimensionGroup | None = None,
1038 governor_constraints: Mapping[str, Set[str]] | None = None,
1039 has_record_columns: bool | DimensionElement | None = None,
1040 record_caches: Mapping[str, DimensionRecordSet] | None = None,
1041 ) -> Query:
1042 """Return a modified query with a new relation while handling the
1043 ubiquitous ``defer`` parameter's logic.
1045 Parameters
1046 ----------
1047 relation : `Relation`
1048 Relation for the new query.
1049 defer : `bool`
1050 If `False`, run the new query immediately. If `True`, do not. If
1051 `None` , the ``defer`` option passed when making ``self`` is used
1052 (this option is "sticky").
1053 dimensions : `DimensionGroup`, optional
1054 See class docs.
1055 governor_constraints : `~collections.abc.Mapping` [ `str`, \
1056 `~collections.abc.Set` [ `str` ] ], optional
1057 See class docs.
1058 has_record_columns : `bool` or `DimensionElement`, optional
1059 See class docs.
1060 record_caches : `~collections.abc.Mapping` [ `str`, \
1061 `DimensionRecordSet` ], optional
1062 See class docs.
1064 Returns
1065 -------
1066 chained : `Query`
1067 Modified query, or ``self`` if no modifications were actually
1068 requested.
1069 """
1070 if defer is None:
1071 defer = self._is_deferred
1072 if (
1073 relation is self._relation
1074 and dimensions is None
1075 and defer == self._is_deferred
1076 and record_caches is None
1077 and has_record_columns is None
1078 and governor_constraints is None
1079 ):
1080 return self
1081 result = self._copy(
1082 relation,
1083 is_deferred=True,
1084 governor_constraints=governor_constraints,
1085 dimensions=dimensions,
1086 has_record_columns=has_record_columns,
1087 record_caches=record_caches,
1088 )
1089 if not defer:
1090 result = result.run()
1091 return result