Coverage for python/lsst/daf/butler/registry/queries/_query.py: 15%
244 statements
coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ()

import itertools
from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from contextlib import contextmanager
from typing import Any, cast, final

from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm

from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._dataset_ref import DatasetRef
from ..._dataset_type import DatasetType
from ...dimensions import DataCoordinate, Dimension, DimensionElement, DimensionGraph, DimensionRecord
from .._collection_type import CollectionType
from ..wildcards import CollectionWildcard
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader


@final
class Query:
    """A general-purpose representation of a registry query.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        The dimensions that span the query and are used to join its relations
        together.
    backend : `QueryBackend`
        Backend object used to create the query and new ones derived from it.
    context : `QueryContext`
        Context manager that holds relation engines and database connections
        for the query.
    relation : `Relation`
        The relation tree representation of the query as a series of operations
        on tables.
    governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ]
        Constraints on governor dimensions encoded in this query's relation.
        This is a mapping from governor dimension name to sets of values that
        dimension may take.
    is_deferred : `bool`
        If `True`, modifier methods that return a related `Query` object should
        not immediately execute the new query.
    has_record_columns : `bool` or `DimensionElement`
        Whether this query's relation already includes columns for all or some
        dimension element records: `True` means all elements in ``dimensions``
        either have records present in ``record_caches`` or all columns present
        in ``relation``, while a specific `DimensionElement` means that element
        does.
    record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
            `~collections.abc.Mapping` \
            [ `DataCoordinate`, `DimensionRecord` ] ], optional
        Cached dimension record values, organized first by dimension element
        and then by data ID.

    Notes
    -----
    Iterating over a `Query` yields mappings from `ColumnTag` to the associated
    value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
    `iter_dimension_records` methods can be used to instead iterate over
    various butler primitives derived from these rows.

    Iterating over a `Query` may or may not execute database queries again each
    time, depending on the state of its relation tree - see `Query.run` for
    details.

    Query is immutable; all methods that might appear to modify it in place
    actually return a new object (though many attributes will be shared).

    Query is currently (still) an internal-to-Registry object, with only the
    "QueryResults" classes that are backed by it directly exposed to users. It
    has been designed with the intent that it will eventually play a larger
    role, either as the main query result object in a redesigned query
    interface, or a "power user" result option that accompanies simpler
    replacements for the current "QueryResults" objects.
    """

    def __init__(
        self,
        dimensions: DimensionGraph,
        backend: QueryBackend[QueryContext],
        context: QueryContext,
        relation: Relation,
        governor_constraints: Mapping[str, Set[str]],
        is_deferred: bool,
        has_record_columns: bool | DimensionElement,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ):
        self._dimensions = dimensions
        self._backend = backend
        self._context = context
        self._relation = relation
        self._governor_constraints = governor_constraints
        self._is_deferred = is_deferred
        self._has_record_columns = has_record_columns
        self._record_caches = record_caches if record_caches is not None else {}

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions that span the query and are used to join its
        relations together (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def relation(self) -> Relation:
        """The relation tree representation of the query as a series of
        operations on tables (`Relation`).
        """
        return self._relation

    @property
    def has_record_columns(self) -> bool | DimensionElement:
        """Whether this query's relation already includes columns for all or
        some dimension element records (`bool` or `DimensionElement`).
        """
        return self._has_record_columns

    @property
    def backend(self) -> QueryBackend[QueryContext]:
        """Backend object used to create the query and new ones derived from it
        (`QueryBackend`).
        """
        return self._backend

    @contextmanager
    def open_context(self) -> Iterator[None]:
        """Return a context manager that ensures a database connection is
        established and temporary tables and cursors have a defined lifetime.

        Returns
        -------
        context : `contextlib.AbstractContextManager`
            Context manager with no return value.
        """
        if self._context.is_open:
            yield
        else:
            with self._context:
                yield
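
    # A minimal usage sketch for ``open_context`` (illustrative only; the
    # ``query`` variable is an assumed, already-constructed ``Query``):
    #
    #     with query.open_context():
    #         for row in query:
    #             ...  # rows share one connection / temporary-table lifetime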

    def __str__(self) -> str:
        return str(self._relation)

    def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
        return iter(self._context.fetch_iterable(self._relation))

    def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
        """Return an iterator that converts result rows to data IDs.

        Parameters
        ----------
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
            Iterator that yields data IDs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        return (reader.read(row) for row in self)
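
    # Hypothetical example of ``iter_data_ids`` (``query`` and the dimension
    # names used below are assumptions, not taken from this module):
    #
    #     for data_id in query.iter_data_ids():
    #         print(data_id["visit"], data_id["detector"])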

    def iter_dataset_refs(
        self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
    ) -> Iterator[DatasetRef]:
        """Return an iterator that converts result rows to dataset references.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        components : `~collections.abc.Sequence` [ `None` or `str` ]
            Which component dataset types to construct refs for from each row
            representing a parent; `None` for the parent itself.

        Returns
        -------
        refs : `~collections.abc.Iterator` [ `DatasetRef` ]
            Iterator that yields (resolved) dataset references.
        """
        reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            parent_ref = reader.read(row)
            for component in components:
                if component is None:
                    yield parent_ref
                else:
                    yield parent_ref.makeComponentRef(component)
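
    # Sketch of ``iter_dataset_refs`` usage, assuming ``query`` was built with
    # a ``find_datasets`` call for ``raw_dataset_type`` (names illustrative):
    #
    #     for ref in query.iter_dataset_refs(raw_dataset_type):
    #         print(ref.id, ref.run)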

    def iter_data_ids_and_dataset_refs(
        self, dataset_type: DatasetType, dimensions: DimensionGraph | None = None
    ) -> Iterator[tuple[DataCoordinate, DatasetRef]]:
        """Iterate over pairs of data IDs and dataset refs.

        This permits the data ID dimensions to differ from the dataset
        dimensions.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`, \
                `DatasetRef` ] ]
            An iterator over (data ID, dataset reference) pairs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        data_id_reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        dataset_reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (data_id_reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        if not (dataset_reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            yield (data_id_reader.read(row), dataset_reader.read(row))
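
    # Illustrative use of ``iter_data_ids_and_dataset_refs`` when the data ID
    # dimensions are broader than the dataset's (all names are assumptions):
    #
    #     for data_id, ref in query.iter_data_ids_and_dataset_refs(
    #         flat_dataset_type, dimensions=visit_detector_dimensions
    #     ):
    #         ...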

    def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
        """Return an iterator that converts result rows to dimension records.

        Parameters
        ----------
        element : `DimensionElement`, optional
            Dimension element whose records will be returned. If not provided,
            `has_record_columns` must be a `DimensionElement` instance.

        Returns
        -------
        records : `~collections.abc.Iterator` [ `DimensionRecord` ]
            Iterator that yields dimension records.
        """
        if element is None:
            match self._has_record_columns:
                case True | False:
                    raise ValueError("No default dimension element in query; 'element' must be given.")
                case only_element_with_records:
                    element = only_element_with_records
        if (cache := self._record_caches.get(element)) is not None:
            return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
        else:
            reader = DimensionRecordReader(element)
            if not (reader.columns_required <= self.relation.columns):
                raise ColumnError(
                    f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                    f"for records of element {element.name}."
                )
            return (reader.read(row) for row in self)
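
    # Sketch of ``iter_dimension_records`` after requesting record columns for
    # a single element (``visit_element`` is an assumed `DimensionElement`):
    #
    #     expanded = query.with_record_columns(visit_element)
    #     for record in expanded.iter_dimension_records(visit_element):
    #         ...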

    def run(self) -> Query:
        """Execute the query and hold its results in memory.

        Returns
        -------
        executed : `Query`
            New query that holds the query results.

        Notes
        -----
        Iterating over the results of a query that has been `run` will always
        iterate over an existing container, while iterating over a query that
        has not been run will result in executing at least some of the query
        each time.

        Running a query also sets its `is_deferred` flag to `False`, which will
        cause new queries constructed by its methods to be run immediately,
        unless ``defer=True`` is passed to the factory method. After a query
        has been run, factory methods will also tend to prefer to apply new
        operations (e.g. `with_only_column`, `sliced`, `sorted`) via Python
        code acting on the existing container rather than going back to SQL,
        which can be less efficient overall than applying operations to a
        deferred query and executing them all only at the end.

        Running a query is represented in terms of relations by adding a
        `~lsst.daf.relation.Materialization` marker relation in the iteration
        engine and then processing the relation tree; this attaches the
        container of rows to that new relation to short-circuit any future
        processing of the tree and lock changes to the tree upstream of it.
        This is very different from the SQL-engine
        `~lsst.daf.relation.Materialization` added to the tree by the
        `materialize` method from a user perspective, though it has a similar
        representation in the relation tree.
        """
        relation = (
            # Make a new relation that definitely ends in the iteration engine
            # (this does nothing if it already does).
            self.relation.transferred_to(self._context.iteration_engine)
            # Make the new relation save its rows to an in-memory Python
            # collection in relation.payload when processed.
            .materialized(name_prefix="run")
        )
        # Actually process the relation, simplifying out trivial relations,
        # executing any SQL queries, and saving results to relation.payload.
        # We discard the simplified relation that's returned, because we want
        # the new query to have any extra diagnostic information contained in
        # the original.
        self._context.process(relation)
        return self._copy(relation, False)
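
    # Hedged sketch contrasting deferred iteration with ``run`` (``query`` is
    # an assumed deferred Query):
    #
    #     executed = query.run()   # executes SQL once; rows now held in memory
    #     first = list(executed)   # iterates the in-memory container
    #     second = list(executed)  # no additional database work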

    def materialized(self, defer_postprocessing: bool = True) -> Query:
        """Materialize the results of this query in its context's preferred
        engine.

        Usually this means inserting the results into a temporary table in a
        database.

        Parameters
        ----------
        defer_postprocessing : `bool`, optional
            If `True`, do not execute operations that occur in the context's
            `QueryContext.iteration_engine` up front; instead insert and
            execute a materialization upstream of them (e.g. via a SQL
            ``INSERT INTO ... SELECT`` statement, with no fetching to the
            client) and execute the postprocessing operations when iterating
            over the query results. If `False`, and iteration-engine
            postprocessing operations exist, run the full query, execute them
            now, and upload the results.
            If the relation is already in the preferred engine, this option
            is ignored and the materialization will not involve fetching rows
            to the iteration engine at all. If the relation has already been
            materialized in the iteration engine (i.e. via `run`), then this
            option is again ignored and an upload of the existing rows will
            be performed.

        Returns
        -------
        materialized : `Query`
            Modified query with the same row-and-column content, backed by a
            materialization in ``self.context.preferred_engine``.
        """
        if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
            relation, stripped = self._context.strip_postprocessing(self._relation)
            if relation.engine == self._context.preferred_engine:
                # We got all the way to the engine we want to materialize in.
                # Apply that operation to the tree, process it (which actually
                # creates a temporary table and populates it), and then reapply
                # the stripped operations.
                relation = relation.materialized()
                self._context.process(relation)
                for operation in stripped:
                    relation = operation.apply(
                        relation, transfer=True, preferred_engine=self._context.iteration_engine
                    )
                return self._copy(relation, True)
        # Either defer_postprocessing=False, or attempting to strip off unary
        # operations until we got to the preferred engine didn't work, because
        # this tree doesn't actually involve the preferred engine. So we just
        # transfer to the preferred engine first, and then materialize,
        # process, and return.
        relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
        self._context.process(relation)
        return self._copy(relation, True)
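
    # Illustrative use of ``materialized`` to avoid re-running an expensive
    # upstream query before deriving further queries from it (the dataset type
    # and collection arguments are assumptions):
    #
    #     base = query.materialized()  # e.g. a temporary table in SQL
    #     with_raws = base.find_datasets(raw_dataset_type, collections=...)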

    def projected(
        self,
        dimensions: Iterable[Dimension | str] | None = None,
        unique: bool = True,
        columns: Iterable[ColumnTag] | None = None,
        defer: bool | None = None,
        drop_postprocessing: bool = False,
        keep_record_columns: bool = True,
    ) -> Query:
        """Return a modified `Query` with a subset of this one's columns.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ], \
                optional
            Dimensions to include in the new query. Will be expanded to
            include all required and implied dependencies. Must be a subset of
            ``self.dimensions``. If not provided, ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (default) deduplicate rows after dropping columns.
        columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
            Additional dataset or dimension record columns to include in the
            query. Dimension key columns added here are ignored unless they
            extend beyond the key columns implied by the ``dimensions``
            argument (which is an error).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").
        drop_postprocessing : `bool`, optional
            Drop any iteration-engine operations that depend on columns that
            are being removed (e.g. region-overlap tests when region columns
            are being dropped), making it more likely that projection and
            deduplication could be performed in the preferred engine, where
            they may be more efficient.
        keep_record_columns : `bool`, optional
            If `True` (default) and this query `has_record_columns`, implicitly
            add any of those to ``columns`` whose dimension element is in the
            given ``dimensions``.

        Returns
        -------
        query : `Query`
            New query with the requested columns only, optionally deduplicated.

        Notes
        -----
        Dataset columns are dropped from the new query unless passed via the
        ``columns`` argument. All other columns are by default preserved.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if the columns to include in the new query are not all
            present in the current query.
        """
        if dimensions is None:
            dimensions = set(self._dimensions)
        else:
            dimensions = set(dimensions)
        if columns is not None:
            dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
        dimensions = self._dimensions.universe.extract(dimensions)
        if columns is None:
            columns = set()
        else:
            columns = set(columns)
        columns.update(DimensionKeyColumnTag.generate(dimensions.names))
        if keep_record_columns:
            if self._has_record_columns is True:
                for element in dimensions.elements:
                    if element not in self._record_caches:
                        columns.update(element.RecordClass.fields.columns)
            elif self._has_record_columns in dimensions.elements:
                element = cast(DimensionElement, self._has_record_columns)
                columns.update(element.RecordClass.fields.columns)
        if drop_postprocessing:
            relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
            # Dropping postprocessing Calculations could cause other columns
            # we had otherwise intended to keep to be dropped as well.
            columns &= relation.columns
        else:
            relation = self._relation
        relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
        if unique:
            relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer, dimensions=dimensions)
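
    # Sketch of ``projected`` narrowing a query to a dimension subset before
    # iteration (the dimension name is an assumption about the universe):
    #
    #     narrowed = query.projected(["visit"], unique=True)
    #     visits = {data_id["visit"] for data_id in narrowed.iter_data_ids()}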

    def with_record_columns(
        self, dimension_element: DimensionElement | None = None, defer: bool | None = None
    ) -> Query:
        """Return a modified `Query` with additional dimension record columns
        and/or caches.

        Parameters
        ----------
        dimension_element : `DimensionElement`, optional
            Single element to add record columns for, or `None` (default) to
            add them for all elements in `dimensions`.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested record columns either in the relation
            or (when possible) available via record caching.

        Notes
        -----
        Adding dimension record columns is fundamentally different from adding
        new dimension key columns or dataset columns, because it is purely an
        addition of columns, not rows - we can always join in a dimension
        element table (if it has not already been included) on keys already
        present in the current relation, confident that there is exactly one
        row in the dimension element table for each row in the current
        relation.
        """
        if self._has_record_columns is True or self._has_record_columns == dimension_element:
            return self
        record_caches = dict(self._record_caches)
        columns_required: set[ColumnTag] = set()
        for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
            if element in record_caches:
                continue
            if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
                record_caches[element] = cache
            else:
                columns_required.update(element.RecordClass.fields.columns.keys())
        # Modify the relation we have to remove any projections that dropped
        # columns we now want, as long as the relation's behavior is otherwise
        # unchanged.
        columns_required -= self._relation.columns
        relation, columns_found = self._context.restore_columns(self._relation, columns_required)
        columns_required.difference_update(columns_found)
        if columns_required:
            relation = self._backend.make_dimension_relation(
                self._dimensions,
                columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this operation
                # does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
            )
        return self._chain(
            relation,
            defer=defer,
            has_record_columns=True if dimension_element is None else dimension_element,
            record_caches=record_caches,
        )
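
    # Minimal sketch of ``with_record_columns`` to get expanded data IDs
    # (whether records come from caches or joins is backend-dependent):
    #
    #     expanded = query.with_record_columns()
    #     for data_id in expanded.iter_data_ids():
    #         ...  # data IDs now carry dimension records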

    def find_datasets(
        self,
        dataset_type: DatasetType,
        collections: Any,
        *,
        find_first: bool = True,
        columns: Set[str] = frozenset(("dataset_id", "run")),
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that includes a search for datasets of the
        given type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to search for. May not be a component.
        collections
            Collection search path or pattern. Must be a single collection
            name or ordered sequence if ``find_first=True``. See
            :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default) search collections in order until the first
            match for each data ID is found. If `False`, return all matches in
            all collections.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include in the new query. Options include

            - ``dataset_id``: the unique identifier of the dataset. The type
              is implementation-dependent. Never nullable. Included by
              default.

            - ``ingest_date``: the date and time the dataset was added to the
              data repository.

            - ``run``: the foreign key column to the `~CollectionType.RUN`
              collection holding the dataset (not necessarily the collection
              name). The type is dependent on the collection manager
              implementation. Included by default.

            - ``collection``: the foreign key column to the collection in
              which the dataset was actually found in this search. The type is
              dependent on the collection manager implementation. This may
              differ from ``run`` if the dataset is present in a matching
              `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
              collection, which means the same dataset may also appear multiple
              times in the query results.

            - ``timespan``: the validity range for datasets found in a
              `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
              collection types.

            The default columns (``dataset_id`` and ``run``) are sufficient to
            enable `iter_dataset_refs`, which also takes care of translating
            the internal ``RUN`` collection key into its public name.

            Setting this to an empty set while passing ``find_first=False``
            will return a query that is constrained by dataset existence in
            some matching collection that does not actually return which
            datasets existed.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested dataset columns, constrained by the
            existence of datasets of this type in the given collection.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if a dataset search is already present in this query and
            this is a find-first search.
        """
        if find_first and DatasetColumnTag.filter_from(self._relation.columns):
            raise ColumnError(
                "Cannot search for datasets with find_first=True "
                "on a query that already includes dataset columns."
            )
        #
        # TODO: it'd be nice to do a QueryContext.restore_columns call here or
        # similar, to look for dataset-constraint joins already present in the
        # relation and expand them to include dataset-result columns as well,
        # instead of doing a possibly-redundant join here. But that would
        # require pushing relation usage down further into
        # DatasetStorageManager.make_relation, so that it doesn't need to be
        # given the columns, and then giving the relation system the ability to
        # simplify-away redundant joins when they only provide columns that
        # aren't ultimately used. The right time to look into that is probably
        # when investigating whether the base QueryBackend should be
        # responsible for producing an "abstract" relation tree of some sort,
        # with the subclasses only responsible for filling it in with payloads
        # (and possibly replacing some leaves with new sub-trees) when
        # "processed" (or in some other "prepare" step).
        #
        # This is a low priority for three reasons:
        # - there's some chance the database's query optimizer will simplify
        #   away these redundant joins;
        # - at present, the main use of this code path is in QG generation,
        #   where we materialize the initial data ID query into a temp table
        #   and hence can't go back and "recover" those dataset columns anyway;
        #
        collections = CollectionWildcard.from_expression(collections)
        if find_first:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            dataset_type,
            collections,
            governor_constraints=self._governor_constraints,
            allow_calibration_collections=True,
            rejections=rejections,
        )
        # If the dataset type has dimensions not in the current query, or we
        # need a temporal join for a calibration collection, either restore
        # those columns or join them in.
        full_dimensions = dataset_type.dimensions.union(self._dimensions)
        relation = self._relation
        record_caches = self._record_caches
        base_columns_required: set[ColumnTag] = {
            DimensionKeyColumnTag(name) for name in full_dimensions.names
        }
        spatial_joins: list[tuple[str, str]] = []
        if not (dataset_type.dimensions <= self._dimensions):
            if self._has_record_columns is True:
                # This query is for expanded data IDs, so if we add new
                # dimensions to the query we need to be able to get records for
                # the new dimensions.
                record_caches = dict(self._record_caches)
                for element in full_dimensions.elements:
                    if element in record_caches:
                        continue
                    if (
                        cache := self._backend.get_dimension_record_cache(element.name, self._context)
                    ) is not None:
                        record_caches[element] = cache
                    else:
                        base_columns_required.update(element.RecordClass.fields.columns.keys())
            # See if we need spatial joins between the current query and the
            # dataset type's dimensions. The logic here is for multiple
            # spatial joins in general, but in practice it'll be exceedingly
            # rare for there to be more than one. We start by figuring out
            # which spatial "families" (observations vs. skymaps, skypix
            # systems) are present on only one side and not the other.
            lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial
            rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial
            # Now we iterate over the Cartesian product of those, so e.g.
            # if the query has {tract, patch, visit} and the dataset type
            # has {htm7} dimensions, the iterations of this loop
            # correspond to: (skymap, htm), (observations, htm).
            for lhs_spatial_family, rhs_spatial_family in itertools.product(
                lhs_spatial_families, rhs_spatial_families
            ):
                # For each pair we add a join between the most-precise element
                # present in each family (e.g. patch beats tract).
                spatial_joins.append(
                    (
                        lhs_spatial_family.choose(full_dimensions.elements).name,
                        rhs_spatial_family.choose(full_dimensions.elements).name,
                    )
                )
        # Set up any temporal join between the query dimensions and CALIBRATION
        # collection's validity ranges.
        temporal_join_on: set[ColumnTag] = set()
        if any(r.type is CollectionType.CALIBRATION for r in collection_records):
            for family in self._dimensions.temporal:
                endpoint = family.choose(self._dimensions.elements)
                temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan"))
            base_columns_required.update(temporal_join_on)
        # Note which of the many kinds of potentially-missing columns we have
        # and add the rest.
        base_columns_required.difference_update(relation.columns)
        if base_columns_required:
            relation = self._backend.make_dimension_relation(
                full_dimensions,
                base_columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this
                # operation does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
                spatial_joins=spatial_joins,
            )
        # Finally we can join in the search for the dataset query.
        columns = set(columns)
        columns.add("dataset_id")
        if not collection_records:
            relation = relation.join(
                self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
            )
        elif find_first:
            relation = self._backend.make_dataset_search_relation(
                dataset_type,
                collection_records,
                columns,
                self._context,
                join_to=relation,
                temporal_join_on=temporal_join_on,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                dataset_type,
                collection_records,
                columns,
                self._context,
                join_to=relation,
                temporal_join_on=temporal_join_on,
            )
        return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer)
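
    # Hedged sketch of ``find_datasets`` chained onto a data ID query
    # (``raw_dataset_type`` and the collection name are assumptions):
    #
    #     with_raws = query.find_datasets(raw_dataset_type, collections="HSC/raw/all")
    #     for ref in with_raws.iter_dataset_refs(raw_dataset_type):
    #         ...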

    def sliced(
        self,
        start: int = 0,
        stop: int | None = None,
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that takes a slice of this one's rows.

        Parameters
        ----------
        start : `int`, optional
            First index to include, inclusive.
        stop : `int` or `None`, optional
            One past the last index to include (i.e. exclusive).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested slice.

        Notes
        -----
        This operation must be implemented in the iteration engine if there are
        postprocessing operations, which may be much less efficient than
        performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
        in SQL).

        Since query row order is usually arbitrary, it usually makes sense to
        call `sorted` before calling `sliced` to make the results
        deterministic. This is not checked because there are some contexts
        where getting an arbitrary subset of the results of a given size
        still makes sense.
        """
        return self._chain(self._relation[start:stop], defer)
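
    # Illustrative pagination with ``sliced`` (see the note above about calling
    # ``sorted`` first; ``page`` and ``page_size`` are assumed values):
    #
    #     page_query = query.sliced(start=page * page_size, stop=(page + 1) * page_size)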

    def sorted(
        self,
        order_by: Iterable[SortTerm],
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that sorts this one's rows.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `SortTerm` ]
            Expressions to sort by.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested sorting.

        Notes
        -----
        The ``order_by`` expression can include references to dimension record
        columns that were not present in the original relation; this is
        similar to calling `with_record_columns` for those columns first (but
        in this case column requests cannot be satisfied by record caches).
        All other columns referenced must be present in the query already.
        """
        op = Sort(tuple(order_by))
        columns_required = set(op.columns_required)
        columns_required.difference_update(self._relation.columns)
        if columns_required:
            relation, columns_found = self._context.restore_columns(self._relation, columns_required)
            columns_required.difference_update(columns_found)
            if columns_required:
                try:
                    relation = self._backend.make_dimension_relation(
                        self._dimensions,
                        columns_required,
                        self._context,
                        initial_relation=relation,
                        # Don't permit joins to use any columns beyond those in
                        # the original relation, as that would change what this
                        # operation does.
                        initial_join_max_columns=frozenset(self._relation.columns),
                        governor_constraints=self._governor_constraints,
                    )
                except ColumnError as err:
                    raise ColumnError(
                        "Cannot sort by columns that were not included in the original query or "
                        "fully resolved by its dimensions."
                    ) from err
        else:
            relation = self._relation
        relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer)
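
    # Sketch combining ``sorted`` and ``sliced`` for a deterministic slice
    # (``visit_sort_term`` stands in for a real `SortTerm`; constructing one is
    # relation-engine-specific and not shown here):
    #
    #     ordered_page = query.sorted([visit_sort_term]).sliced(0, 10)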

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in this query.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the query, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        if self._relation.min_rows == self._relation.max_rows:
            return self._relation.max_rows
        return self._context.count(self._relation, exact=exact, discard=discard)
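
    # Sketch of ``count`` with an inexact upper bound vs. an exact count
    # (``query`` is an assumed Query):
    #
    #     upper_bound = query.count(exact=False)
    #     exact_total = query.count(exact=True, discard=True)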

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this query has any result rows at all.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the query has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        if self._relation.min_rows > 0:
            return True
        if self._relation.max_rows == 0:
            return False
        if execute:
            return self._context.any(self._relation, execute=execute, exact=exact)
        elif not exact:
            return True
        raise TypeError("Cannot obtain exact results without executing the query.")

    def explain_no_results(self, execute: bool = True) -> list[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the query to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        # First try without actually executing any queries.
        diagnostics = Diagnostics.run(self._relation)
        if diagnostics.is_doomed:
            return diagnostics.messages
        if execute:
            # Try again, running LIMIT 1 queries as we walk back down the tree
            # to look for relations with no rows:
            diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
            if diagnostics.is_doomed:
                return diagnostics.messages
        return []
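
    # Sketch of a cheap emptiness check followed by diagnostics (``query`` is
    # an assumed Query):
    #
    #     if not query.any(execute=True, exact=False):
    #         for message in query.explain_no_results():
    #             print(message)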

    def _copy(
        self,
        relation: Relation,
        is_deferred: bool,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified copy of this query with some attributes replaced.

        See class docs for parameter documentation; the only difference here
        is that the defaults are the values ``self`` was constructed with.
        """
        return Query(
            dimensions=self._dimensions if dimensions is None else dimensions,
            backend=self._backend,
            context=self._context,
            relation=relation,
            governor_constraints=(
                governor_constraints if governor_constraints is not None else self._governor_constraints
            ),
            is_deferred=is_deferred,
            has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
            record_caches=self._record_caches if record_caches is None else record_caches,
        )

    def _chain(
        self,
        relation: Relation,
        defer: bool | None,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified query with a new relation while handling the
        ubiquitous ``defer`` parameter's logic.

        Parameters
        ----------
        relation : `Relation`
            Relation for the new query.
        defer : `bool`
            If `False`, run the new query immediately. If `True`, do not. If
            `None`, the ``defer`` option passed when making ``self`` is used
            (this option is "sticky").
        dimensions : `DimensionGraph`, optional
            See class docs.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ], optional
            See class docs.
        has_record_columns : `bool` or `DimensionElement`, optional
            See class docs.
        record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
                `~collections.abc.Mapping` \
                [ `DataCoordinate`, `DimensionRecord` ] ], optional
            See class docs.

        Returns
        -------
        chained : `Query`
            Modified query, or ``self`` if no modifications were actually
            requested.
        """
        if defer is None:
            defer = self._is_deferred
        if (
            relation is self._relation
            and dimensions is None
            and defer == self._is_deferred
            and record_caches is None
            and has_record_columns is None
            and governor_constraints is None
        ):
            return self
        result = self._copy(
            relation,
            is_deferred=True,
            governor_constraints=governor_constraints,
            dimensions=dimensions,
            has_record_columns=has_record_columns,
            record_caches=record_caches,
        )
        if not defer:
            result = result.run()
        return result