Coverage for python/lsst/daf/butler/registry/queries/_query.py: 15%
241 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ()
31import itertools
32from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
33from contextlib import contextmanager
34from typing import Any, cast, final
36from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm
38from ...core import (
39 DataCoordinate,
40 DatasetColumnTag,
41 DatasetRef,
42 DatasetType,
43 Dimension,
44 DimensionElement,
45 DimensionGraph,
46 DimensionKeyColumnTag,
47 DimensionRecord,
48 DimensionRecordColumnTag,
49)
50from .._collectionType import CollectionType
51from ..wildcards import CollectionWildcard
52from ._query_backend import QueryBackend
53from ._query_context import QueryContext
54from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader
57@final
58class Query:
59 """A general-purpose representation of a registry query.
61 Parameters
62 ----------
63 dimensions : `DimensionGraph`
64 The dimensions that span the query and are used to join its relations
65 together.
66 backend : `QueryBackend`
67 Backend object used to create the query and new ones derived from it.
68 context : `QueryContext`
69 Context manager that holds relation engines and database connections
70 for the query.
71 relation : `Relation`
72 The relation tree representation of the query as a series of operations
73 on tables.
74 governor_constraints : `~collections.abc.Mapping` [ `str`, \
75 `~collections.abc.Set` [ `str` ] ]
76 Constraints on governor dimensions encoded in this query's relation.
77 This is a mapping from governor dimension name to sets of values that
78 dimension may take.
79 is_deferred : `bool`
80 If `True`, modifier methods that return a related `Query` object should
81 not immediately execute the new query.
82 has_record_columns : `bool` or `DimensionElement`
83 Whether this query's relation already includes columns for all or some
84 dimension element records: `True` means all elements in ``dimensions``
85 either have records present in ``record_caches`` or all columns present
86 in ``relation``, while a specific `DimensionElement` means that element
87 does.
88 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
89 `~collections.abc.Mapping`
90 [ `DataCoordinate`, `DimensionRecord` ] ], optional
91 Cached dimension record values, organized first by dimension element
92 and then by data ID.
94 Notes
95 -----
96 Iterating over a `Query` yields mappings from `ColumnTag` to the associated
97 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
98 `iter_dimension_records` methods can be used to instead iterate over
99 various butler primitives derived from these rows.
101 Iterating over a `Query` may or may not execute database queries again each
102 time, depending on the state of its relation tree - see `Query.run` for
103 details.
105 Query is immutable; all methods that might appear to modify it in place
106 actually return a new object (though many attributes will be shared).
108 Query is currently (still) an internal-to-Registry object, with only the
109 "QueryResults" classes that are backed by it directly exposed to users. It
110 has been designed with the intent that it will eventually play a larger
111 role, either as the main query result object in a redesigned query
112 interface, or a "power user" result option that accompanies simpler
113 replacements for the current "QueryResults" objects.
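Examples
--------
A minimal usage sketch. The ``query`` variable is hypothetical here; in real
code a `Query` is constructed by `Registry` internals rather than by users.

>>> with query.open_context():
...     rows = list(query)                      # mappings of `ColumnTag` to value
...     data_ids = list(query.iter_data_ids())  # `DataCoordinate` objects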
114 """
116 def __init__(
117 self,
118 dimensions: DimensionGraph,
119 backend: QueryBackend[QueryContext],
120 context: QueryContext,
121 relation: Relation,
122 governor_constraints: Mapping[str, Set[str]],
123 is_deferred: bool,
124 has_record_columns: bool | DimensionElement,
125 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
126 ):
127 self._dimensions = dimensions
128 self._backend = backend
129 self._context = context
130 self._relation = relation
131 self._governor_constraints = governor_constraints
132 self._is_deferred = is_deferred
133 self._has_record_columns = has_record_columns
134 self._record_caches = record_caches if record_caches is not None else {}
136 @property
137 def dimensions(self) -> DimensionGraph:
138 """The dimensions that span the query and are used to join its
139 relations together (`DimensionGraph`).
140 """
141 return self._dimensions
143 @property
144 def relation(self) -> Relation:
145 """The relation tree representation of the query as a series of
146 operations on tables (`Relation`).
147 """
148 return self._relation
150 @property
151 def has_record_columns(self) -> bool | DimensionElement:
152 """Whether this query's relation already includes columns for all or
153 some dimension element records (`bool` or `DimensionElement`).
154 """
155 return self._has_record_columns
157 @property
158 def backend(self) -> QueryBackend[QueryContext]:
159 """Backend object used to create the query and new ones derived from it
160 (`QueryBackend`).
161 """
162 return self._backend
164 @contextmanager
165 def open_context(self) -> Iterator[None]:
166 """Return a context manager that ensures a database connection is
167 established and temporary tables and cursors have a defined lifetime.
169 Returns
170 -------
171 context : `contextlib.AbstractContextManager`
172 Context manager with no return value.
173 """
174 if self._context.is_open:
175 yield
176 else:
177 with self._context:
178 yield
180 def __str__(self) -> str:
181 return str(self._relation)
183 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
184 return iter(self._context.fetch_iterable(self._relation))
186 def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
187 """Return an iterator that converts result rows to data IDs.
189 Parameters
190 ----------
191 dimensions : `DimensionGraph`, optional
192 Dimensions of the data IDs to return. If not provided,
193 ``self.dimensions`` is used.
195 Returns
196 -------
197 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
198 Iterator that yields data IDs.
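Examples
--------
A sketch assuming ``query`` spans ``visit`` and ``detector`` dimensions (the
dimension names and the ``query`` variable are hypothetical):

>>> visit_only = query.dimensions.universe.extract(["visit"])
>>> with query.open_context():
...     for data_id in query.iter_data_ids(visit_only):
...         print(data_id["visit"])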
199 """
200 if dimensions is None:
201 dimensions = self._dimensions
202 reader = DataCoordinateReader.make(
203 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
204 )
205 if not (reader.columns_required <= self.relation.columns):
206 raise ColumnError(
207 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
208 f"for data IDs with dimensions {dimensions}."
209 )
210 return (reader.read(row) for row in self)
212 def iter_dataset_refs(
213 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
214 ) -> Iterator[DatasetRef]:
215 """Return an iterator that converts result rows to dataset references.
217 Parameters
218 ----------
219 dataset_type : `DatasetType`
220 The parent dataset type to yield references for.
221 components : `~collections.abc.Sequence` [ `None` or `str` ]
222 Which component dataset types to construct refs for from each row
223 representing a parent; `None` for the parent itself.
225 Returns
226 -------
227 refs : `~collections.abc.Iterator` [ `DatasetRef` ]
228 Iterator that yields (resolved) dataset references.
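Examples
--------
A sketch assuming ``query`` already includes a dataset search (see
`find_datasets`); ``raw_dataset_type`` is a hypothetical parent `DatasetType`:

>>> with query.open_context():
...     for ref in query.iter_dataset_refs(raw_dataset_type):
...         print(ref.dataId, ref.run)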
229 """
230 reader = DatasetRefReader(
231 dataset_type,
232 translate_collection=self._backend.get_collection_name,
233 records=self._has_record_columns is True,
234 record_caches=self._record_caches,
235 )
236 if not (reader.columns_required <= self.relation.columns):
237 raise ColumnError(
238 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
239 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
240 )
241 for row in self:
242 parent_ref = reader.read(row)
243 for component in components:
244 if component is None:
245 yield parent_ref
246 else:
247 yield parent_ref.makeComponentRef(component)
249 def iter_data_ids_and_dataset_refs(
250 self, dataset_type: DatasetType, dimensions: DimensionGraph | None = None
251 ) -> Iterator[tuple[DataCoordinate, DatasetRef]]:
252 """Iterate over pairs of data IDs and dataset refs.
254 This permits the data ID dimensions to differ from the dataset
255 dimensions.
257 Parameters
258 ----------
259 dataset_type : `DatasetType`
260 The parent dataset type to yield references for.
261 dimensions : `DimensionGraph`, optional
262 Dimensions of the data IDs to return. If not provided,
263 ``self.dimensions`` is used.
265 Returns
266 -------
267 pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`,
268 `DatasetRef` ] ]
269 An iterator over (data ID, dataset reference) pairs.
270 """
271 if dimensions is None:
272 dimensions = self._dimensions
273 data_id_reader = DataCoordinateReader.make(
274 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
275 )
276 dataset_reader = DatasetRefReader(
277 dataset_type,
278 translate_collection=self._backend.get_collection_name,
279 records=self._has_record_columns is True,
280 record_caches=self._record_caches,
281 )
282 if not (data_id_reader.columns_required <= self.relation.columns):
283 raise ColumnError(
284 f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} "
285 f"for data IDs with dimensions {dimensions}."
286 )
287 if not (dataset_reader.columns_required <= self.relation.columns):
288 raise ColumnError(
289 f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} "
290 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
291 )
292 for row in self:
293 yield (data_id_reader.read(row), dataset_reader.read(row))
295 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
296 """Return an iterator that converts result rows to dimension records.
298 Parameters
299 ----------
300 element : `DimensionElement`, optional
301 Dimension element whose records will be returned. If not provided,
302 `has_record_columns` must be a `DimensionElement` instance.
304 Returns
305 -------
306 records : `~collections.abc.Iterator` [ `DimensionRecord` ]
307 Iterator that yields dimension records.
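Examples
--------
A sketch assuming the query carries record columns for a hypothetical
``detector`` element (e.g. after `with_record_columns`):

>>> detector = query.dimensions.universe["detector"]
>>> with query.open_context():
...     for record in query.iter_dimension_records(detector):
...         print(record.dataId)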
308 """
309 if element is None:
310 match self._has_record_columns:
311 case True | False:
312 raise ValueError("No default dimension element in query; 'element' must be given.")
313 case only_element_with_records:
314 element = only_element_with_records
315 if (cache := self._record_caches.get(element)) is not None:
316 return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
317 else:
318 reader = DimensionRecordReader(element)
319 if not (reader.columns_required <= self.relation.columns):
320 raise ColumnError(
321 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
322 f"for records of element {element.name}."
323 )
324 return (reader.read(row) for row in self)
326 def run(self) -> Query:
327 """Execute the query and hold its results in memory.
329 Returns
330 -------
331 executed : `Query`
332 New query that holds the query results.
334 Notes
335 -----
336 Iterating over the results of a query that has been `run` will always
337 iterate over an existing container, while iterating over a query that
338 has not been run will result in executing at least some of the query
339 each time.
341 Running a query also sets its `is_deferred` flag to `False`, which will
342 cause new queries constructed by its methods to be run immediately,
343 unless ``defer=True`` is passed to the factory method. After a query
344 has been run, factory methods will also tend to prefer to apply new
345 operations (e.g. `with_only_column`, `sliced`, `sorted`) via Python
346 code acting on the existing container rather than going back to SQL,
347 which can be less efficient overall than applying operations to a
348 deferred query and executing them all only at the end.
350 Running a query is represented in terms of relations by adding a
351 `~lsst.daf.relation.Materialization` marker relation in the iteration
352 engine and then processing the relation tree; this attaches the
353 container of rows to that new relation to short-circuit any future
354 processing of the tree and lock changes to the tree upstream of it.
355 This is very different from the SQL-engine
356 `~lsst.daf.relation.Materialization` added to the tree by the
357 `materialize` method from a user perspective, though it has a similar
358 representation in the relation tree.
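Examples
--------
A sketch (``query`` is hypothetical): execute once, then iterate the
in-memory rows repeatedly without re-running the underlying SQL.

>>> executed = query.run()
>>> first_pass = list(executed.iter_data_ids())
>>> second_pass = list(executed.iter_data_ids())  # reuses the stored container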
359 """
360 relation = (
361 # Make a new relation that definitely ends in the iteration engine
362 # (this does nothing if it already does).
363 self.relation.transferred_to(self._context.iteration_engine)
364 # Make the new relation save its rows to an in-memory Python
365 # collection in relation.payload when processed.
366 .materialized(name_prefix="run")
367 )
368 # Actually process the relation, simplifying out trivial relations,
369 # executing any SQL queries, and saving results to relation.payload.
370 # We discard the simplified relation that's returned, because we want
371 # the new query to have any extra diagnostic information contained in
372 # the original.
373 self._context.process(relation)
374 return self._copy(relation, False)
376 def materialized(self, defer_postprocessing: bool = True) -> Query:
377 """Materialize the results of this query in its context's preferred
378 engine.
380 Usually this means inserting the results into a temporary table in a
381 database.
383 Parameters
384 ----------
385 defer_postprocessing : `bool`, optional
386 If `True`, do not execute operations that occur in the context's
387 `QueryContext.iteration_engine` up front; instead insert and
388 execute a materialization upstream of them (e.g. via a SQL
389 ``INSERT INTO ... SELECT`` statement, with no fetching to the
390 client) and execute the postprocessing operations when iterating
391 over the query results. If `False`, and iteration-engine
392 postprocessing operations exist, run the full query, execute them
393 now, and upload the results.
394 If the relation is already in the preferred engine, this option
395 is ignored and the materialization will not involve fetching rows
396 to the iteration engine at all. If the relation has already been
397 materialized in the iteration engine (i.e. via `run`), then this
398 option is again ignored and an upload of the existing rows will
399 be performed.
401 Returns
402 -------
403 materialized : `Query`
404 Modified query with the same row-and-column content with a
405 materialization in ``self.context.preferred_engine``.
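Examples
--------
A sketch of materializing a common data ID query before joining several
dataset searches against it (the dataset type variables and the collection
name are hypothetical):

>>> base = query.materialized()
>>> flats = base.find_datasets(flat_dataset_type, collections="HSC/calib")
>>> biases = base.find_datasets(bias_dataset_type, collections="HSC/calib")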
406 """
407 if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
408 relation, stripped = self._context.strip_postprocessing(self._relation)
409 if relation.engine == self._context.preferred_engine:
410 # We got all the way to the engine we want to materialize in.
411 # Apply that operation to the tree, process it (which actually
412 # creates a temporary table and populates it), and then reapply
413 # the stripped operations.
414 relation = relation.materialized()
415 self._context.process(relation)
416 for operation in stripped:
417 relation = operation.apply(
418 relation, transfer=True, preferred_engine=self._context.iteration_engine
419 )
420 return self._copy(relation, True)
421 # Either defer_postprocessing=False, or attempting to strip off unary
422 # operations until we got to the preferred engine didn't work, because
423 # this tree doesn't actually involve the preferred engine. So we just
424 # transfer to the preferred engine first, and then materialize,
425 # process, and return.
426 relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
427 self._context.process(relation)
428 return self._copy(relation, True)
430 def projected(
431 self,
432 dimensions: Iterable[Dimension | str] | None = None,
433 unique: bool = True,
434 columns: Iterable[ColumnTag] | None = None,
435 defer: bool | None = None,
436 drop_postprocessing: bool = False,
437 keep_record_columns: bool = True,
438 ) -> Query:
439 """Return a modified `Query` with a subset of this one's columns.
441 Parameters
442 ----------
443 dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ],
444 optional
445 Dimensions to include in the new query. Will be expanded to
446 include all required and implied dependencies. Must be a subset of
447 ``self.dimensions``. If not provided, ``self.dimensions`` is used.
448 unique : `bool`, optional
449 If `True` (default) deduplicate rows after dropping columns.
450 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
451 Additional dataset or dimension record columns to include in the
452 query. Dimension key columns added here are ignored unless they
453 extend beyond the key columns implied by the ``dimensions``
454 argument (which is an error).
455 defer : `bool`, optional
456 If `False`, run the new query immediately. If `True`, do not. If
457 `None` (default), the ``defer`` option passed when making ``self``
458 is used (this option is "sticky").
459 drop_postprocessing : `bool`, optional
460 Drop any iteration-engine operations that depend on columns that
461 are being removed (e.g. region-overlap tests when region columns
462 are being dropped), making it more likely that projection and
463 deduplication could be performed in the preferred engine, where
464 they may be more efficient.
465 keep_record_columns : `bool`, optional
466 If `True` (default) and this query `has_record_columns`, implicitly
467 add any of those to ``columns`` whose dimension element is in the
468 given ``dimensions``.
470 Returns
471 -------
472 query : `Query`
473 New query with the requested columns only, optionally deduplicated.
475 Notes
476 -----
477 Dataset columns are dropped from the new query unless passed via the
478 ``columns`` argument. All other columns are by default preserved.
480 Raises
481 ------
482 lsst.daf.relation.ColumnError
483 Raised if the columns to include in the new query are not all
484 present in the current query.
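Examples
--------
A sketch that reduces a query to unique ``exposure`` data IDs (the dimension
name and the ``query`` variable are hypothetical):

>>> narrowed = query.projected(["exposure"], unique=True)
>>> with narrowed.open_context():
...     exposures = list(narrowed.iter_data_ids())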
485 """
486 if dimensions is None:
487 dimensions = set(self._dimensions)
488 else:
489 dimensions = set(dimensions)
490 if columns is not None:
491 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
492 dimensions = self._dimensions.universe.extract(dimensions)
493 if columns is None:
494 columns = set()
495 else:
496 columns = set(columns)
497 columns.update(DimensionKeyColumnTag.generate(dimensions.names))
498 if keep_record_columns:
499 if self._has_record_columns is True:
500 for element in dimensions.elements:
501 if element not in self._record_caches:
502 columns.update(element.RecordClass.fields.columns)
503 elif self._has_record_columns in dimensions.elements:
504 element = cast(DimensionElement, self._has_record_columns)
505 columns.update(element.RecordClass.fields.columns)
506 if drop_postprocessing:
507 relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
508 # Dropping postprocessing Calculations could cause other columns
509 # we had otherwise intended to keep to be dropped as well.
510 columns &= relation.columns
511 else:
512 relation = self._relation
513 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
514 if unique:
515 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
516 return self._chain(relation, defer, dimensions=dimensions)
518 def with_record_columns(
519 self, dimension_element: DimensionElement | None = None, defer: bool | None = None
520 ) -> Query:
521 """Return a modified `Query` with additional dimension record columns
522 and/or caches.
524 Parameters
525 ----------
526 dimension_element : `DimensionElement`, optional
527 Single element to add record columns for, or `None` (default) to add
528 them for all elements in `dimensions`.
529 defer : `bool`, optional
530 If `False`, run the new query immediately. If `True`, do not. If
531 `None` (default), the ``defer`` option passed when making ``self``
532 is used (this option is "sticky").
534 Returns
535 -------
536 query : `Query`
537 New query with the requested record columns either in the relation
538 or (when possible) available via record caching.
540 Notes
541 -----
542 Adding dimension record columns is fundamentally different from adding
543 new dimension key columns or dataset columns, because it is purely an
544 addition of columns, not rows - we can always join in a dimension
545 element table (if it has not already been included) on keys already
546 present in the current relation, confident that there is exactly one
547 row in the dimension element table for each row in the current
548 relation.
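Examples
--------
A sketch (``query`` is hypothetical): request record columns for all
elements, then iterate over expanded data IDs.

>>> expanded = query.with_record_columns()
>>> with expanded.open_context():
...     for data_id in expanded.iter_data_ids():
...         assert data_id.hasRecords()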
549 """
550 if self._has_record_columns is True or self._has_record_columns == dimension_element:
551 return self
552 record_caches = dict(self._record_caches)
553 columns_required: set[ColumnTag] = set()
554 for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
555 if element in record_caches:
556 continue
557 if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
558 record_caches[element] = cache
559 else:
560 columns_required.update(element.RecordClass.fields.columns.keys())
561 # Modify the relation we have to remove any projections that dropped
562 # columns we now want, as long as the relation's behavior is otherwise
563 # unchanged.
564 columns_required -= self._relation.columns
565 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
566 columns_required.difference_update(columns_found)
567 if columns_required:
568 relation = self._backend.make_dimension_relation(
569 self._dimensions,
570 columns_required,
571 self._context,
572 initial_relation=relation,
573 # Don't permit joins to use any columns beyond those in the
574 # original relation, as that would change what this operation
575 # does.
576 initial_join_max_columns=frozenset(self._relation.columns),
577 governor_constraints=self._governor_constraints,
578 )
579 return self._chain(
580 relation,
581 defer=defer,
582 has_record_columns=True if dimension_element is None else dimension_element,
583 record_caches=record_caches,
584 )
586 def find_datasets(
587 self,
588 dataset_type: DatasetType,
589 collections: Any,
590 *,
591 find_first: bool = True,
592 columns: Set[str] = frozenset(("dataset_id", "run")),
593 defer: bool | None = None,
594 ) -> Query:
595 """Return a modified `Query` that includes a search for datasets of the
596 given type.
598 Parameters
599 ----------
600 dataset_type : `DatasetType`
601 Dataset type to search for. May not be a component.
602 collections
603 Collection search path or pattern. Must be a single collection
604 name or ordered sequence if ``find_first=True``. See
605 :ref:`daf_butler_collection_expressions` for more information.
606 find_first : `bool`, optional
607 If `True` (default) search collections in order until the first
608 match for each data ID is found. If `False`, return all matches in
609 all collections.
610 columns : `~collections.abc.Set` [ `str` ]
611 Dataset columns to include in the new query. Options include:
613 - ``dataset_id``: the unique identifier of the dataset. The type
614 is implementation-dependent. Never nullable. Included by
615 default.
617 - ``ingest_date``: the date and time the dataset was added to the
618 data repository.
620 - ``run``: the foreign key column to the `~CollectionType.RUN`
621 collection holding the dataset (not necessarily the collection
622 name). The type is dependent on the collection manager
623 implementation. Included by default.
625 - ``collection``: the foreign key column to the collection in
626 which the dataset was actually found in this search. The type is
627 dependent on the collection manager implementation. This may
628 differ from ``run`` if the dataset is present in a matching
629 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
630 collection, which means the same dataset may also appear multiple
631 times in the query results.
633 - ``timespan``: the validity range for datasets found in a
634 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
635 collection types.
637 The default columns (``dataset_id`` and ``run``) are sufficient to
638 enable `iter_dataset_refs`, which also takes care of translating
639 the internal ``RUN`` collection key into its public name.
641 Setting this to an empty set while passing ``find_first=False``
642 will return a query that is constrained by dataset existence in
643 some matching collection but does not actually return which
644 datasets existed.
645 defer : `bool`, optional
646 If `False`, run the new query immediately. If `True`, do not. If
647 `None` (default), the ``defer`` option passed when making ``self``
648 is used (this option is "sticky").
650 Returns
651 -------
652 query : `Query`
653 New query with the requested dataset columns, constrained by the
654 existence of datasets of this type in the given collection.
656 Raises
657 ------
658 lsst.daf.relation.ColumnError
659 Raised if a dataset search is already present in this query and
660 this is a find-first search.
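Examples
--------
A sketch of a find-first search (the dataset type variable, collection names,
and ``query`` are all hypothetical):

>>> with_bias = query.find_datasets(
...     bias_dataset_type, ["HSC/calib", "HSC/raw/all"], find_first=True
... )
>>> with with_bias.open_context():
...     refs = list(with_bias.iter_dataset_refs(bias_dataset_type))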
661 """
662 if find_first and DatasetColumnTag.filter_from(self._relation.columns):
663 raise ColumnError(
664 "Cannot search for datasets with find_first=True "
665 "on a query that already includes dataset columns."
666 )
667 #
668 # TODO: it'd be nice to do a QueryContext.restore_columns call here or
669 # similar, to look for dataset-constraint joins already present in the
670 # relation and expand them to include dataset-result columns as well,
671 # instead of doing a possibly-redundant join here. But that would
672 # require pushing relation usage down further into
673 # DatasetStorageManager.make_relation, so that it doesn't need to be
674 # given the columns, and then giving the relation system the ability to
675 # simplify-away redundant joins when they only provide columns that
676 # aren't ultimately used. The right time to look into that is probably
677 # when investigating whether the base QueryBackend should be
678 # responsible for producing an "abstract" relation tree of some sort,
679 # with the subclasses only responsible for filling it in with payloads
680 # (and possibly replacing some leaves with new sub-trees) when
681 # "processed" (or in some other "prepare" step).
682 #
683 # This is a low priority for three reasons:
684 # - there's some chance the database's query optimizer will simplify
685 # away these redundant joins;
686 # - at present, the main use of this code path is in QG generation,
687 # where we materialize the initial data ID query into a temp table
688 # and hence can't go back and "recover" those dataset columns anyway;
689 #
690 collections = CollectionWildcard.from_expression(collections)
691 if find_first:
692 collections.require_ordered()
693 rejections: list[str] = []
694 collection_records = self._backend.resolve_dataset_collections(
695 dataset_type,
696 collections,
697 governor_constraints=self._governor_constraints,
698 allow_calibration_collections=True,
699 rejections=rejections,
700 )
701 # If the dataset type has dimensions not in the current query, or we
702 # need a temporal join for a calibration collection, either restore
703 # those columns or join them in.
704 full_dimensions = dataset_type.dimensions.union(self._dimensions)
705 relation = self._relation
706 record_caches = self._record_caches
707 base_columns_required: set[ColumnTag] = {
708 DimensionKeyColumnTag(name) for name in full_dimensions.names
709 }
710 spatial_joins: list[tuple[str, str]] = []
711 if not (dataset_type.dimensions <= self._dimensions):
712 if self._has_record_columns is True:
713 # This query is for expanded data IDs, so if we add new
714 # dimensions to the query we need to be able to get records for
715 # the new dimensions.
716 record_caches = dict(self._record_caches)
717 for element in full_dimensions.elements:
718 if element in record_caches:
719 continue
720 if (
721 cache := self._backend.get_dimension_record_cache(element.name, self._context)
722 ) is not None:
723 record_caches[element] = cache
724 else:
725 base_columns_required.update(element.RecordClass.fields.columns.keys())
726 # See if we need spatial joins between the current query and the
727 # dataset type's dimensions. The logic here is for multiple
728 # spatial joins in general, but in practice it'll be exceedingly
729 # rare for there to be more than one. We start by figuring out
730 # which spatial "families" (observations vs. skymaps, skypix
731 # systems) are present on only one side and not the other.
732 lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial
733 rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial
734 # Now we iterate over the Cartesian product of those, so e.g.
735 # if the query has {tract, patch, visit} and the dataset type
736 # has {htm7} dimensions, the iterations of this loop
737 # correspond to: (skymap, htm), (observations, htm).
738 for lhs_spatial_family, rhs_spatial_family in itertools.product(
739 lhs_spatial_families, rhs_spatial_families
740 ):
741 # For each pair we add a join between the most-precise element
742 # present in each family (e.g. patch beats tract).
743 spatial_joins.append(
744 (
745 lhs_spatial_family.choose(full_dimensions.elements).name,
746 rhs_spatial_family.choose(full_dimensions.elements).name,
747 )
748 )
749 # Set up any temporal join between the query dimensions and CALIBRATION
750 # collection's validity ranges.
751 temporal_join_on: set[ColumnTag] = set()
752 if any(r.type is CollectionType.CALIBRATION for r in collection_records):
753 for family in self._dimensions.temporal:
754 endpoint = family.choose(self._dimensions.elements)
755 temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan"))
756 base_columns_required.update(temporal_join_on)
757 # Note which of the many kinds of potentially-missing columns we have
758 # and add the rest.
759 base_columns_required.difference_update(relation.columns)
760 if base_columns_required:
761 relation = self._backend.make_dimension_relation(
762 full_dimensions,
763 base_columns_required,
764 self._context,
765 initial_relation=relation,
766 # Don't permit joins to use any columns beyond those in the
767 # original relation, as that would change what this
768 # operation does.
769 initial_join_max_columns=frozenset(self._relation.columns),
770 governor_constraints=self._governor_constraints,
771 spatial_joins=spatial_joins,
772 )
773 # Finally we can join in the search for the dataset query.
774 columns = set(columns)
775 columns.add("dataset_id")
776 if not collection_records:
777 relation = relation.join(
778 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
779 )
780 elif find_first:
781 relation = self._backend.make_dataset_search_relation(
782 dataset_type,
783 collection_records,
784 columns,
785 self._context,
786 join_to=relation,
787 temporal_join_on=temporal_join_on,
788 )
789 else:
790 relation = self._backend.make_dataset_query_relation(
791 dataset_type,
792 collection_records,
793 columns,
794 self._context,
795 join_to=relation,
796 temporal_join_on=temporal_join_on,
797 )
798 return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer)
800 def sliced(
801 self,
802 start: int = 0,
803 stop: int | None = None,
804 defer: bool | None = None,
805 ) -> Query:
806 """Return a modified `Query` with that takes a slice of this one's
807 rows.
809 Parameters
810 ----------
811 start : `int`, optional
812 First index to include, inclusive.
813 stop : `int` or `None`, optional
814 One past the last index to include (i.e. exclusive).
815 defer : `bool`, optional
816 If `False`, run the new query immediately. If `True`, do not. If
817 `None` (default), the ``defer`` option passed when making ``self``
818 is used (this option is "sticky").
820 Returns
821 -------
822 query : `Query`
823 New query with the requested slice.
825 Notes
826 -----
827 This operation must be implemented in the iteration engine if there are
828 postprocessing operations, which may be much less efficient than
829 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
830 in SQL).
832 Since query row order is usually arbitrary, it generally makes sense to
833 call `sorted` before calling `sliced` to make the results
834 deterministic. This is not checked because there are some contexts
835 where getting an arbitrary subset of the results of a given size
836 still makes sense.
837 """
838 return self._chain(self._relation[start:stop], defer)
840 def sorted(
841 self,
842 order_by: Iterable[SortTerm],
843 defer: bool | None = None,
844 ) -> Query:
845 """Return a modified `Query` that sorts this one's rows.
847 Parameters
848 ----------
849 order_by : `~collections.abc.Iterable` [ `SortTerm` ]
850 Expressions to sort by.
851 defer : `bool`, optional
852 If `False`, run the new query immediately. If `True`, do not. If
853 `None` (default), the ``defer`` option passed when making ``self``
854 is used (this option is "sticky").
856 Returns
857 -------
858 query : `Query`
859 New query with the requested sorting.
861 Notes
862 -----
863 The ``order_by`` expression can include references to dimension record
864 columns that were not present in the original relation; this is
865 similar to calling `with_record_columns` for those columns first (but
866 in this case column requests cannot be satisfied by record caches).
867 All other columns referenced must be present in the query already.
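Examples
--------
A sketch that combines `sorted` and `sliced` to get a deterministic "first
five rows" (``order_by_terms`` stands in for an iterable of `SortTerm`
objects built by the caller; it and ``query`` are hypothetical):

>>> first_five = query.sorted(order_by_terms).sliced(0, 5)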
868 """
869 op = Sort(tuple(order_by))
870 columns_required = set(op.columns_required)
871 columns_required.difference_update(self._relation.columns)
872 if columns_required:
873 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
874 columns_required.difference_update(columns_found)
875 if columns_required:
876 try:
877 relation = self._backend.make_dimension_relation(
878 self._dimensions,
879 columns_required,
880 self._context,
881 initial_relation=relation,
882 # Don't permit joins to use any columns beyond those in
883 # the original relation, as that would change what this
884 # operation does.
885 initial_join_max_columns=frozenset(self._relation.columns),
886 governor_constraints=self._governor_constraints,
887 )
888 except ColumnError as err:
889 raise ColumnError(
890 "Cannot sort by columns that were not included in the original query or "
891 "fully resolved by its dimensions."
892 ) from err
893 else:
894 relation = self._relation
895 relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
896 return self._chain(relation, defer)
898 def count(self, *, exact: bool = True, discard: bool = False) -> int:
899 """Count the number of rows in this query.
901 Parameters
902 ----------
903 exact : `bool`, optional
904 If `True` (default), return the exact number of rows. If `False`,
905 returning an upper bound is permitted if it can be done much more
906 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
907 ignoring client-side filtering that would otherwise take place.
908 discard : `bool`, optional
909 If `True`, compute the exact count even if it would require running
910 the full query and then throwing away the result rows after
911 counting them. If `False`, this is an error, as the user would
912 usually be better off executing the query first to fetch its rows
913 into a new query (or passing ``exact=False``). Ignored if
914 ``exact=False``.
916 Returns
917 -------
918 n_rows : `int`
919 Number of rows in the query, or an upper bound. This includes
920 duplicates, if there are any.
922 Raises
923 ------
924 RuntimeError
925 Raised if an exact count was requested and could not be obtained
926 without fetching and discarding rows.
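Examples
--------
A sketch (``query`` is hypothetical):

>>> rough = query.count(exact=False)                 # may be an upper bound
>>> precise = query.count(exact=True, discard=True)  # may run the full query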
927 """
928 if self._relation.min_rows == self._relation.max_rows:
929 return self._relation.max_rows
930 return self._context.count(self._relation, exact=exact, discard=discard)
932 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
933 """Check whether this query has any result rows at all.
935 Parameters
936 ----------
937 execute : `bool`, optional
938 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
939 determined prior to execution that the query would return no rows.
940 exact : `bool`, optional
941 If `True`, run the full query and perform post-query filtering if
942 needed, until at least one result row is found. If `False`, the
943 returned result does not account for post-query filtering, and
944 hence may be `True` even when all result rows would be filtered
945 out.
947 Returns
948 -------
949 any_rows : `bool`
950 Whether the query has any rows, or if it may have any rows if
951 ``exact=False``.
953 Raises
954 ------
955 RuntimeError
956 Raised if an exact check was requested and could not be obtained
957 without executing the query.
958 """
959 if self._relation.min_rows > 0:
960 return True
961 if self._relation.max_rows == 0:
962 return False
963 if execute:
964 return self._context.any(self._relation, execute=execute, exact=exact)
965 elif not exact:
966 return True
967 raise TypeError("Cannot obtain exact results without executing the query.")
969 def explain_no_results(self, execute: bool = True) -> list[str]:
970 """Return human-readable messages that may help explain why the query
971 yields no results.
973 Parameters
974 ----------
975 execute : `bool`, optional
976 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
977 of aspects of the query to more precisely determine where rows were
978 filtered out.
980 Returns
981 -------
982 messages : `~collections.abc.Iterable` [ `str` ]
983 String messages that describe reasons the query might not yield any
984 results.
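Examples
--------
A sketch (``query`` is hypothetical) combining `any` with this method:

>>> if not query.any():
...     for message in query.explain_no_results():
...         print(message)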
985 """
986 # First try without actually executing any queries.
987 diagnostics = Diagnostics.run(self._relation)
988 if diagnostics.is_doomed:
989 return diagnostics.messages
990 if execute:
991 # Try again, running LIMIT 1 queries as we walk back down the tree
992 # to look for relations with no rows:
993 diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
994 if diagnostics.is_doomed:
995 return diagnostics.messages
996 return []
998 def _copy(
999 self,
1000 relation: Relation,
1001 is_deferred: bool,
1002 dimensions: DimensionGraph | None = None,
1003 governor_constraints: Mapping[str, Set[str]] | None = None,
1004 has_record_columns: bool | DimensionElement | None = None,
1005 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
1006 ) -> Query:
1007 """Return a modified copy of this query with some attributes replaced.
1009 See class docs for parameter documentation; the only difference here
1010 is that the defaults are the values ``self`` was constructed with.
1011 """
1012 return Query(
1013 dimensions=self._dimensions if dimensions is None else dimensions,
1014 backend=self._backend,
1015 context=self._context,
1016 relation=relation,
1017 governor_constraints=(
1018 governor_constraints if governor_constraints is not None else self._governor_constraints
1019 ),
1020 is_deferred=is_deferred,
1021 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
1022 record_caches=self._record_caches if record_caches is None else record_caches,
1023 )
1025 def _chain(
1026 self,
1027 relation: Relation,
1028 defer: bool | None,
1029 dimensions: DimensionGraph | None = None,
1030 governor_constraints: Mapping[str, Set[str]] | None = None,
1031 has_record_columns: bool | DimensionElement | None = None,
1032 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
1033 ) -> Query:
1034 """Return a modified query with a new relation while handling the
1035 ubiquitous ``defer`` parameter's logic.
1037 Parameters
1038 ----------
1039 relation : `Relation`
1040 Relation for the new query.
1041 defer : `bool`
1042 If `False`, run the new query immediately. If `True`, do not. If
1043 `None` , the ``defer`` option passed when making ``self`` is used
1044 (this option is "sticky").
1045 dimensions : `DimensionGraph`, optional
1046 See class docs.
1047 governor_constraints : `~collections.abc.Mapping` [ `str`, \
1048 `~collections.abc.Set` [ `str` ] ], optional
1049 See class docs.
1050 has_record_columns : `bool` or `DimensionElement`, optional
1051 See class docs.
1052 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
1053 `~collections.abc.Mapping` \
1054 [ `DataCoordinate`, `DimensionRecord` ] ], optional
1055 See class docs.
1057 Returns
1058 -------
1059 chained : `Query`
1060 Modified query, or ``self`` if no modifications were actually
1061 requested.
1062 """
1063 if defer is None:
1064 defer = self._is_deferred
1065 if (
1066 relation is self._relation
1067 and dimensions is None
1068 and defer == self._is_deferred
1069 and record_caches is None
1070 and has_record_columns is None
1071 and governor_constraints is None
1072 ):
1073 return self
1074 result = self._copy(
1075 relation,
1076 is_deferred=True,
1077 governor_constraints=governor_constraints,
1078 dimensions=dimensions,
1079 has_record_columns=has_record_columns,
1080 record_caches=record_caches,
1081 )
1082 if not defer:
1083 result = result.run()
1084 return result