Coverage for python/lsst/daf/butler/direct_query_driver/_query_builder.py: 32% (205 statements)
« prev ^ index » next — coverage.py v7.13.5, created at 2026-04-24 08:17 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
# Public API of this module: the abstract query-builder interface and its two
# concrete implementations, plus the union-term helper struct.
__all__ = (
    "QueryBuilder",
    "SingleSelectQueryBuilder",
    "UnionQueryBuilder",
    "UnionQueryBuilderTerm",
)
37import dataclasses
38import itertools
39from abc import ABC, abstractmethod
40from collections.abc import Iterable, Set
41from typing import TYPE_CHECKING, Literal, TypeVar, overload
43import sqlalchemy
45from ..dimensions import DimensionGroup
46from ..queries import tree as qt
47from ..registry.interfaces import Database
48from ._query_analysis import (
49 QueryFindFirstAnalysis,
50 QueryJoinsAnalysis,
51 QueryTreeAnalysis,
52 ResolvedDatasetSearch,
53)
54from ._sql_builders import SqlColumns, SqlJoinsBuilder, SqlSelectBuilder
56if TYPE_CHECKING:
57 from ._driver import DirectQueryDriver
58 from ._postprocessing import Postprocessing
60_T = TypeVar("_T")
class QueryBuilder(ABC):
    """An abstract base class for objects that transform query descriptions
    into SQL and `Postprocessing`.

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of the initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns to include in the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    find_first_dataset : `str` or ``...`` or None
        Name of the dataset type that needs a find-first search. ``...``
        is used to indicate the dataset types in a union dataset query.
        `None` means find-first is not used.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        find_first_dataset: str | qt.AnyDatasetType | None,
    ):
        self.joins_analysis = tree_analysis.joins
        self.postprocessing = tree_analysis.postprocessing
        self.projection_columns = projection_columns
        self.final_columns = final_columns
        self.needs_dimension_distinct = False
        self.find_first_dataset = find_first_dataset

    joins_analysis: QueryJoinsAnalysis
    """Description of the "joins" stage of query construction."""

    projection_columns: qt.ColumnSet
    """The columns present in the query after the projection is applied.

    This is always a subset of `QueryJoinsAnalysis.columns`.
    """

    needs_dimension_distinct: bool = False
    """If `True`, the projection's dimensions do not include all dimensions in
    the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
    used to make post-projection rows unique.
    """

    find_first_dataset: str | qt.AnyDatasetType | None = None
    """If not `None`, this is a find-first query for this dataset.

    This is set even if the find-first search is trivial because there is only
    one resolved collection.
    """

    final_columns: qt.ColumnSet
    """The columns included in the SELECT clause of the complete SQL query
    that is actually executed.

    This is a subset of `projection_columns` that differs only in
    columns used by the `find_first` stage or an ORDER BY expression.

    Like all other `.queries.tree.ColumnSet` attributes, it does not include
    fields added directly to `SqlSelectBuilder.special`, which may also be
    added to the SELECT clause.
    """

    postprocessing: Postprocessing
    """Struct representing post-query processing in Python, which may require
    additional columns in the query results.
    """

    @abstractmethod
    def analyze_projection(self) -> None:
        """Analyze the "projection" stage of query construction, in which the
        query may be nested in a GROUP BY or DISTINCT subquery in order to
        ensure rows do not have duplicates.

        This modifies the builder in place, and should be called immediately
        after construction.

        Notes
        -----
        Implementations should delegate to `super` to set
        `needs_dimension_distinct`, but generally need to provide additional
        logic to determine whether a GROUP BY or DISTINCT will be needed for
        other reasons (e.g. duplication due to dataset searches over multiple
        collections).
        """
        # The projection gets interesting if it does not have all of the
        # dimension keys or dataset fields of the "joins" stage, because that
        # means it needs to do a GROUP BY or DISTINCT ON to get unique rows.
        # Subclass implementations handle the check for dataset fields.
        if self.projection_columns.dimensions != self.joins_analysis.columns.dimensions:
            assert self.projection_columns.dimensions.issubset(self.joins_analysis.columns.dimensions)
            # We're going from a larger set of dimensions to a smaller set;
            # that means we'll be doing a SELECT DISTINCT [ON] or GROUP BY.
            self.needs_dimension_distinct = True

    @abstractmethod
    def analyze_find_first(self) -> None:
        """Analyze the "find first" stage of query construction, in which a
        Common Table Expression with PARTITION ON may be used to find the first
        dataset for each data ID and dataset type in an ordered collection
        sequence.

        This modifies the builder in place, and should be called immediately
        after `analyze_projection`.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_joins(self, driver: DirectQueryDriver) -> None:
        """Translate the "joins" stage of the query to SQL.

        This modifies the builder in place.  It is the first step in the
        "apply" phase, and should be called after `analyze_find_first` finishes
        the "analysis" phase (if more than analysis is needed).

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        """Translate the "projection" stage of the query to SQL.

        This modifies the builder in place.  It is the second step in the
        "apply" phase, after `apply_joins`.

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        order_by : `~collections.abc.Iterable` [ \
                `.queries.tree.OrderExpression` ]
            Column expression used to order the query rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        """Transform the "find first" stage of the query to SQL.

        This modifies the builder in place.  It is the third and final step in
        the "apply" phase, after `apply_projection`.

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        """
        raise NotImplementedError()

    @overload
    def finish_select(
        self, return_columns: Literal[True] = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns]: ...

    @overload
    def finish_select(
        self, return_columns: Literal[False]
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, None]: ...

    @abstractmethod
    def finish_select(
        self, return_columns: bool = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns | None]:
        """Finish translating the query into executable SQL.

        Parameters
        ----------
        return_columns : `bool`
            If `True`, return a structure that organizes the SQLAlchemy
            column objects available to the query.

        Returns
        -------
        sql_select : `sqlalchemy.Select` or `sqlalchemy.CompoundSelect`
            A SELECT [UNION ALL] SQL query.
        sql_columns : `SqlColumns` or `None`
            The columns available to the query (including any available to
            an ORDER BY clause, not just those in the SELECT clause, in
            contexts where those are not the same).  May be `None` (but is not
            guaranteed to be) if ``return_columns=False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        """Finish translating the query into SQL that can be used as a
        subquery.

        Parameters
        ----------
        cte : `bool`, optional
            If `True`, nest the query in a common table expression (i.e. SQL
            WITH statement) instead of a subquery.

        Returns
        -------
        select_builder : `SqlSelectBuilder`
            A builder object that maps to a single SELECT statement.  This may
            directly hold the original query with no subquery or CTE if that
            query was a single SELECT with no GROUP BY or DISTINCT; in either
            case it is guaranteed that modifying this builder's result columns
            and transforming it into a SELECT will not change the number of
            rows.
        """
        raise NotImplementedError()

    def _needs_collection_key_field(
        self, dataset_search: ResolvedDatasetSearch, fields_for_dataset: set[qt.AnyDatasetFieldName]
    ) -> bool:
        """Return `True` if the ``collection_key`` dataset field is needed to
        provide uniqueness for rows.
        """
        # For a dataset search, we sometimes want just one row for each dataset
        # and sometimes we need multiple rows, one for each collection that
        # the dataset was found in.
        #
        # We need multiple rows if any of the following are true:
        # - This is a find-first dataset search.  The rows will be ranked using
        #   a window function to determine the first collection containing a
        #   matching dataset, so we need a row for each collection to feed into
        #   the window.
        # - The user requested dataset fields that differ depending on which
        #   collection the dataset was found in, so we need a row for each
        #   collection to get all the possible values for the dataset fields.
        #
        # To ensure that we keep the necessary rows after DISTINCT or GROUP BY
        # is applied, we add a "collection_key" field that is unique for each
        # collection.

        # If there is only one collection, there will only be one row per
        # dataset, so we don't need to disambiguate.
        if len(dataset_search.collection_records) > 1:
            if (
                # We need a row for each collection, which will later
                # be filtered down using the window function.
                self.find_first_dataset is not None
                # We might have multiple calibration collections containing the
                # same dataset with the same timespan.
                or "timespan" in fields_for_dataset
                # The user specifically asked for a row for each collection we
                # found the dataset in.
                or "collection" in fields_for_dataset
            ):
                return True

        return False
class SingleSelectQueryBuilder(QueryBuilder):
    """A `QueryBuilder` implementation for queries that compile to exactly
    one SELECT statement (i.e. no UNION).

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.  This builder handles the
    common case, in which `.queries.tree.QueryTree.any_dataset` is `None`.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of the initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns present after the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    find_first_dataset : `str` or None
        Name of the dataset type that needs a find-first search, or `None`
        if find-first is not used.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        find_first_dataset: str | None,
    ) -> None:
        super().__init__(
            tree_analysis=tree_analysis,
            projection_columns=projection_columns,
            final_columns=final_columns,
            find_first_dataset=find_first_dataset,
        )
        assert not tree_analysis.union_datasets, "UnionQueryPlan should be used instead."
        self._select_builder = tree_analysis.initial_select_builder
        self.find_first = None
        self.needs_dataset_distinct = False

    needs_dataset_distinct: bool = False
    """If `True`, collection-specific dataset fields present in the "joins"
    stage are dropped by the projection, so a SELECT DISTINCT [ON] or GROUP BY
    must be added to make post-projection rows unique.
    """

    find_first: QueryFindFirstAnalysis[str] | None = None
    """Analysis of the "find_first" stage of query construction.

    `None` when there is no find-first search at all; evaluates to `False` in
    boolean contexts when the search is trivial because only a single
    collection remains after the collections have been resolved.
    """

    def analyze_projection(self) -> None:
        # Docstring inherited.
        super().analyze_projection()
        # A DISTINCT [ON] or GROUP BY is also needed when the "joins" stage
        # yields one row per collection for a data ID whose dataset fields are
        # all dropped by the projection.
        for dataset_type in self.joins_analysis.columns.dataset_fields:
            assert dataset_type is not qt.ANY_DATASET, "Union dataset in non-dataset-union query."
            if (
                not self.projection_columns.dataset_fields[dataset_type]
                and len(self.joins_analysis.datasets[dataset_type].collection_records) > 1
            ):
                # The "joins"-stage query has one row per collection per
                # data ID, but the projection wants one row per data ID.
                self.needs_dataset_distinct = True
                break
        # Dataset fields that survive the projection need the collection_key
        # column added when multiple collections are in play, so it can serve
        # as one of the DISTINCT or GROUP BY columns.
        for dataset_type, dataset_fields in self.projection_columns.dataset_fields.items():
            assert dataset_type is not qt.ANY_DATASET, "Union dataset in non-dataset-union query."
            if self._needs_collection_key_field(self.joins_analysis.datasets[dataset_type], dataset_fields):
                dataset_fields.add("collection_key")

    def analyze_find_first(self) -> None:
        # Docstring inherited.
        assert self.find_first_dataset is not qt.ANY_DATASET, "No dataset union in this query"
        assert self.find_first_dataset is not None
        search = self.joins_analysis.datasets[self.find_first_dataset]
        self.find_first = QueryFindFirstAnalysis(search)
        # With a find-first search over a calibration collection, the base
        # query must produce only one timespan per data ID + collection; the
        # only way to verify that is a GROUP BY plus a COUNT that is inspected
        # in postprocessing.
        if search.is_calibration_search:
            self.postprocessing.check_validity_match_count = True

    def apply_joins(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        driver.apply_initial_query_joins(
            self._select_builder, self.joins_analysis, union_dataset_dimensions=None
        )
        driver.apply_missing_dimension_joins(self._select_builder, self.joins_analysis)

    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        # Docstring inherited.
        driver.project_spatial_join_filtering(
            self.projection_columns, self.postprocessing, [self._select_builder]
        )
        find_first_name = self.find_first.search.name if self.find_first is not None else None
        driver.apply_query_projection(
            self._select_builder,
            self.postprocessing,
            join_datasets=self.joins_analysis.datasets,
            union_datasets=None,
            projection_columns=self.projection_columns,
            needs_dimension_distinct=self.needs_dimension_distinct,
            needs_dataset_distinct=self.needs_dataset_distinct,
            needs_validity_match_count=self.postprocessing.check_validity_match_count,
            find_first_dataset=find_first_name,
            order_by=order_by,
        )

    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        if self.find_first:
            self._select_builder = driver.apply_query_find_first(
                self._select_builder, self.postprocessing, self.find_first
            )

    # The overloads in the base class seem to keep MyPy from recognizing the
    # return type as covariant.
    def finish_select(  # type: ignore
        self,
        return_columns: bool = True,
    ) -> tuple[sqlalchemy.Select, SqlColumns]:
        # Docstring inherited.
        self._select_builder.columns = self.final_columns
        sql = self._select_builder.select(self.postprocessing)
        return sql, self._select_builder.joins

    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        # Docstring inherited.
        self._select_builder.columns = self.final_columns
        return self._select_builder.nested(cte=cte, postprocessing=self.postprocessing)
@dataclasses.dataclass
class UnionQueryBuilderTerm:
    """A helper struct that holds state for `UnionQueryBuilder` that
    corresponds to a set of dataset types with the same post-filtering
    collection sequence.
    """

    select_builders: list[SqlSelectBuilder]
    """Under-construction SQL queries associated with this plan, to be unioned
    together when complete.

    Each entry corresponds to a different dataset type and a single SELECT;
    note that this means a `UnionQueryBuilderTerm` does not map 1-1 with a
    SELECT in the final UNION - it maps to a set of extremely similar SELECTs
    that differ only in the dataset type name injected into each SELECT at the
    end.
    """

    datasets: ResolvedDatasetSearch[list[str]]
    """Searches for datasets of different types to be joined into the rest of
    the query, with the results (after projection and find-first) unioned
    together.

    The dataset types in a single `UnionQueryBuilderTerm` have the exact same
    post-filtering collection search path, and hence the exact same query
    plan, aside from the dataset type used to generate their dataset subquery.
    Dataset types that have the same dimensions but do not have the same
    post-filtering collection search path go in different
    `UnionQueryBuilderTerm` instances, which still contribute to the same
    UNION [ALL] query.  Dataset types with different dimensions cannot go in
    the same SQL query at all.
    """

    needs_dataset_distinct: bool = False
    """If `True`, the projection columns do not include collection-specific
    dataset fields that were present in the "joins" stage, and hence a SELECT
    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
    unique.
    """

    needs_validity_match_count: bool = False
    """Whether this query needs a validity match column for postprocessing
    to check.

    This can be `False` even if `Postprocessing.check_validity_match_count` is
    `True`, indicating that some other term in the union needs the column and
    hence this term just needs a dummy column (with "1" as the value).
    """

    find_first: QueryFindFirstAnalysis[list[str]] | None = None
    """Description of the "find_first" stage of query construction.

    This attribute is `None` if there is no find-first search at all, and
    evaluates to `False` in boolean contexts if the search is trivial because
    there is only one collection after the collections have been resolved.
    """
class UnionQueryBuilder(QueryBuilder):
    """An implementation of `QueryBuilder` for queries that are structured as
    a UNION ALL with one SELECT for each dataset type.

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.  This builder is used for
    special butler queries where `.queries.tree.QueryTree.any_dataset` is not
    `None`.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of the initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns to include in the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    union_dataset_dimensions : `DimensionGroup`
        Dimensions of the dataset types that comprise the union.
    find_first_dataset : `str` or ``...`` or None
        Name of the dataset type that needs a find-first search. ``...``
        is used to indicate the dataset types in a union dataset query.
        `None` means find-first is not used.

    Notes
    -----
    `UnionQueryBuilder` can be in one of two states:

    - During the "analysis" phase and at the beginning of the "apply" phase,
      it has a single initial `SqlSelectBuilder`, because all union terms are
      identical at this stage.  The `UnionQueryBuilderTerm.select_builders`
      lists are empty.
    - Within `apply_joins`, this single `SqlSelectBuilder` is copied to
      populate the per-dataset-type `SqlSelectBuilder` instances in the
      `UnionQueryBuilderTerm.select_builders` lists.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        union_dataset_dimensions: DimensionGroup,
        find_first_dataset: str | qt.AnyDatasetType | None,
    ):
        super().__init__(
            tree_analysis=tree_analysis,
            projection_columns=projection_columns,
            final_columns=final_columns,
            find_first_dataset=find_first_dataset,
        )
        # Consumed (reset to None) by apply_joins once it has been copied into
        # each union term's select_builders list; see class Notes.
        self._initial_select_builder: SqlSelectBuilder | None = tree_analysis.initial_select_builder
        self.union_dataset_dimensions = union_dataset_dimensions
        # One term per group of dataset types that share a post-filtering
        # collection search path (see UnionQueryBuilderTerm).
        self.union_terms = [
            UnionQueryBuilderTerm(select_builders=[], datasets=datasets)
            for datasets in tree_analysis.union_datasets
        ]

    @property
    def db(self) -> Database:
        """The database object associated with the nested select builders."""
        if self._initial_select_builder is not None:
            return self._initial_select_builder.joins.db
        else:
            return self.union_terms[0].select_builders[0].joins.db

    @property
    def special(self) -> Set[str]:
        """The special columns associated with the nested select builders."""
        if self._initial_select_builder is not None:
            return self._initial_select_builder.joins.special.keys()
        else:
            return self.union_terms[0].select_builders[0].joins.special.keys()

    def analyze_projection(self) -> None:
        # Docstring inherited.
        super().analyze_projection()
        # See if we need to do a DISTINCT [ON] or GROUP BY to get unique rows
        # because we have rows for datasets in multiple collections with the
        # same data ID and dataset type.
        for dataset_type in self.joins_analysis.columns.dataset_fields:
            if not self.projection_columns.dataset_fields[dataset_type]:
                if dataset_type is qt.ANY_DATASET:
                    for union_term in self.union_terms:
                        if len(union_term.datasets.collection_records) > 1:
                            union_term.needs_dataset_distinct = True
                elif len(self.joins_analysis.datasets[dataset_type].collection_records) > 1:
                    # If a dataset being joined into all union terms has
                    # multiple collections, need_dataset_distinct is true
                    # for all union terms and we can exit the loop early.
                    for union_term in self.union_terms:
                        union_term.needs_dataset_distinct = True
                    break
        # If there are any dataset fields being propagated through the
        # projection and there is more than one collection, we need to include
        # the collection_key column so we can use that as one of the DISTINCT
        # or GROUP BY columns.
        for dataset_type, fields_for_dataset in self.projection_columns.dataset_fields.items():
            if dataset_type is qt.ANY_DATASET:
                for union_term in self.union_terms:
                    # If there is more than one collection for one union term,
                    # we need to add collection_key to all of them to keep the
                    # SELECT columns uniform.
                    if self._needs_collection_key_field(union_term.datasets, fields_for_dataset):
                        fields_for_dataset.add("collection_key")
                        break
            elif self._needs_collection_key_field(
                self.joins_analysis.datasets[dataset_type], fields_for_dataset
            ):
                fields_for_dataset.add("collection_key")

    def analyze_find_first(self) -> None:
        # Docstring inherited.
        assert self.find_first_dataset is not None
        if self.find_first_dataset is qt.ANY_DATASET:
            for union_term in self.union_terms:
                union_term.find_first = QueryFindFirstAnalysis(union_term.datasets)
                # If we're doing a find-first search and there's a calibration
                # collection in play, we need to make sure the rows coming out
                # of the base query have only one timespan for each data ID +
                # collection, and we can only do that with a GROUP BY and COUNT
                # that we inspect in postprocessing.
                # Because the postprocessing is applied to the full query, all
                # union terms will need this column, even if only one populates
                # it with a nontrivial value.
                if union_term.find_first.search.is_calibration_search:
                    self.postprocessing.check_validity_match_count = True
                    union_term.needs_validity_match_count = True
        else:
            # The query system machinery should actually be able to handle this
            # case without too much difficulty (we just put the same
            # find_first plan in each union term), but the result doesn't seem
            # like it'd be useful, so it's better not to have to maintain that
            # logic branch.
            raise NotImplementedError(
                f"Additional dataset search {self.find_first_dataset!r} can only be joined into a "
                "union dataset query as a constraint in data IDs, not as a find-first result."
            )

    def apply_joins(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        assert self._initial_select_builder is not None
        driver.apply_initial_query_joins(
            self._initial_select_builder, self.joins_analysis, self.union_dataset_dimensions
        )
        # Join in the union datasets.  This makes one copy of the initial
        # select builder for each dataset type, and hence from here on we have
        # to repeat whatever we do to all select builders.
        for union_term in self.union_terms:
            for dataset_type_name in union_term.datasets.name:
                select_builder = self._initial_select_builder.copy()
                driver.join_dataset_search(
                    select_builder.joins,
                    union_term.datasets,
                    self.joins_analysis.columns.dataset_fields[qt.ANY_DATASET],
                    union_dataset_type_name=dataset_type_name,
                )
                union_term.select_builders.append(select_builder)
        self._initial_select_builder = None
        for union_term in self.union_terms:
            for select_builder in union_term.select_builders:
                driver.apply_missing_dimension_joins(select_builder, self.joins_analysis)

    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        # Docstring inherited.
        driver.project_spatial_join_filtering(
            self.projection_columns,
            self.postprocessing,
            itertools.chain.from_iterable(union_term.select_builders for union_term in self.union_terms),
        )
        for union_term in self.union_terms:
            for builder in union_term.select_builders:
                driver.apply_query_projection(
                    builder,
                    self.postprocessing,
                    join_datasets=self.joins_analysis.datasets,
                    union_datasets=union_term.datasets,
                    projection_columns=self.projection_columns,
                    needs_dimension_distinct=self.needs_dimension_distinct,
                    needs_dataset_distinct=union_term.needs_dataset_distinct,
                    needs_validity_match_count=union_term.needs_validity_match_count,
                    find_first_dataset=None if union_term.find_first is None else qt.ANY_DATASET,
                    order_by=order_by,
                )

    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        for union_term in self.union_terms:
            if not union_term.find_first:
                continue
            union_term.select_builders = [
                driver.apply_query_find_first(builder, self.postprocessing, union_term.find_first)
                for builder in union_term.select_builders
            ]

    @overload
    def finish_select(
        self, return_columns: Literal[True] = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns]: ...

    @overload
    def finish_select(
        self, return_columns: Literal[False]
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, None]: ...

    def finish_select(
        self, return_columns: bool = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns | None]:
        # Docstring inherited.
        terms: list[sqlalchemy.Select] = []
        for union_term in self.union_terms:
            # Each dataset type in a term becomes its own SELECT, with the
            # dataset type name injected as a literal "special" column.
            for dataset_type_name, select_builder in zip(
                union_term.datasets.name, union_term.select_builders
            ):
                select_builder.columns = self.final_columns
                select_builder.joins.special["_DATASET_TYPE_NAME"] = sqlalchemy.literal(dataset_type_name)
                terms.append(select_builder.select(self.postprocessing))
        sql: sqlalchemy.Select | sqlalchemy.CompoundSelect = (
            sqlalchemy.union_all(*terms) if len(terms) > 1 else terms[0]
        )
        columns: SqlColumns | None = None
        if return_columns:
            columns = SqlColumns(
                db=self.db,
            )
            columns.extract_columns(
                self.final_columns,
                self.postprocessing,
                self.special,
                column_collection=sql.selected_columns,
            )
        return sql, columns

    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        # Docstring inherited.
        sql_select, _ = self.finish_select(return_columns=False)
        from_clause = sql_select.cte() if cte else sql_select.subquery()
        joins_builder = SqlJoinsBuilder(
            db=self.db,
            from_clause=from_clause,
        ).extract_columns(self.final_columns, self.postprocessing)
        return SqlSelectBuilder(joins_builder, columns=self.final_columns)