Coverage for python / lsst / daf / butler / direct_query_driver / _query_builder.py: 32%

205 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-26 08:49 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ( 

31 "QueryBuilder", 

32 "SingleSelectQueryBuilder", 

33 "UnionQueryBuilder", 

34 "UnionQueryBuilderTerm", 

35) 

36 

37import dataclasses 

38import itertools 

39from abc import ABC, abstractmethod 

40from collections.abc import Iterable, Set 

41from typing import TYPE_CHECKING, Literal, TypeVar, overload 

42 

43import sqlalchemy 

44 

45from ..dimensions import DimensionGroup 

46from ..queries import tree as qt 

47from ..registry.interfaces import Database 

48from ._query_analysis import ( 

49 QueryFindFirstAnalysis, 

50 QueryJoinsAnalysis, 

51 QueryTreeAnalysis, 

52 ResolvedDatasetSearch, 

53) 

54from ._sql_builders import SqlColumns, SqlJoinsBuilder, SqlSelectBuilder 

55 

56if TYPE_CHECKING: 

57 from ._driver import DirectQueryDriver 

58 from ._postprocessing import Postprocessing 

59 

60_T = TypeVar("_T") 

61 

62 

class QueryBuilder(ABC):
    """An abstract base class for objects that transform query descriptions
    into SQL and `Postprocessing`.

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns to include in the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    find_first_dataset : `str` or ``...`` or `None`
        Name of the dataset type that needs a find-first search.  ``...``
        is used to indicate the dataset types in a union dataset query.
        `None` means find-first is not used.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        find_first_dataset: str | qt.AnyDatasetType | None,
    ):
        self.joins_analysis = tree_analysis.joins
        self.postprocessing = tree_analysis.postprocessing
        self.projection_columns = projection_columns
        self.final_columns = final_columns
        self.needs_dimension_distinct = False
        self.find_first_dataset = find_first_dataset

    joins_analysis: QueryJoinsAnalysis
    """Description of the "joins" stage of query construction."""

    projection_columns: qt.ColumnSet
    """The columns present in the query after the projection is applied.

    This is always a subset of `QueryJoinsAnalysis.columns`.
    """

    needs_dimension_distinct: bool = False
    """If `True`, the projection's dimensions do not include all dimensions in
    the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
    used to make post-projection rows unique.
    """

    find_first_dataset: str | qt.AnyDatasetType | None = None
    """If not `None`, this is a find-first query for this dataset.

    This is set even if the find-first search is trivial because there is only
    one resolved collection.
    """

    final_columns: qt.ColumnSet
    """The columns included in the SELECT clause of the complete SQL query
    that is actually executed.

    This is a subset of `QueryProjectionPlan.columns` that differs only in
    columns used by the `find_first` stage or an ORDER BY expression.

    Like all other `.queries.tree.ColumnSet` attributes, it does not include
    fields added directly to `SqlSelectBuilder.special`, which may also be
    added to the SELECT clause.
    """

    postprocessing: Postprocessing
    """Struct representing post-query processing in Python, which may require
    additional columns in the query results.
    """

    @abstractmethod
    def analyze_projection(self) -> None:
        """Analyze the "projection" stage of query construction, in which the
        query may be nested in a GROUP BY or DISTINCT subquery in order to
        ensure rows do not have duplicates.

        This modifies the builder in place, and should be called immediately
        after construction.

        Notes
        -----
        Implementations should delegate to `super` to set
        `needs_dimension_distinct`, but generally need to provide additional
        logic to determine whether a GROUP BY or DISTINCT will be needed for
        other reasons (e.g. duplication due to dataset searches over multiple
        collections).
        """
        # The projection gets interesting if it does not have all of the
        # dimension keys or dataset fields of the "joins" stage, because that
        # means it needs to do a GROUP BY or DISTINCT ON to get unique rows.
        # Subclass implementations handle the check for dataset fields.
        if self.projection_columns.dimensions != self.joins_analysis.columns.dimensions:
            assert self.projection_columns.dimensions.issubset(self.joins_analysis.columns.dimensions)
            # We're going from a larger set of dimensions to a smaller set;
            # that means we'll be doing a SELECT DISTINCT [ON] or GROUP BY.
            self.needs_dimension_distinct = True

    @abstractmethod
    def analyze_find_first(self) -> None:
        """Analyze the "find first" stage of query construction, in which a
        Common Table Expression with PARTITION ON may be used to find the first
        dataset for each data ID and dataset type in an ordered collection
        sequence.

        This modifies the builder in place, and should be called immediately
        after `analyze_projection`.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_joins(self, driver: DirectQueryDriver) -> None:
        """Translate the "joins" stage of the query to SQL.

        This modifies the builder in place.  It is the first step in the
        "apply" phase, and should be called after `analyze_find_first` finishes
        the "analysis" phase (if more than analysis is needed).

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        """Translate the "projection" stage of the query to SQL.

        This modifies the builder in place.  It is the second step in the
        "apply" phase, after `apply_joins`.

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        order_by : `~collections.abc.Iterable` [ \
                `.queries.tree.OrderExpression` ]
            Column expression used to order the query rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        """Transform the "find first" stage of the query to SQL.

        This modifies the builder in place.  It is the third and final step in
        the "apply" phase, after `apply_projection`.

        Parameters
        ----------
        driver : `DirectQueryDriver`
            Driver that invoked this builder and may be called back into for
            lower-level SQL generation operations.
        """
        raise NotImplementedError()

    @overload
    def finish_select(
        self, return_columns: Literal[True] = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns]: ...

    @overload
    def finish_select(
        self, return_columns: Literal[False]
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, None]: ...

    @abstractmethod
    def finish_select(
        self, return_columns: bool = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns | None]:
        """Finish translating the query into executable SQL.

        Parameters
        ----------
        return_columns : `bool`
            If `True`, return a structure that organizes the SQLAlchemy
            column objects available to the query.

        Returns
        -------
        sql_select : `sqlalchemy.Select` or `sqlalchemy.CompoundSelect`
            A SELECT [UNION ALL] SQL query.
        sql_columns : `SqlColumns` or `None`
            The columns available to the query (including any available to
            an ORDER BY clause, not just those in the SELECT clause, in
            contexts where those are not the same).  May be `None` (but is not
            guaranteed to be) if ``return_columns=False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        """Finish translating the query into SQL that can be used as a
        subquery.

        Parameters
        ----------
        cte : `bool`, optional
            If `True`, nest the query in a common table expression (i.e. SQL
            WITH statement) instead of a subquery.

        Returns
        -------
        select_builder : `SqlSelectBuilder`
            A builder object that maps to a single SELECT statement.  This may
            directly hold the original query with no subquery or CTE if that
            query was a single SELECT with no GROUP BY or DISTINCT; in either
            case it is guaranteed that modifying this builder's result columns
            and transforming it into a SELECT will not change the number of
            rows.
        """
        raise NotImplementedError()

    def _needs_collection_key_field(
        self, dataset_search: ResolvedDatasetSearch, fields_for_dataset: set[qt.AnyDatasetFieldName]
    ) -> bool:
        """Return `True` if the ``collection_key`` dataset field is needed to
        provide uniqueness for rows.
        """
        # For a dataset search, we sometimes want just one row for each dataset
        # and sometimes we need multiple rows, one for each collection that
        # the dataset was found in.
        #
        # We need multiple rows if any of the following are true:
        # - This is a find-first dataset search.  The rows will be ranked using
        #   a window function to determine the first collection containing a
        #   matching dataset, so we need a row for each collection to feed into
        #   the window.
        # - The user requested dataset fields that differ depending on which
        #   collection the dataset was found in, so we need a row for each
        #   collection to get all the possible values for the dataset fields.
        #
        # To ensure that we keep the necessary rows after DISTINCT or GROUP BY
        # is applied, we add a "collection_key" field that is unique for each
        # collection.

        # If there is only one collection, there will only be one row per
        # dataset, so we don't need to disambiguate.
        if len(dataset_search.collection_records) > 1:
            if (
                # We need a row for each collection, which will later
                # be filtered down using the window function.
                self.find_first_dataset is not None
                # We might have multiple calibration collections containing the
                # same dataset with the same timespan.
                or "timespan" in fields_for_dataset
                # The user specifically asked for a row for each collection we
                # found the dataset in.
                or "collection" in fields_for_dataset
            ):
                return True

        return False

326 

327 

class SingleSelectQueryBuilder(QueryBuilder):
    """An implementation of `QueryBuilder` for queries that are structured as
    a single SELECT (i.e. not a union).

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.  This builder handles most
    butler queries, i.e. those for which
    `.queries.tree.QueryTree.any_dataset` is `None`.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns to include in the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    find_first_dataset : `str` or `None`
        Name of the dataset type that needs a find-first search.  `None`
        means find-first is not used.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        find_first_dataset: str | None,
    ) -> None:
        super().__init__(
            tree_analysis=tree_analysis,
            projection_columns=projection_columns,
            final_columns=final_columns,
            find_first_dataset=find_first_dataset,
        )
        assert not tree_analysis.union_datasets, "UnionQueryPlan should be used instead."
        self._select_builder = tree_analysis.initial_select_builder
        self.find_first = None
        self.needs_dataset_distinct = False

    needs_dataset_distinct: bool = False
    """If `True`, the projection columns do not include collection-specific
    dataset fields that were present in the "joins" stage, and hence a SELECT
    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
    unique.
    """

    find_first: QueryFindFirstAnalysis[str] | None = None
    """Description of the "find_first" stage of query construction.

    This attribute is `None` if there is no find-first search at all, and
    `False` in boolean contexts if the search is trivial because there is only
    one collection after the collections have been resolved.
    """

    def analyze_projection(self) -> None:
        # Docstring inherited.
        super().analyze_projection()
        # The "joins" stage produces one row per collection for each dataset;
        # if the projection drops every field of some dataset type that was
        # searched for in multiple collections, those rows collapse to
        # duplicate data IDs and a DISTINCT [ON] / GROUP BY becomes necessary.
        for joined_type in self.joins_analysis.columns.dataset_fields:
            assert joined_type is not qt.ANY_DATASET, "Union dataset in non-dataset-union query."
            if not self.projection_columns.dataset_fields[joined_type]:
                if len(self.joins_analysis.datasets[joined_type].collection_records) > 1:
                    self.needs_dataset_distinct = True
                    break
        # Conversely, when dataset fields do survive the projection and more
        # than one collection is in play, the collection_key column must be
        # carried through so it can participate in the DISTINCT or GROUP BY
        # and keep one row per collection.
        for projected_type, projected_fields in self.projection_columns.dataset_fields.items():
            assert projected_type is not qt.ANY_DATASET, "Union dataset in non-dataset-union query."
            if self._needs_collection_key_field(
                self.joins_analysis.datasets[projected_type], projected_fields
            ):
                projected_fields.add("collection_key")

    def analyze_find_first(self) -> None:
        # Docstring inherited.
        assert self.find_first_dataset is not qt.ANY_DATASET, "No dataset union in this query"
        assert self.find_first_dataset is not None
        self.find_first = QueryFindFirstAnalysis(self.joins_analysis.datasets[self.find_first_dataset])
        # A calibration collection can hold the same dataset with multiple
        # timespans for one data ID, so the base query must GROUP BY and COUNT
        # to guarantee a single timespan per data ID + collection; the count
        # is then checked in postprocessing.
        if self.find_first.search.is_calibration_search:
            self.postprocessing.check_validity_match_count = True

    def apply_joins(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        builder = self._select_builder
        driver.apply_initial_query_joins(builder, self.joins_analysis, union_dataset_dimensions=None)
        driver.apply_missing_dimension_joins(builder, self.joins_analysis)

    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        # Docstring inherited.
        driver.project_spatial_join_filtering(
            self.projection_columns, self.postprocessing, [self._select_builder]
        )
        find_first_name = self.find_first.search.name if self.find_first is not None else None
        driver.apply_query_projection(
            self._select_builder,
            self.postprocessing,
            join_datasets=self.joins_analysis.datasets,
            union_datasets=None,
            projection_columns=self.projection_columns,
            needs_dimension_distinct=self.needs_dimension_distinct,
            needs_dataset_distinct=self.needs_dataset_distinct,
            needs_validity_match_count=self.postprocessing.check_validity_match_count,
            find_first_dataset=find_first_name,
            order_by=order_by,
        )

    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        # A trivial (single-collection) or absent find-first search is falsy
        # and requires no additional SQL.
        if self.find_first:
            self._select_builder = driver.apply_query_find_first(
                self._select_builder, self.postprocessing, self.find_first
            )

    # The overloads in the base class seem to keep MyPy from recognizing the
    # return type as covariant.
    def finish_select(  # type: ignore
        self,
        return_columns: bool = True,
    ) -> tuple[sqlalchemy.Select, SqlColumns]:
        # Docstring inherited.
        builder = self._select_builder
        builder.columns = self.final_columns
        return builder.select(self.postprocessing), builder.joins

    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        # Docstring inherited.
        builder = self._select_builder
        builder.columns = self.final_columns
        return builder.nested(cte=cte, postprocessing=self.postprocessing)

472 

473 

@dataclasses.dataclass
class UnionQueryBuilderTerm:
    """A helper struct that holds state for `UnionQueryBuilder` that
    corresponds to a set of dataset types with the same post-filtering
    collection sequence.
    """

    select_builders: list[SqlSelectBuilder]
    """Under-construction SQL queries associated with this plan, to be unioned
    together when complete.

    Each term corresponds to a different dataset type and a single SELECT; note
    that this means a `UnionQueryBuilderTerm` does not map 1-1 with a SELECT in
    the final UNION - it maps to a set of extremely similar SELECTs that differ
    only in the dataset type name injected into each SELECT at the end.
    """

    datasets: ResolvedDatasetSearch[list[str]]
    """Searches for datasets of different types to be joined into the rest of
    the query, with the results (after projection and find-first) unioned
    together.

    The dataset types in a single `QueryUnionTermPlan` have the exact same
    post-filtering collection search path, and hence the exact same query
    plan, aside from the dataset type used to generate their dataset subquery.
    Dataset types that have the same dimensions but do not have the same
    post-filtering collection search path go in different `QueryUnionTermPlan`
    instances, which still contribute to the same UNION [ALL] query.
    Dataset types with different dimensions cannot go in the same SQL query
    at all.
    """

    needs_dataset_distinct: bool = False
    """If `True`, the projection columns do not include collection-specific
    dataset fields that were present in the "joins" stage, and hence a SELECT
    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
    unique.
    """

    needs_validity_match_count: bool = False
    """Whether this query needs a validity match column for postprocessing
    to check.

    This can be `False` even if `Postprocessing.check_validity_match_count` is
    `True`, indicating that some other term in the union needs the column and
    hence this term just needs a dummy column (with "1" as the value).
    """

    find_first: QueryFindFirstAnalysis[list[str]] | None = None
    """Description of the "find_first" stage of query construction.

    This attribute is `None` if there is no find-first search at all, and
    `False` in boolean contexts if the search is trivial because there is only
    one collection after the collections have been resolved.
    """

529 

530 

class UnionQueryBuilder(QueryBuilder):
    """An implementation of `QueryBuilder` for queries that are structured as
    a UNION ALL with one SELECT for each dataset type.

    See `DirectQueryDriver.build_query` for an overview of query construction,
    including the role this class plays in it.  This builder is used for
    special butler queries where `.queries.tree.QueryTree.any_dataset` is not
    `None`.

    Parameters
    ----------
    tree_analysis : `QueryTreeAnalysis`
        Result of initial analysis of most of the query description.
        Considered consumed because nested attributes will be referenced and
        may be modified in-place in the future.
    projection_columns : `.queries.tree.ColumnSet`
        Columns to include in the query's "projection" stage, where a GROUP BY
        or DISTINCT may be performed.
    final_columns : `.queries.tree.ColumnSet`
        Columns to include in the final query.
    union_dataset_dimensions : `DimensionGroup`
        Dimensions of the dataset types that comprise the union.
    find_first_dataset : `str` or ``...`` or `None`
        Name of the dataset type that needs a find-first search.  ``...``
        is used to indicate the dataset types in a union dataset query.
        `None` means find-first is not used.

    Notes
    -----
    `UnionQueryBuilder` can be in one of two states:

    - During the "analysis" phase and at the beginning of the "apply" phase,
      it has a single initial `SqlSelectBuilder`, because all union terms are
      identical at this stage.  The `UnionQueryTerm.builder` lists are empty.
    - Within `apply_joins`, this single `SqlSelectBuilder` is copied to
      populate the per-dataset type `SqlSelectBuilder` instances in the
      `UnionQueryTerm.builders` lists.
    """

    def __init__(
        self,
        tree_analysis: QueryTreeAnalysis,
        *,
        projection_columns: qt.ColumnSet,
        final_columns: qt.ColumnSet,
        union_dataset_dimensions: DimensionGroup,
        find_first_dataset: str | qt.AnyDatasetType | None,
    ):
        super().__init__(
            tree_analysis=tree_analysis,
            projection_columns=projection_columns,
            final_columns=final_columns,
            find_first_dataset=find_first_dataset,
        )
        # Set to None in apply_joins once per-term builders are populated.
        self._initial_select_builder: SqlSelectBuilder | None = tree_analysis.initial_select_builder
        self.union_dataset_dimensions = union_dataset_dimensions
        self.union_terms = [
            UnionQueryBuilderTerm(select_builders=[], datasets=datasets)
            for datasets in tree_analysis.union_datasets
        ]

    @property
    def db(self) -> Database:
        """The database object associated with the nested select builders."""
        if self._initial_select_builder is not None:
            return self._initial_select_builder.joins.db
        else:
            return self.union_terms[0].select_builders[0].joins.db

    @property
    def special(self) -> Set[str]:
        """The special columns associated with the nested select builders."""
        if self._initial_select_builder is not None:
            return self._initial_select_builder.joins.special.keys()
        else:
            return self.union_terms[0].select_builders[0].joins.special.keys()

    def analyze_projection(self) -> None:
        # Docstring inherited.
        super().analyze_projection()
        # See if we need to do a DISTINCT [ON] or GROUP BY to get unique rows
        # because we have rows for datasets in multiple collections with the
        # same data ID and dataset type.
        for dataset_type in self.joins_analysis.columns.dataset_fields:
            if not self.projection_columns.dataset_fields[dataset_type]:
                if dataset_type is qt.ANY_DATASET:
                    # The union datasets' collection counts vary per term, so
                    # the distinct flag is set term-by-term.
                    for union_term in self.union_terms:
                        if len(union_term.datasets.collection_records) > 1:
                            union_term.needs_dataset_distinct = True
                elif len(self.joins_analysis.datasets[dataset_type].collection_records) > 1:
                    # If a dataset being joined into all union terms has
                    # multiple collections, need_dataset_distinct is true
                    # for all union terms and we can exit the loop early.
                    for union_term in self.union_terms:
                        union_term.needs_dataset_distinct = True
                    break
        # If there are any dataset fields being propagated through the
        # projection and there is more than one collection, we need to include
        # the collection_key column so we can use that as one of the DISTINCT
        # or GROUP BY columns.
        for dataset_type, fields_for_dataset in self.projection_columns.dataset_fields.items():
            if dataset_type is qt.ANY_DATASET:
                for union_term in self.union_terms:
                    # If there is more than one collection for one union term,
                    # we need to add collection_key to all of them to keep the
                    # SELECT columns uniform.  fields_for_dataset is shared by
                    # all terms, so one add suffices and we can stop looking.
                    if self._needs_collection_key_field(union_term.datasets, fields_for_dataset):
                        fields_for_dataset.add("collection_key")
                        break
            elif self._needs_collection_key_field(
                self.joins_analysis.datasets[dataset_type], fields_for_dataset
            ):
                fields_for_dataset.add("collection_key")

    def analyze_find_first(self) -> None:
        # Docstring inherited.
        assert self.find_first_dataset is not None
        if self.find_first_dataset is qt.ANY_DATASET:
            for union_term in self.union_terms:
                union_term.find_first = QueryFindFirstAnalysis(union_term.datasets)
                # If we're doing a find-first search and there's a calibration
                # collection in play, we need to make sure the rows coming out
                # of the base query have only one timespan for each data ID +
                # collection, and we can only do that with a GROUP BY and COUNT
                # that we inspect in postprocessing.
                # Because the postprocessing is applied to the full query, all
                # union terms will need this column, even if only one populates
                # it with a nontrivial value.
                if union_term.find_first.search.is_calibration_search:
                    self.postprocessing.check_validity_match_count = True
                    union_term.needs_validity_match_count = True
        else:
            # The query system machinery should actually be able to handle this
            # case without too much difficulty (we just put the same
            # find_first plan in each union term), but the result doesn't seem
            # like it'd be useful, so it's better not to have to maintain that
            # logic branch.
            raise NotImplementedError(
                f"Additional dataset search {self.find_first_dataset!r} can only be joined into a "
                "union dataset query as a constraint in data IDs, not as a find-first result."
            )

    def apply_joins(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        assert self._initial_select_builder is not None
        driver.apply_initial_query_joins(
            self._initial_select_builder, self.joins_analysis, self.union_dataset_dimensions
        )
        # Join in the union datasets.  This makes one copy of the initial
        # select builder for each dataset type, and hence from here on we have
        # to repeat whatever we do to all select builders.
        for union_term in self.union_terms:
            for dataset_type_name in union_term.datasets.name:
                select_builder = self._initial_select_builder.copy()
                driver.join_dataset_search(
                    select_builder.joins,
                    union_term.datasets,
                    self.joins_analysis.columns.dataset_fields[qt.ANY_DATASET],
                    union_dataset_type_name=dataset_type_name,
                )
                union_term.select_builders.append(select_builder)
        # The builder state transition described in the class Notes: from here
        # on only the per-term builders are used.
        self._initial_select_builder = None
        for union_term in self.union_terms:
            for select_builder in union_term.select_builders:
                driver.apply_missing_dimension_joins(select_builder, self.joins_analysis)

    def apply_projection(self, driver: DirectQueryDriver, order_by: Iterable[qt.OrderExpression]) -> None:
        # Docstring inherited.
        driver.project_spatial_join_filtering(
            self.projection_columns,
            self.postprocessing,
            itertools.chain.from_iterable(union_term.select_builders for union_term in self.union_terms),
        )
        for union_term in self.union_terms:
            for builder in union_term.select_builders:
                driver.apply_query_projection(
                    builder,
                    self.postprocessing,
                    join_datasets=self.joins_analysis.datasets,
                    union_datasets=union_term.datasets,
                    projection_columns=self.projection_columns,
                    needs_dimension_distinct=self.needs_dimension_distinct,
                    needs_dataset_distinct=union_term.needs_dataset_distinct,
                    needs_validity_match_count=union_term.needs_validity_match_count,
                    find_first_dataset=None if union_term.find_first is None else qt.ANY_DATASET,
                    order_by=order_by,
                )

    def apply_find_first(self, driver: DirectQueryDriver) -> None:
        # Docstring inherited.
        for union_term in self.union_terms:
            if not union_term.find_first:
                continue
            union_term.select_builders = [
                driver.apply_query_find_first(builder, self.postprocessing, union_term.find_first)
                for builder in union_term.select_builders
            ]

    @overload
    def finish_select(
        self, return_columns: Literal[True] = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns]: ...

    @overload
    def finish_select(
        self, return_columns: Literal[False]
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, None]: ...

    def finish_select(
        self, return_columns: bool = True
    ) -> tuple[sqlalchemy.CompoundSelect | sqlalchemy.Select, SqlColumns | None]:
        # Docstring inherited.
        terms: list[sqlalchemy.Select] = []
        for union_term in self.union_terms:
            # Builders were appended in apply_joins in the same order as
            # datasets.name, so the zip pairs each builder with its type name.
            for dataset_type_name, select_builder in zip(
                union_term.datasets.name, union_term.select_builders
            ):
                select_builder.columns = self.final_columns
                select_builder.joins.special["_DATASET_TYPE_NAME"] = sqlalchemy.literal(dataset_type_name)
                terms.append(select_builder.select(self.postprocessing))
        sql: sqlalchemy.Select | sqlalchemy.CompoundSelect = (
            sqlalchemy.union_all(*terms) if len(terms) > 1 else terms[0]
        )
        columns: SqlColumns | None = None
        if return_columns:
            columns = SqlColumns(
                db=self.db,
            )
            columns.extract_columns(
                self.final_columns,
                self.postprocessing,
                self.special,
                column_collection=sql.selected_columns,
            )
        return sql, columns

    def finish_nested(self, cte: bool = False) -> SqlSelectBuilder:
        # Docstring inherited.
        sql_select, _ = self.finish_select(return_columns=False)
        from_clause = sql_select.cte() if cte else sql_select.subquery()
        joins_builder = SqlJoinsBuilder(
            db=self.db,
            from_clause=from_clause,
        ).extract_columns(self.final_columns, self.postprocessing)
        return SqlSelectBuilder(joins_builder, columns=self.final_columns)