Coverage for python/lsst/daf/butler/direct_query_driver/_driver.py: 15%

412 statements  

coverage.py v7.4.4, created at 2024-04-19 10:53 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import uuid

__all__ = ("DirectQueryDriver",)

import dataclasses
import logging
import sys
from collections.abc import Iterable, Mapping, Set
from contextlib import ExitStack
from typing import TYPE_CHECKING, Any, cast, overload

import sqlalchemy

from .. import ddl
from .._dataset_type import DatasetType
from ..dimensions import DataIdValue, DimensionGroup, DimensionRecordSet, DimensionUniverse, SkyPixDimension
from ..name_shrinker import NameShrinker
from ..queries import tree as qt
from ..queries.driver import (
    DataCoordinateResultPage,
    DatasetRefResultPage,
    DimensionRecordResultPage,
    GeneralResultPage,
    PageKey,
    QueryDriver,
    ResultPage,
)
from ..queries.result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
    ResultSpec,
)
from ..registry import CollectionSummary, CollectionType, NoDefaultCollectionError, RegistryDefaults
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ..registry.managers import RegistryManagerInstances
from ._postprocessing import Postprocessing
from ._query_builder import QueryBuilder, QueryJoiner
from ._query_plan import (
    QueryFindFirstPlan,
    QueryJoinsPlan,
    QueryPlan,
    QueryProjectionPlan,
    ResolvedDatasetSearch,
)
from ._sql_column_visitor import SqlColumnVisitor

if TYPE_CHECKING:
    from ..registry.interfaces import Database


_LOG = logging.getLogger(__name__)


class DirectQueryDriver(QueryDriver):
    """The `QueryDriver` implementation for `DirectButler`.

    Parameters
    ----------
    db : `Database`
        Abstraction for the SQL database.
    universe : `DimensionUniverse`
        Definitions of all dimensions.
    managers : `RegistryManagerInstances`
        Struct of registry manager objects.
    defaults : `RegistryDefaults`
        Struct holding the default collection search path and governor
        dimensions.
    raw_page_size : `int`, optional
        Number of database rows to fetch for each result page. The actual
        number of rows in a page may be smaller due to postprocessing.
    constant_rows_limit : `int`, optional
        Maximum number of uploaded rows to include in queries via
        `Database.constant_rows`; above this limit a temporary table is used
        instead.
    postprocessing_filter_factor : `int`, optional
        The number of database rows we expect to have to fetch to yield a
        single output row for queries that involve postprocessing. This is
        purely a performance tuning parameter that attempts to balance between
        fetching too much and requiring multiple fetches; the true value is
        highly dependent on the actual query.
    """

    def __init__(
        self,
        db: Database,
        universe: DimensionUniverse,
        managers: RegistryManagerInstances,
        defaults: RegistryDefaults,
        raw_page_size: int = 10000,
        constant_rows_limit: int = 1000,
        postprocessing_filter_factor: int = 10,
    ):
        self.db = db
        self.managers = managers
        self._universe = universe
        self._defaults = defaults
        self._materializations: dict[qt.MaterializationKey, _MaterializationState] = {}
        self._upload_tables: dict[qt.DataCoordinateUploadKey, sqlalchemy.FromClause] = {}
        self._exit_stack: ExitStack | None = None
        self._raw_page_size = raw_page_size
        self._postprocessing_filter_factor = postprocessing_filter_factor
        self._constant_rows_limit = constant_rows_limit
        self._cursors: dict[PageKey, _Cursor] = {}

    def __enter__(self) -> None:
        self._exit_stack = ExitStack()
        # It might be nice to defer opening a transaction here until first use
        # to reduce the time spent in transactions. But it's worth noting that
        # this is the default low-level behavior of the Python SQLite driver,
        # and it makes it incredibly prone to deadlocks. We might be okay
        # here, because Query doesn't do true write operations - just temp
        # table writes - but I'm not confident that's enough to make delayed
        # transaction starts safe against deadlocks, and it'd be more
        # complicated to implement anyway.
        #
        # We start a transaction rather than just opening a connection to make
        # temp table and cursors work with pg_bouncer transaction affinity.
        self._exit_stack.enter_context(self.db.transaction(for_temp_tables=True))

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        assert self._exit_stack is not None
        self._materializations.clear()
        self._upload_tables.clear()
        while self._cursors:
            _, cursor = self._cursors.popitem()
            cursor.close(exc_type, exc_value, traceback)
        self._exit_stack.__exit__(exc_type, exc_value, traceback)
        self._exit_stack = None

    @property
    def universe(self) -> DimensionUniverse:
        return self._universe

    @overload
    def execute(
        self, result_spec: DataCoordinateResultSpec, tree: qt.QueryTree
    ) -> DataCoordinateResultPage: ...

    @overload
    def execute(
        self, result_spec: DimensionRecordResultSpec, tree: qt.QueryTree
    ) -> DimensionRecordResultPage: ...

    @overload
    def execute(self, result_spec: DatasetRefResultSpec, tree: qt.QueryTree) -> DatasetRefResultPage: ...

    @overload
    def execute(self, result_spec: GeneralResultSpec, tree: qt.QueryTree) -> GeneralResultPage: ...

    def execute(self, result_spec: ResultSpec, tree: qt.QueryTree) -> ResultPage:
        # Docstring inherited.
        if self._exit_stack is None:
            raise RuntimeError("QueryDriver context must be entered before queries can be executed.")
        _, builder = self.build_query(
            tree,
            final_columns=result_spec.get_result_columns(),
            order_by=result_spec.order_by,
            find_first_dataset=result_spec.find_first_dataset,
        )
        sql_select = builder.select()
        if result_spec.order_by:
            visitor = SqlColumnVisitor(builder.joiner, self)
            sql_select = sql_select.order_by(*[visitor.expect_scalar(term) for term in result_spec.order_by])
        if result_spec.limit is not None:
            if builder.postprocessing:
                builder.postprocessing.limit = result_spec.limit
            else:
                sql_select = sql_select.limit(result_spec.limit)
        if builder.postprocessing.limit is not None:
            # We might want to fetch many fewer rows than the default page
            # size if we have to implement limit in postprocessing.
            raw_page_size = min(
                self._postprocessing_filter_factor * builder.postprocessing.limit,
                self._raw_page_size,
            )
        else:
            raw_page_size = self._raw_page_size
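        # For example, with the default postprocessing_filter_factor of 10 and
        # a postprocessing limit of 5, we would fetch min(10 * 5, 10000) = 50
        # raw rows per page instead of the full default page size.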

        # Execute the query by initializing a _Cursor object that manages the
        # lifetime of the result.
        cursor = _Cursor(
            self.db,
            sql_select,
            result_spec=result_spec,
            name_shrinker=builder.joiner.name_shrinker,
            postprocessing=builder.postprocessing,
            raw_page_size=raw_page_size,
        )
        result_page = cursor.next()
        if result_page.next_key is not None:
            # Cursor has not been exhausted; add it to the driver for use by
            # fetch_next_page.
            self._cursors[result_page.next_key] = cursor
        return result_page

    @overload
    def fetch_next_page(
        self, result_spec: DataCoordinateResultSpec, key: PageKey
    ) -> DataCoordinateResultPage: ...

    @overload
    def fetch_next_page(
        self, result_spec: DimensionRecordResultSpec, key: PageKey
    ) -> DimensionRecordResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: DatasetRefResultSpec, key: PageKey) -> DatasetRefResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: GeneralResultSpec, key: PageKey) -> GeneralResultPage: ...

    def fetch_next_page(self, result_spec: ResultSpec, key: PageKey) -> ResultPage:
        # Docstring inherited.
        try:
            cursor = self._cursors.pop(key)
        except KeyError:
            raise RuntimeError("Cannot continue query result iteration after the query context has closed.")
        result_page = cursor.next()
        if result_page.next_key is not None:
            self._cursors[result_page.next_key] = cursor
        return result_page

    def materialize(
        self,
        tree: qt.QueryTree,
        dimensions: DimensionGroup,
        datasets: frozenset[str],
    ) -> qt.MaterializationKey:
        # Docstring inherited.
        if self._exit_stack is None:
            raise RuntimeError("QueryDriver context must be entered before 'materialize' is called.")
        _, builder = self.build_query(tree, qt.ColumnSet(dimensions))
        # Current implementation ignores 'datasets' aside from remembering
        # them, because figuring out what to put in the temporary table for
        # them is tricky, especially if calibration collections are involved.
        # That's okay because:
        #
        # - the query whose results we materialize includes the dataset
        #   searches as constraints;
        #
        # - we still (in Query.materialize) join the dataset searches back in
        #   anyway, and given materialized data IDs the join to the dataset
        #   search is straightforward and definitely well-indexed, and not much
        #   (if at all) worse than joining back in on a materialized UUID.
        #
        sql_select = builder.select()
        table = self._exit_stack.enter_context(self.db.temporary_table(builder.make_table_spec()))
        self.db.insert(table, select=sql_select)
        key = uuid.uuid4()
        self._materializations[key] = _MaterializationState(table, datasets, builder.postprocessing)
        return key

    def upload_data_coordinates(
        self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
    ) -> qt.DataCoordinateUploadKey:
        # Docstring inherited.
        if self._exit_stack is None:
            raise RuntimeError(
                "QueryDriver context must be entered before 'upload_data_coordinates' is called."
            )
        columns = qt.ColumnSet(dimensions).drop_implied_dimension_keys()
        table_spec = ddl.TableSpec(
            [columns.get_column_spec(logical_table, field).to_sql_spec() for logical_table, field in columns]
        )
        dict_rows: list[dict[str, Any]]
        if not columns:
            table_spec.fields.add(
                ddl.FieldSpec(
                    QueryBuilder.EMPTY_COLUMNS_NAME, dtype=QueryBuilder.EMPTY_COLUMNS_TYPE, nullable=True
                )
            )
            dict_rows = [{QueryBuilder.EMPTY_COLUMNS_NAME: None}]
        else:
            dict_rows = [dict(zip(dimensions.required, values)) for values in rows]
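            # For example, an upload for {instrument, detector} dimensions
            # turns a row like ("HSC", 42) into {"instrument": "HSC",
            # "detector": 42}; the values here are illustrative only.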

        from_clause: sqlalchemy.FromClause
        if len(dict_rows) > self._constant_rows_limit:
            from_clause = self._exit_stack.enter_context(self.db.temporary_table(table_spec))
            self.db.insert(from_clause, *dict_rows)
        else:
            from_clause = self.db.constant_rows(table_spec.fields, *dict_rows)
        key = uuid.uuid4()
        self._upload_tables[key] = from_clause
        return key

    def count(
        self,
        tree: qt.QueryTree,
        result_spec: ResultSpec,
        *,
        exact: bool,
        discard: bool,
    ) -> int:
        # Docstring inherited.
        columns = result_spec.get_result_columns()
        plan, builder = self.build_query(tree, columns, find_first_dataset=result_spec.find_first_dataset)
        if not all(d.collection_records for d in plan.joins.datasets.values()):
            return 0
        if not exact:
            builder.postprocessing = Postprocessing()
        if builder.postprocessing:
            if not discard:
                raise RuntimeError("Cannot count query rows exactly without discarding them.")
            sql_select = builder.select()
            builder.postprocessing.limit = result_spec.limit
            n = 0
            with self.db.query(sql_select.execution_options(yield_per=self._raw_page_size)) as results:
                for _ in builder.postprocessing.apply(results):
                    n += 1
            return n
        # If the query has DISTINCT or GROUP BY, nest it in a subquery so we
        # count deduplicated rows.
        builder = builder.nested()
        # Replace the columns of the query with just COUNT(*).
        builder.columns = qt.ColumnSet(self._universe.empty.as_group())
        count_func: sqlalchemy.ColumnElement[int] = sqlalchemy.func.count()
        builder.joiner.special["_ROWCOUNT"] = count_func
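        # The SQL rendered below then has roughly the shape
        #
        #     SELECT COUNT(*) FROM (SELECT DISTINCT ...) ...
        #
        # so deduplication happens before the rows are counted.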

        # Render and run the query.
        sql_select = builder.select()
        with self.db.query(sql_select) as result:
            count = cast(int, result.scalar())
        if result_spec.limit is not None:
            count = min(count, result_spec.limit)
        return count

    def any(self, tree: qt.QueryTree, *, execute: bool, exact: bool) -> bool:
        # Docstring inherited.
        plan, builder = self.build_query(tree, qt.ColumnSet(tree.dimensions))
        if not all(d.collection_records for d in plan.joins.datasets.values()):
            return False
        if not execute:
            if exact:
                raise RuntimeError("Cannot obtain exact result for 'any' without executing.")
            return True
        if builder.postprocessing and exact:
            sql_select = builder.select()
            with self.db.query(
                sql_select.execution_options(yield_per=self._postprocessing_filter_factor)
            ) as result:
                for _ in builder.postprocessing.apply(result):
                    return True
                return False
        sql_select = builder.select().limit(1)
        with self.db.query(sql_select) as result:
            return result.first() is not None

    def explain_no_results(self, tree: qt.QueryTree, execute: bool) -> Iterable[str]:
        # Docstring inherited.
        plan, _ = self.analyze_query(tree, qt.ColumnSet(tree.dimensions))
        if plan.joins.messages or not execute:
            return plan.joins.messages
        # TODO: guess at ways to split up query that might fail or succeed if
        # run separately, execute them with LIMIT 1 and report the results.
        return []

    def get_dataset_type(self, name: str) -> DatasetType:
        # Docstring inherited
        return self.managers.datasets[name].datasetType

    def get_default_collections(self) -> tuple[str, ...]:
        # Docstring inherited.
        if not self._defaults.collections:
            raise NoDefaultCollectionError("No collections provided and no default collections.")
        return tuple(self._defaults.collections)

    def build_query(
        self,
        tree: qt.QueryTree,
        final_columns: qt.ColumnSet,
        order_by: Iterable[qt.OrderExpression] = (),
        find_first_dataset: str | None = None,
    ) -> tuple[QueryPlan, QueryBuilder]:
        """Convert a query description into a mostly-completed `QueryBuilder`.

        Parameters
        ----------
        tree : `.queries.tree.QueryTree`
            Description of the joins and row filters in the query.
        final_columns : `.queries.tree.ColumnSet`
            Final output columns that should be emitted by the SQL query.
        order_by : `~collections.abc.Iterable` [ \
                `.queries.tree.OrderExpression` ], optional
            Column expressions to sort by.
        find_first_dataset : `str` or `None`, optional
            Name of a dataset type for which only one result row for each data
            ID should be returned, with the collections searched in order.

        Returns
        -------
        plan : `QueryPlan`
            Plan used to transform the query into SQL, including some
            information (e.g. diagnostics about doomed-to-fail dataset
            searches) that isn't transferred into the builder itself.
        builder : `QueryBuilder`
            Builder object that can be used to create a SQL SELECT via its
            `~QueryBuilder.select` method. We return this instead of a
            `sqlalchemy.Select` object itself to allow different methods to
            customize the SELECT clause itself (e.g. `count` can replace the
            columns selected with ``COUNT(*)``).
        """
        # See the QueryPlan docs for an overview of what these stages of query
        # construction do.
        plan, builder = self.analyze_query(tree, final_columns, order_by, find_first_dataset)
        self.apply_query_joins(plan.joins, builder.joiner)
        self.apply_query_projection(plan.projection, builder)
        builder = self.apply_query_find_first(plan.find_first, builder)
        builder.columns = plan.final_columns
        return plan, builder

    def analyze_query(
        self,
        tree: qt.QueryTree,
        final_columns: qt.ColumnSet,
        order_by: Iterable[qt.OrderExpression] = (),
        find_first_dataset: str | None = None,
    ) -> tuple[QueryPlan, QueryBuilder]:
        """Construct a plan for building a query and initialize a builder.

        Parameters
        ----------
        tree : `.queries.tree.QueryTree`
            Description of the joins and row filters in the query.
        final_columns : `.queries.tree.ColumnSet`
            Final output columns that should be emitted by the SQL query.
        order_by : `~collections.abc.Iterable` [ \
                `.queries.tree.OrderExpression` ], optional
            Column expressions to sort by.
        find_first_dataset : `str` or `None`, optional
            Name of a dataset type for which only one result row for each data
            ID should be returned, with the collections searched in order.

        Returns
        -------
        plan : `QueryPlan`
            Plan used to transform the query into SQL, including some
            information (e.g. diagnostics about doomed-to-fail dataset
            searches) that isn't transferred into the builder itself.
        builder : `QueryBuilder`
            Builder object initialized with overlap joins and constraints
            potentially included, with the remainder still present in
            `QueryJoinsPlan.predicate`.
        """
        # The fact that this method returns both a QueryPlan and an initial
        # QueryBuilder (rather than just a QueryPlan) is a tradeoff that lets
        # DimensionRecordStorageManager.process_query_overlaps (which is called
        # by the `_analyze_query_tree` call below) pull out overlap expressions
        # from the predicate at the same time it turns them into SQL table
        # joins (in the builder).
        joins_plan, builder = self._analyze_query_tree(tree)

        # The "projection" columns differ from the final columns by not
        # omitting any dimension keys (this keeps queries for different result
        # types more similar during construction), including any columns needed
        # only by order_by terms, and including the collection key if we need
        # it for GROUP BY or DISTINCT.
        projection_plan = QueryProjectionPlan(
            final_columns.copy(), joins_plan.datasets, find_first_dataset=find_first_dataset
        )
        projection_plan.columns.restore_dimension_keys()
        for term in order_by:
            term.gather_required_columns(projection_plan.columns)
        # The projection gets interesting if it does not have all of the
        # dimension keys or dataset fields of the "joins" stage, because that
        # means it needs to do a GROUP BY or DISTINCT ON to get unique rows.
        if projection_plan.columns.dimensions != joins_plan.columns.dimensions:
            assert projection_plan.columns.dimensions.issubset(joins_plan.columns.dimensions)
            # We're going from a larger set of dimensions to a smaller set,
            # that means we'll be doing a SELECT DISTINCT [ON] or GROUP BY.
            projection_plan.needs_dimension_distinct = True
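            # For example, a query joined over {visit, detector} whose results
            # only need {visit} has to deduplicate the per-detector rows
            # produced by the join.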

        for dataset_type, fields_for_dataset in joins_plan.columns.dataset_fields.items():
            if not projection_plan.columns.dataset_fields[dataset_type]:
                # The "joins"-stage query has one row for each collection for
                # each data ID, but the projection-stage query just wants
                # one row for each data ID.
                if len(joins_plan.datasets[dataset_type].collection_records) > 1:
                    projection_plan.needs_dataset_distinct = True
                    break
        # If there are any dataset fields being propagated through that
        # projection and there is more than one collection, we need to
        # include the collection_key column so we can use that as one of
        # the DISTINCT or GROUP BY columns.
        for dataset_type, fields_for_dataset in projection_plan.columns.dataset_fields.items():
            if len(joins_plan.datasets[dataset_type].collection_records) > 1:
                fields_for_dataset.add("collection_key")
        if projection_plan:
            # If there's a projection and we're doing postprocessing, we might
            # be collapsing the dimensions of the postprocessing regions. When
            # that happens, we want to apply an aggregate function to them that
            # computes the union of the regions that are grouped together.
            for element in builder.postprocessing.iter_missing(projection_plan.columns):
                if element.name not in projection_plan.columns.dimensions.elements:
                    projection_plan.region_aggregates.append(element)

        # The joins-stage query also needs to include all columns needed by the
        # downstream projection query. Note that this:
        # - never adds new dimensions to the joins stage (since those are
        #   always a superset of the projection-stage dimensions);
        # - does not affect our determination of
        #   projection_plan.needs_dataset_distinct, because any dataset fields
        #   being added to the joins stage here are already in the projection.
        joins_plan.columns.update(projection_plan.columns)

        find_first_plan = None
        if find_first_dataset is not None:
            find_first_plan = QueryFindFirstPlan(joins_plan.datasets[find_first_dataset])
            # If we're doing a find-first search and there's a calibration
            # collection in play, we need to make sure the rows coming out of
            # the base query have only one timespan for each data ID +
            # collection, and we can only do that with a GROUP BY and COUNT
            # that we inspect in postprocessing.
            if find_first_plan.search.is_calibration_search:
                builder.postprocessing.check_validity_match_count = True
        plan = QueryPlan(
            joins=joins_plan,
            projection=projection_plan,
            find_first=find_first_plan,
            final_columns=final_columns,
        )
        return plan, builder

    def apply_query_joins(self, plan: QueryJoinsPlan, joiner: QueryJoiner) -> None:
        """Modify a `QueryJoiner` to include all tables and other FROM and
        WHERE clause terms needed.

        Parameters
        ----------
        plan : `QueryJoinsPlan`
            Component of a `QueryPlan` relevant for the "joins" stage.
        joiner : `QueryJoiner`
            Component of a `QueryBuilder` that holds the FROM and WHERE
            clauses. This is expected to be initialized by `analyze_query`
            and will be modified in-place on return.
        """
        # Process data coordinate upload joins.
        for upload_key, upload_dimensions in plan.data_coordinate_uploads.items():
            joiner.join(
                QueryJoiner(self.db, self._upload_tables[upload_key]).extract_dimensions(
                    upload_dimensions.required
                )
            )
        # Process materialization joins. We maintain a set of dataset types
        # that were included in a materialization; searches for these datasets
        # can be dropped if they are only present to provide a constraint on
        # data IDs, since that's already embedded in a materialization.
        materialized_datasets: set[str] = set()
        for materialization_key, materialization_dimensions in plan.materializations.items():
            materialized_datasets.update(
                self._join_materialization(joiner, materialization_key, materialization_dimensions)
            )
        # Process dataset joins.
        for dataset_search in plan.datasets.values():
            self._join_dataset_search(
                joiner,
                dataset_search,
                plan.columns.dataset_fields[dataset_search.name],
            )
        # Join in dimension element tables that we know we need relationships
        # or columns from.
        for element in plan.iter_mandatory():
            joiner.join(
                self.managers.dimensions.make_query_joiner(
                    element, plan.columns.dimension_fields[element.name]
                )
            )
        # See if any dimension keys are still missing, and if so join in their
        # tables. Note that we know there are no fields needed from these.
        while not (joiner.dimension_keys.keys() >= plan.columns.dimensions.names):
            # Look for opportunities to join in multiple dimensions via single
            # table, to reduce the total number of tables joined in.
            missing_dimension_names = plan.columns.dimensions.names - joiner.dimension_keys.keys()
            best = self._universe[
                max(
                    missing_dimension_names,
                    key=lambda name: len(self._universe[name].dimensions.names & missing_dimension_names),
                )
            ]
            joiner.join(self.managers.dimensions.make_query_joiner(best, frozenset()))
        # Add the WHERE clause to the joiner.
        joiner.where(plan.predicate.visit(SqlColumnVisitor(joiner, self)))

    def apply_query_projection(self, plan: QueryProjectionPlan, builder: QueryBuilder) -> None:
        """Modify `QueryBuilder` to reflect the "projection" stage of query
        construction, which can involve a GROUP BY or DISTINCT [ON] clause
        that enforces uniqueness.

        Parameters
        ----------
        plan : `QueryProjectionPlan`
            Component of a `QueryPlan` relevant for the "projection" stage.
        builder : `QueryBuilder`
            Builder object that will be modified in place. Expected to be
            initialized by `analyze_query` and further modified by
            `apply_query_joins`.
        """
        builder.columns = plan.columns
        if not plan and not builder.postprocessing.check_validity_match_count:
            # Rows are already unique; nothing else to do in this method.
            return
        # This method generates either a SELECT DISTINCT [ON] or a SELECT with
        # GROUP BY. We'll work out which as we go.
        have_aggregates: bool = False
        # Dimension key columns form at least most of our GROUP BY or DISTINCT
        # ON clause.
        unique_keys: list[sqlalchemy.ColumnElement[Any]] = [
            builder.joiner.dimension_keys[k][0] for k in plan.columns.dimensions.data_coordinate_keys
        ]
        # There are two reasons we might need an aggregate function:
        # - to make sure temporal constraints and joins have resulted in at
        #   most one validity range match for each data ID and collection,
        #   when we're doing a find-first query.
        # - to compute the unions of regions we need for postprocessing, when
        #   the data IDs for those regions are not wholly included in the
        #   results (i.e. we need to postprocess on
        #   visit_detector_region.region, but the output rows don't have
        #   detector, just visit - so we compute the union of the
        #   visit_detector region over all matched detectors).
        if builder.postprocessing.check_validity_match_count:
            builder.joiner.special[builder.postprocessing.VALIDITY_MATCH_COUNT] = (
                sqlalchemy.func.count().label(builder.postprocessing.VALIDITY_MATCH_COUNT)
            )
            have_aggregates = True
        for element in plan.region_aggregates:
            builder.joiner.fields[element.name]["region"] = ddl.Base64Region.union_aggregate(
                builder.joiner.fields[element.name]["region"]
            )
            have_aggregates = True
        # Many of our fields derive their uniqueness from the unique_key
        # fields: if rows are unique over the 'unique_key' fields, then they're
        # automatically unique over these 'derived_fields'. We just remember
        # these as pairs of (logical_table, field) for now.
        derived_fields: list[tuple[str, str]] = []
        # All dimension record fields are derived fields.
        for element_name, fields_for_element in plan.columns.dimension_fields.items():
            for element_field in fields_for_element:
                derived_fields.append((element_name, element_field))
        # Some dataset fields are derived fields and some are unique keys, and
        # it depends on the kinds of collection(s) we're searching and whether
        # it's a find-first query.
        for dataset_type, fields_for_dataset in plan.columns.dataset_fields.items():
            for dataset_field in fields_for_dataset:
                if dataset_field == "collection_key":
                    # If the collection_key field is present, it's needed for
                    # uniqueness if we're looking in more than one collection.
                    # If not, it's a derived field.
                    if len(plan.datasets[dataset_type].collection_records) > 1:
                        unique_keys.append(builder.joiner.fields[dataset_type]["collection_key"])
                    else:
                        derived_fields.append((dataset_type, "collection_key"))
                elif dataset_field == "timespan" and plan.datasets[dataset_type].is_calibration_search:
                    # If we're doing a non-find-first query against a
                    # CALIBRATION collection, the timespan is also a unique
                    # key...
                    if dataset_type == plan.find_first_dataset:
                        # ...unless we're doing a find-first search on this
                        # dataset, in which case we need to use ANY_VALUE on
                        # the timespan and check that _VALIDITY_MATCH_COUNT
                        # (added earlier) is one, indicating that there was
                        # indeed only one timespan for each data ID in each
                        # collection that survived the base query's WHERE
                        # clauses and JOINs.
                        if not self.db.has_any_aggregate:
                            raise NotImplementedError(
                                f"Cannot generate query that returns {dataset_type}.timespan after a "
                                "find-first search, because this database does not support the ANY_VALUE "
                                "aggregate function (or equivalent)."
                            )
                        builder.joiner.timespans[dataset_type] = builder.joiner.timespans[
                            dataset_type
                        ].apply_any_aggregate(self.db.apply_any_aggregate)
                    else:
                        unique_keys.extend(builder.joiner.timespans[dataset_type].flatten())
                else:
                    # Other dataset fields derive their uniqueness from key
                    # fields.
                    derived_fields.append((dataset_type, dataset_field))
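        # At this point the projection will be rendered as (roughly) one of
        #
        #     SELECT DISTINCT ...                  -- no aggregates, no derived fields
        #     SELECT DISTINCT ON (unique_keys) ... -- no aggregates, backend support
        #     SELECT ... GROUP BY unique_keys      -- otherwise
        #
        # as decided below.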

        if not have_aggregates and not derived_fields:
            # SELECT DISTINCT is sufficient.
            builder.distinct = True
        elif not have_aggregates and self.db.has_distinct_on:
            # SELECT DISTINCT ON is sufficient and supported by this database.
            builder.distinct = unique_keys
        else:
            # GROUP BY is the only option.
            if derived_fields:
                if self.db.has_any_aggregate:
                    for logical_table, field in derived_fields:
                        if field == "timespan":
                            builder.joiner.timespans[logical_table] = builder.joiner.timespans[
                                logical_table
                            ].apply_any_aggregate(self.db.apply_any_aggregate)
                        else:
                            builder.joiner.fields[logical_table][field] = self.db.apply_any_aggregate(
                                builder.joiner.fields[logical_table][field]
                            )
                else:
                    _LOG.warning(
                        "Adding %d fields to GROUP BY because this database backend does not support the "
                        "ANY_VALUE aggregate function (or equivalent). This may result in a poor query "
                        "plan. Materializing the query first sometimes avoids this problem.",
                        len(derived_fields),
                    )
                    for logical_table, field in derived_fields:
                        if field == "timespan":
                            unique_keys.extend(builder.joiner.timespans[logical_table].flatten())
                        else:
                            unique_keys.append(builder.joiner.fields[logical_table][field])
            builder.group_by = unique_keys

    def apply_query_find_first(self, plan: QueryFindFirstPlan | None, builder: QueryBuilder) -> QueryBuilder:
        """Modify an under-construction SQL query to return only one row for
        each data ID, searching collections in order.

        Parameters
        ----------
        plan : `QueryFindFirstPlan` or `None`
            Component of a `QueryPlan` relevant for the "find first" stage.
        builder : `QueryBuilder`
            Builder object as produced by `apply_query_projection`. This
            object should be considered to be consumed by this method - the
            same instance may or may not be returned, and if it is not
            returned, its state is not defined.

        Returns
        -------
        builder : `QueryBuilder`
            Modified query builder that includes the find-first resolution, if
            one was needed.
        """
        if not plan:
            return builder
        # The query we're building looks like this:
        #
        # WITH {dst}_base AS (
        #     {target}
        #     ...
        # )
        # SELECT
        #     {dst}_window.*,
        # FROM (
        #     SELECT
        #         {dst}_base.*,
        #         ROW_NUMBER() OVER (
        #             PARTITION BY {dst_base}.{dimensions}
        #             ORDER BY {rank}
        #         ) AS rownum
        #     ) {dst}_window
        # WHERE
        #     {dst}_window.rownum = 1;
        #
        # The outermost SELECT will be represented by the QueryBuilder we
        # return. The QueryBuilder we're given corresponds to the Common Table
        # Expression (CTE) at the top.
        #
        # For SQLite only, we could use a much simpler GROUP BY instead,
        # because it extends the standard to do exactly what we want when MIN
        # or MAX appears once and a column does not have an aggregate function
        # (https://www.sqlite.org/quirks.html). But since that doesn't work
        # with PostgreSQL it doesn't help us.
        #
        builder = builder.nested(cte=True, force=True)
        # We start by filling out the "window" SELECT statement...
        partition_by = [builder.joiner.dimension_keys[d][0] for d in builder.columns.dimensions.required]
        rank_sql_column = sqlalchemy.case(
            {record.key: n for n, record in enumerate(plan.search.collection_records)},
            value=builder.joiner.fields[plan.dataset_type]["collection_key"],
        )
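        # The rank is the position of the matched dataset's collection in the
        # ordered search path, i.e. roughly
        # CASE collection_key WHEN <first collection key> THEN 0
        #                     WHEN <second collection key> THEN 1 ... END,
        # so ordering by it prefers earlier collections.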

        if partition_by:
            builder.joiner.special["_ROWNUM"] = sqlalchemy.sql.func.row_number().over(
                partition_by=partition_by, order_by=rank_sql_column
            )
        else:
            builder.joiner.special["_ROWNUM"] = sqlalchemy.sql.func.row_number().over(
                order_by=rank_sql_column
            )
        # ... and then turn that into a subquery with a constraint on rownum.
        builder = builder.nested(force=True)
        # We can now add the WHERE constraint on rownum into the outer query.
        builder.joiner.where(builder.joiner.special["_ROWNUM"] == 1)
        # Don't propagate _ROWNUM into downstream queries.
        del builder.joiner.special["_ROWNUM"]
        return builder

    def _analyze_query_tree(self, tree: qt.QueryTree) -> tuple[QueryJoinsPlan, QueryBuilder]:
        """Start constructing a plan for building a query from a
        `.queries.tree.QueryTree`.

        Parameters
        ----------
        tree : `.queries.tree.QueryTree`
            Description of the joins and row filters in the query.

        Returns
        -------
        plan : `QueryJoinsPlan`
            Initial component of the plan relevant for the "joins" stage,
            including all joins and columns needed by ``tree``. Additional
            columns will be added to this plan later.
        builder : `QueryBuilder`
            Builder object initialized with overlap joins and constraints
            potentially included, with the remainder still present in
            `QueryJoinsPlan.predicate`.
        """
        # Delegate to the dimensions manager to rewrite the predicate and start
        # a QueryBuilder to cover any spatial overlap joins or constraints.
        # We'll return that QueryBuilder at the end.
        (
            predicate,
            builder,
        ) = self.managers.dimensions.process_query_overlaps(
            tree.dimensions,
            tree.predicate,
            tree.get_joined_dimension_groups(),
        )
        result = QueryJoinsPlan(predicate=predicate, columns=builder.columns)
        # Add columns required by postprocessing.
        builder.postprocessing.gather_columns_required(result.columns)
        # We also check that the predicate doesn't reference any dimensions
        # without constraining their governor dimensions, since that's a
        # particularly easy mistake to make and it's almost never intentional.
        # We also allow the registry data ID values to provide governor values.
        where_columns = qt.ColumnSet(self.universe.empty.as_group())
        result.predicate.gather_required_columns(where_columns)
        for governor in where_columns.dimensions.governors:
            if governor not in result.constraint_data_id:
                if governor in self._defaults.dataId.dimensions:
                    result.constraint_data_id[governor] = self._defaults.dataId[governor]
                else:
                    raise qt.InvalidQueryError(
                        f"Query 'where' expression references a dimension dependent on {governor} without "
                        "constraining it directly."
                    )
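        # For example, a 'where' expression that constrains detector but not
        # instrument (and has no default instrument data ID to fall back on)
        # is rejected here, because detector values are only meaningful within
        # a particular instrument.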

        # Add materializations, which can also bring in more postprocessing.
        for m_key, m_dimensions in tree.materializations.items():
            m_state = self._materializations[m_key]
            result.materializations[m_key] = m_dimensions
            # When a query is materialized, the new tree has an empty
            # (trivially true) predicate because the original was used to make
            # the materialized rows. But the original postprocessing isn't
            # executed when the materialization happens, so we have to include
            # it here.
            builder.postprocessing.spatial_join_filtering.extend(
                m_state.postprocessing.spatial_join_filtering
            )
            builder.postprocessing.spatial_where_filtering.extend(
                m_state.postprocessing.spatial_where_filtering
            )
        # Add data coordinate uploads.
        result.data_coordinate_uploads.update(tree.data_coordinate_uploads)
        # Add dataset_searches and filter out collections that don't have the
        # right dataset type or governor dimensions.
        for dataset_type_name, dataset_search in tree.datasets.items():
            resolved_dataset_search = self._resolve_dataset_search(
                dataset_type_name, dataset_search, result.constraint_data_id
            )
            result.datasets[dataset_type_name] = resolved_dataset_search
            if not resolved_dataset_search.collection_records:
                result.messages.append(f"Search for dataset type {dataset_type_name!r} is doomed to fail.")
                result.messages.extend(resolved_dataset_search.messages)
        return result, builder

    def _resolve_dataset_search(
        self,
        dataset_type_name: str,
        dataset_search: qt.DatasetSearch,
        constraint_data_id: Mapping[str, DataIdValue],
    ) -> ResolvedDatasetSearch:
        """Resolve the collections that should actually be searched for
        datasets of a particular type.

        Parameters
        ----------
        dataset_type_name : `str`
            Name of the dataset being searched for.
        dataset_search : `.queries.tree.DatasetSearch`
            Struct holding the dimensions and original collection search path.
        constraint_data_id : `~collections.abc.Mapping`
            Data ID mapping derived from the query predicate that may be used
            to filter out some collections based on their governor dimensions.

        Returns
        -------
        resolved : `ResolvedDatasetSearch`
            Struct that extends ``dataset_search`` with the dataset type name
            and resolved collection records.
        """
        result = ResolvedDatasetSearch(dataset_type_name, dataset_search.dimensions)
        for collection_record, collection_summary in self._resolve_collection_path(
            dataset_search.collections
        ):
            rejected: bool = False
            if result.name not in collection_summary.dataset_types.names:
                result.messages.append(
                    f"No datasets of type {result.name!r} in collection {collection_record.name!r}."
                )
                rejected = True
            for governor in constraint_data_id.keys() & collection_summary.governors.keys():
                if constraint_data_id[governor] not in collection_summary.governors[governor]:
                    result.messages.append(
                        f"No datasets with {governor}={constraint_data_id[governor]!r} "
                        f"in collection {collection_record.name!r}."
                    )
                    rejected = True
            if not rejected:
                if collection_record.type is CollectionType.CALIBRATION:
                    result.is_calibration_search = True
                result.collection_records.append(collection_record)
        if result.dimensions != self.get_dataset_type(dataset_type_name).dimensions.as_group():
            # This is really for server-side defensiveness; it's hard to
            # imagine the query getting different dimensions for a dataset
            # type in two calls to the same query driver.
            raise qt.InvalidQueryError(
                f"Incorrect dimensions {result.dimensions} for dataset {dataset_type_name} "
                f"in query (vs. {self.get_dataset_type(dataset_type_name).dimensions.as_group()})."
            )
        return result

    def _resolve_collection_path(
        self, collections: Iterable[str]
    ) -> list[tuple[CollectionRecord, CollectionSummary]]:
        """Expand an ordered iterable of collection names into a list of
        collection records and summaries.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `str` ]
            Ordered iterable of collections.

        Returns
        -------
        resolved : `list` [ `tuple` [ `.registry.interfaces.CollectionRecord`,\
                `.registry.CollectionSummary` ] ]
            Tuples of collection record and summary. `~CollectionType.CHAINED`
            collections are flattened out and not included.
        """
        result: list[tuple[CollectionRecord, CollectionSummary]] = []
        done: set[str] = set()

        # Eventually we really want this recursive Python code to be replaced
        # by a recursive SQL query, especially if we extend this method to
        # support collection glob patterns to support public APIs we don't yet
        # have in the new query system (but will need to add).

        def recurse(collection_names: Iterable[str]) -> None:
            for collection_name in collection_names:
                if collection_name not in done:
                    done.add(collection_name)
                    record = self.managers.collections.find(collection_name)

                    if record.type is CollectionType.CHAINED:
                        recurse(cast(ChainedCollectionRecord, record).children)
                    else:
                        result.append((record, self.managers.datasets.getCollectionSummary(record)))
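        # For example (with made-up collection names): if "defaults" is a
        # CHAINED collection whose children are ("runA", "calibs"), then
        # recurse(["defaults"]) appends the records for "runA" and "calibs" in
        # that order and never emits "defaults" itself.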

        recurse(collections)

        return result

    def _join_materialization(
        self,
        joiner: QueryJoiner,
        key: qt.MaterializationKey,
        dimensions: DimensionGroup,
    ) -> frozenset[str]:
        """Join a materialization into an under-construction query.

        Parameters
        ----------
        joiner : `QueryJoiner`
            Component of a `QueryBuilder` that holds the FROM and WHERE
            clauses. This will be modified in-place on return.
        key : `.queries.tree.MaterializationKey`
            Unique identifier created for this materialization when it was
            created.
        dimensions : `DimensionGroup`
            Dimensions of the materialization.

        Returns
        -------
        datasets : `frozenset` [ `str` ]
            Dataset types that were included as constraints when this
            materialization was created.
        """
        columns = qt.ColumnSet(dimensions)
        m_state = self._materializations[key]
        joiner.join(QueryJoiner(self.db, m_state.table).extract_columns(columns, m_state.postprocessing))
        return m_state.datasets

    def _join_dataset_search(
        self,
        joiner: QueryJoiner,
        resolved_search: ResolvedDatasetSearch,
        fields: Set[str],
    ) -> None:
        """Join a dataset search into an under-construction query.

        Parameters
        ----------
        joiner : `QueryJoiner`
            Component of a `QueryBuilder` that holds the FROM and WHERE
            clauses. This will be modified in-place on return.
        resolved_search : `ResolvedDatasetSearch`
            Struct that describes the dataset type and collections.
        fields : `~collections.abc.Set` [ `str` ]
            Dataset fields to include.
        """
        storage = self.managers.datasets[resolved_search.name]
        # The next two asserts will need to be dropped (and the implications
        # dealt with instead) if materializations start having dataset fields.
        assert (
            resolved_search.name not in joiner.fields
        ), "Dataset fields have unexpectedly already been joined in."
        assert (
            resolved_search.name not in joiner.timespans
        ), "Dataset timespan has unexpectedly already been joined in."
        joiner.join(storage.make_query_joiner(resolved_search.collection_records, fields))


@dataclasses.dataclass
class _MaterializationState:
    table: sqlalchemy.Table
    datasets: frozenset[str]
    postprocessing: Postprocessing


class _Cursor:
    """A helper class for managing paged query results and cursor lifetimes.

    This class holds a context manager for the SQLAlchemy cursor object but is
    not itself a context manager. It always cleans up (i.e. calls its `close`
    method) when it raises an exception or exhausts the cursor, but external
    code is responsible for calling `close` when the cursor is abandoned before
    it is exhausted, including when that happens due to an external exception.

    Parameters
    ----------
    db : `.registry.interfaces.Database`
        Database to run the query against.
    sql : `sqlalchemy.Executable`
        SQL query to execute.
    result_spec : `ResultSpec`
        Specification of the result type.
    name_shrinker : `NameShrinker` or `None`
        Object that was used to shrink dataset column names to fit within the
        database identifier limit.
    postprocessing : `Postprocessing`
        Post-query filtering and checks to perform.
    raw_page_size : `int`
        Maximum number of SQL result rows to return in each page, before
        postprocessing.
    """

    def __init__(
        self,
        db: Database,
        sql: sqlalchemy.Executable,
        result_spec: ResultSpec,
        name_shrinker: NameShrinker | None,
        postprocessing: Postprocessing,
        raw_page_size: int,
    ):
        self._result_spec = result_spec
        self._name_shrinker = name_shrinker
        self._raw_page_size = raw_page_size
        self._postprocessing = postprocessing
        self._timespan_repr_cls = db.getTimespanRepresentation()
        self._context = db.query(sql, execution_options=dict(yield_per=raw_page_size))
        cursor = self._context.__enter__()
        try:
            self._iterator = cursor.partitions()
        except:  # noqa: E722
            self._context.__exit__(*sys.exc_info())
            raise

    def close(self, exc_type: Any = None, exc_value: Any = None, traceback: Any = None) -> None:
        """Close this cursor.

        Parameters
        ----------
        exc_type : `type`
            Exception type as obtained from `sys.exc_info`, or `None` if there
            was no error.
        exc_value : `BaseException` or `None`
            Exception instance as obtained from `sys.exc_info`, or `None` if
            there was no error.
        traceback : `object`
            Traceback as obtained from `sys.exc_info`, or `None` if there was
            no error.
        """
        self._context.__exit__(exc_type, exc_value, traceback)

    def next(self) -> ResultPage:
        """Return the next result page from this query.

        When there are no more results after this result page, the `next_key`
        attribute of the returned object is `None` and the cursor will be
        closed. The cursor is also closed if this method raises an exception.
        """
        try:
            raw_page = next(self._iterator, tuple())
            if len(raw_page) == self._raw_page_size:
                # There's some chance we got unlucky and this page exactly
                # finishes off the query, and we won't know the next page does
                # not exist until we try to fetch it. But that's better than
                # always fetching the next page up front.
                next_key = uuid.uuid4()
            else:
                next_key = None
                self.close()

            postprocessed_rows = self._postprocessing.apply(raw_page)
            match self._result_spec:
                case DimensionRecordResultSpec():
                    return self._convert_dimension_record_results(postprocessed_rows, next_key)
                case _:
                    raise NotImplementedError("TODO")
        except:  # noqa: E722
            self._context.__exit__(*sys.exc_info())
            raise

    def _convert_dimension_record_results(
        self,
        raw_rows: Iterable[sqlalchemy.Row],
        next_key: PageKey | None,
    ) -> DimensionRecordResultPage:
        """Convert a raw SQL result iterable into a page of `DimensionRecord`
        query results.

        Parameters
        ----------
        raw_rows : `~collections.abc.Iterable` [ `sqlalchemy.Row` ]
            Iterable of SQLAlchemy rows, with `Postprocessing` filters already
            applied.
        next_key : `PageKey` or `None`
            Key for the next page to add into the returned page object.

        Returns
        -------
        result_page : `DimensionRecordResultPage`
            Page object that holds a `DimensionRecord` container.
        """
        result_spec = cast(DimensionRecordResultSpec, self._result_spec)
        record_set = DimensionRecordSet(result_spec.element)
        record_cls = result_spec.element.RecordClass
        if isinstance(result_spec.element, SkyPixDimension):
            pixelization = result_spec.element.pixelization
            id_qualified_name = qt.ColumnSet.get_qualified_name(result_spec.element.name, None)
            for raw_row in raw_rows:
                pixel_id = raw_row._mapping[id_qualified_name]
                record_set.add(record_cls(id=pixel_id, region=pixelization.pixel(pixel_id)))
        else:
            # Mapping from DimensionRecord attribute name to qualified column
            # name, but as a list of tuples since we'd just iterate over items
            # anyway.
            column_map = list(
                zip(
                    result_spec.element.schema.dimensions.names,
                    result_spec.element.dimensions.names,
                )
            )
            for field in result_spec.element.schema.remainder.names:
                if field != "timespan":
                    column_map.append(
                        (field, qt.ColumnSet.get_qualified_name(result_spec.element.name, field))
                    )
            if result_spec.element.temporal:
                timespan_qualified_name = qt.ColumnSet.get_qualified_name(
                    result_spec.element.name, "timespan"
                )
            else:
                timespan_qualified_name = None
            for raw_row in raw_rows:
                m = raw_row._mapping
                d = {k: m[v] for k, v in column_map}
                if timespan_qualified_name is not None:
                    d["timespan"] = self._timespan_repr_cls.extract(m, name=timespan_qualified_name)
                record_set.add(record_cls(**d))
        return DimensionRecordResultPage(spec=result_spec, next_key=next_key, rows=record_set)