Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 11% (187 statements)
coverage.py v6.4.1, created at 2022-07-03 01:08 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import DatasetType, Dimension, DimensionElement, SimpleQuery, SkyPixDimension
from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet
from .._collectionType import CollectionType
from .._exceptions import DataIdValueError
from ..interfaces import CollectionRecord, DatasetRecordStorage, GovernorDimensionRecordStorage
from ..wildcards import CollectionQuery, CollectionSearch
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, OrderByColumn, Query
from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
from .expressions import convertExpressionToSql


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
        self.summary = summary
        self._simpleQuery = SimpleQuery()
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        self._columns = QueryColumns()
        self._managers = managers
        self._doomed_by = list(doomed_by)

        self._validateGovernors()
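
    # A minimal usage sketch (illustrative only; ``summary``, ``managers``,
    # ``datasetType``, and the collection name are assumed to come from the
    # registry machinery and are not defined in this module):
    #
    #     builder = QueryBuilder(summary, managers)
    #     builder.joinDataset(datasetType, collections=["some/run"], findFirst=True)
    #     query = builder.finish()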

    def _validateGovernors(self) -> None:
        """Check that governor dimensions specified by the query actually
        exist.

        This helps to avoid mistakes in governor values.  It also implements
        consistent failure behavior for cases when governor dimensions are
        specified in either the DataId or the WHERE clause.

        Raises
        ------
        LookupError
            Raised when governor dimension values are not found.
        """
        for governor, values in self.summary.where.restriction.items():
            storage = self._managers.dimensions[governor]
            assert isinstance(
                storage, GovernorDimensionRecordStorage
            ), f"Unexpected type of the governor dimension record storage {type(storage)}"
            if not values <= storage.values:
                unknown = values - storage.values
                raise DataIdValueError(
                    f"Unknown values specified for governor dimension {governor}: {unknown}"
                )
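
    # Worked example (hypothetical values): if the registry knows governor
    # values {"HSC", "DECam"} and the WHERE clause restricts to
    # {"HSC", "LSSTCam"}, then:
    #
    #     values <= storage.values   # False, so we fail
    #     values - storage.values    # {"LSSTCam"}, reported in the error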

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish`
        will automatically call it if the `DimensionElement` has been
        identified as one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one
            time on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due
            to the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            self._doomed_by.append(
                f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
                "any collection."
            )
            return False
        collectionRecords: List[CollectionRecord] = []
        rejections: List[str] = []
        for collectionRecord in collections.iter(self._managers.collections, collectionTypes=collectionTypes):
            # Only include collections that (according to collection
            # summaries) might have datasets of this type and governor
            # dimensions consistent with the query's WHERE clause.
            collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
            if not collection_summary.is_compatible_with(
                datasetType,
                self.summary.where.restriction,
                rejections=rejections,
                name=collectionRecord.name,
            ):
                continue
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If the collection name was provided explicitly, raise,
                # because this is a kind of query we don't support yet;
                # otherwise the collection is part of a chained collection or
                # a regex match, and we skip it so as not to break queries of
                # the other included collections.
                if datasetType.isCalibration():
                    if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
                        if collectionRecord.name in explicitCollections:
                            raise NotImplementedError(
                                f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
                                f"collection '{collectionRecord.name}' is not yet supported."
                            )
                        else:
                            rejections.append(
                                f"Not searching for dataset {datasetType.name!r} in CALIBRATION collection "
                                f"{collectionRecord.name!r} because temporal calibration queries aren't "
                                "implemented; this is not an error only because the query structure implies "
                                "that searching this collection may be incidental."
                            )
                            continue
                    elif findFirst:
                        if collectionRecord.name in explicitCollections:
                            raise NotImplementedError(
                                f"Find-first query for dataset type '{datasetType.name}' in "
                                f"CALIBRATION-type collection '{collectionRecord.name}' is not yet "
                                "supported."
                            )
                        else:
                            rejections.append(
                                f"Not searching for dataset {datasetType.name!r} in CALIBRATION collection "
                                f"{collectionRecord.name!r} because find-first calibration queries aren't "
                                "implemented; this is not an error only because the query structure implies "
                                "that searching this collection may be incidental."
                            )
                            continue
                    else:
                        collectionRecords.append(collectionRecord)
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    rejections.append(
                        f"Not searching for non-calibration dataset {datasetType.name!r} "
                        f"in CALIBRATION collection {collectionRecord.name!r}."
                    )
                    continue
            else:
                collectionRecords.append(collectionRecord)
        if isResult:
            if findFirst:
                subquery = self._build_dataset_search_subquery(
                    datasetRecordStorage,
                    collectionRecords,
                )
            else:
                subquery = self._build_dataset_query_subquery(
                    datasetRecordStorage,
                    collectionRecords,
                )
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[self._managers.collections.getRunForeignKeyName()],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = self._build_dataset_constraint_subquery(datasetRecordStorage, collectionRecords)
            columns = None
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        if not collectionRecords:
            if rejections:
                self._doomed_by.extend(rejections)
            else:
                self._doomed_by.append(f"No collections to search matching expression {collections}.")
            return False
        return not self._doomed_by
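
    # Example call patterns (dataset types and collection names are
    # hypothetical):
    #
    #     # Return matching datasets, keeping only the first match per data
    #     # ID along the ordered collection search path:
    #     builder.joinDataset(biasType, ["calib/run2", "calib/run1"], findFirst=True)
    #
    #     # Use dataset existence only to constrain the returned data IDs:
    #     builder.joinDataset(rawType, ..., isResult=False)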

    def _build_dataset_constraint_subquery(
        self, storage: DatasetRecordStorage, collections: List[CollectionRecord]
    ) -> sqlalchemy.sql.FromClause:
        """Internal helper method to build a dataset subquery for a parent
        query that does not return dataset results.

        Parameters
        ----------
        storage : `DatasetRecordStorage`
            Storage object for the dataset type the subquery is for.
        collections : `list` [ `CollectionRecord` ]
            Records for the collections to be searched.  Collections with no
            datasets of this type or with governor dimensions incompatible
            with the rest of the query should already have been filtered out.
            `~CollectionType.CALIBRATION` collections should also be filtered
            out if this is a temporal query.

        Returns
        -------
        sql : `sqlalchemy.sql.FromClause`
            A SQLAlchemy aliased subquery object.  Has columns for each
            dataset type dimension, or an unspecified column (just to prevent
            SQL syntax errors) where there is no data ID.
        """
        return storage.select(
            *collections,
            dataId=SimpleQuery.Select,
            # If this dataset type has no dimensions, we're in danger of
            # generating an invalid subquery that has no columns in the
            # SELECT clause.  An easy fix is to just select some arbitrary
            # column that goes unused, like the dataset ID.
            id=None if storage.datasetType.dimensions else SimpleQuery.Select,
            run=None,
            ingestDate=None,
            timespan=None,
        ).alias(storage.datasetType.name)

    def _build_dataset_query_subquery(
        self, storage: DatasetRecordStorage, collections: List[CollectionRecord]
    ) -> sqlalchemy.sql.FromClause:
        """Internal helper method to build a dataset subquery for a parent
        query that returns all matching dataset results.

        Parameters
        ----------
        storage : `DatasetRecordStorage`
            Storage object for the dataset type the subquery is for.
        collections : `list` [ `CollectionRecord` ]
            Records for the collections to be searched.  Collections with no
            datasets of this type or with governor dimensions incompatible
            with the rest of the query should already have been filtered out.
            `~CollectionType.CALIBRATION` collections should also be filtered
            out if this is a temporal query.

        Returns
        -------
        sql : `sqlalchemy.sql.FromClause`
            A SQLAlchemy aliased subquery object.  Has columns for each
            dataset type dimension, the dataset ID, the
            `~CollectionType.RUN` collection key, and the ingest date.
        """
        sql = storage.select(
            *collections,
            dataId=SimpleQuery.Select,
            id=SimpleQuery.Select,
            run=SimpleQuery.Select,
            ingestDate=SimpleQuery.Select,
            timespan=None,
        ).alias(storage.datasetType.name)
        return sql

    def _build_dataset_search_subquery(
        self, storage: DatasetRecordStorage, collections: List[CollectionRecord]
    ) -> sqlalchemy.sql.FromClause:
        """Internal helper method to build a dataset subquery for a parent
        query that returns the first matching dataset for each data ID and
        dataset type name from an ordered list of collections.

        Parameters
        ----------
        storage : `DatasetRecordStorage`
            Storage object for the dataset type the subquery is for.
        collections : `list` [ `CollectionRecord` ]
            Records for the collections to be searched.  Collections with no
            datasets of this type or with governor dimensions incompatible
            with the rest of the query should already have been filtered out.
            `~CollectionType.CALIBRATION` collections should be filtered out
            as well.

        Returns
        -------
        sql : `sqlalchemy.sql.FromClause`
            A SQLAlchemy aliased subquery object.  Has columns for each
            dataset type dimension, the dataset ID, the
            `~CollectionType.RUN` collection key, and the ingest date.
        """
        # Query-simplification shortcut: if there is only one collection, a
        # find-first search is just a regular result subquery.  The same is
        # true if this is a doomed query with no collections to search.
        if len(collections) <= 1:
            return self._build_dataset_query_subquery(storage, collections)
        # In the more general case, we build a subquery of the form below to
        # search the collections in order.
        #
        # WITH {dst}_search AS (
        #     SELECT {data-id-cols}, id, run_id, 1 AS rank
        #         FROM <collection1>
        #     UNION ALL
        #     SELECT {data-id-cols}, id, run_id, 2 AS rank
        #         FROM <collection2>
        #     UNION ALL
        #     ...
        # )
        # SELECT
        #     {dst}_window.{data-id-cols},
        #     {dst}_window.id,
        #     {dst}_window.run_id
        # FROM (
        #     SELECT
        #         {dst}_search.{data-id-cols},
        #         {dst}_search.id,
        #         {dst}_search.run_id,
        #         ROW_NUMBER() OVER (
        #             PARTITION BY {dst}_search.{data-id-cols}
        #             ORDER BY rank
        #         ) AS rownum
        #     FROM {dst}_search
        # ) {dst}_window
        # WHERE
        #     {dst}_window.rownum = 1;
        #
        # We'll start with the Common Table Expression (CTE) at the top.
        subqueries = []
        for rank, collection_record in enumerate(collections):
            ssq = storage.select(
                collection_record,
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select,
                run=SimpleQuery.Select,
                ingestDate=SimpleQuery.Select,
                timespan=None,
            )
            subqueries.append(ssq.add_columns(sqlalchemy.sql.literal(rank).label("rank")))
        # Although one would expect that these subqueries could be combined
        # with UNION ALL instead of UNION (each subquery is already
        # distinct), it turns out that with many subqueries UNION ALL causes
        # catastrophic performance problems on both SQLite and PostgreSQL.
        # UNION may require more table scans, but it yields a much simpler
        # query plan given our table structures.  See DM-31429.
        search = sqlalchemy.sql.union(*subqueries).cte(f"{storage.datasetType.name}_search")
        # Now we fill out the SELECT from the CTE, and the subquery it
        # contains (at the same time, since they have the same columns, aside
        # from the OVER clause).
        run_key_name = self._managers.collections.getRunForeignKeyName()
        window_data_id_cols = [
            search.columns[name].label(name) for name in storage.datasetType.dimensions.required.names
        ]
        window_select_cols = [
            search.columns["id"].label("id"),
            search.columns[run_key_name].label(run_key_name),
            search.columns["ingest_date"].label("ingest_date"),
        ]
        window_select_cols += window_data_id_cols
        window_select_cols.append(
            sqlalchemy.sql.func.row_number()
            .over(partition_by=window_data_id_cols, order_by=search.columns["rank"])
            .label("rownum")
        )
        window = (
            sqlalchemy.sql.select(*window_select_cols)
            .select_from(search)
            .alias(f"{storage.datasetType.name}_window")
        )
        sql = (
            sqlalchemy.sql.select(*[window.columns[col.name].label(col.name) for col in window_select_cols])
            .select_from(window)
            .where(window.columns["rownum"] == 1)
            .alias(storage.datasetType.name)
        )
        return sql
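
    # A standalone sketch of the same window-function pattern in plain
    # SQLAlchemy (1.4-style API; the table and column names are illustrative,
    # not part of the registry schema):
    #
    #     import sqlalchemy
    #
    #     metadata = sqlalchemy.MetaData()
    #     dataset = sqlalchemy.Table(
    #         "dataset",
    #         metadata,
    #         sqlalchemy.Column("data_id", sqlalchemy.Integer),
    #         sqlalchemy.Column("run", sqlalchemy.String),
    #         sqlalchemy.Column("rank", sqlalchemy.Integer),
    #     )
    #     rownum = (
    #         sqlalchemy.func.row_number()
    #         .over(partition_by=dataset.c.data_id, order_by=dataset.c.rank)
    #         .label("rownum")
    #     )
    #     window = sqlalchemy.select(dataset.c.data_id, dataset.c.run, rownum).subquery()
    #     first_per_data_id = sqlalchemy.select(window.c.data_id, window.c.run).where(
    #         window.c.rownum == 1
    #     )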

    def joinTable(
        self,
        table: sqlalchemy.sql.FromClause,
        dimensions: NamedValueAbstractSet[Dimension],
        *,
        datasets: Optional[DatasetQueryColumns] = None,
    ) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables
        whose records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query
            results.
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert (
                self._columns.datasets is None
            ), "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets
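
    # For example, a hypothetical table of per-visit weather records keyed by
    # the instrument and visit dimension columns could be joined in with:
    #
    #     builder.joinTable(weather_table, visit_dimensions)
    #
    # where ``visit_dimensions`` is a set of dimensions whose names match
    # columns of ``weather_table``.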

    def startJoin(
        self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
    ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by a call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values;
            must be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(
        self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
    ) -> None:
        """Complete a join on dimensions.

        Must be preceded by a call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same
            object passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should
            include at least the elements of the list returned by
            `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)
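
    # ``startJoin`` and ``finishJoin`` form a two-step protocol so that
    # callers can append custom ON terms between them (sketch; ``table`` and
    # its extra column are illustrative):
    #
    #     joinOn = builder.startJoin(table, dimensions, columnNames)
    #     joinOn.append(table.columns["is_good"].is_(True))
    #     builder.finishJoin(table, joinOn)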

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as
        necessary by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize
        # the number of tables joined.  For example, the "visit" table
        # provides the primary key value for the "instrument" table it
        # depends on, so we don't need to join "instrument" as well unless we
        # had a nontrivial expression on it (and hence included it already
        # above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have
        # their primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a
                # chance of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression
        # for each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # The timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(
                self.summary.requested.universe, managers=self._managers, doomed_by=self._doomed_by
            )
        return DirectQuery(
            graph=self.summary.requested,
            uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
            whereRegion=self.summary.where.dataId.region,
            simpleQuery=self._simpleQuery,
            columns=self._columns,
            order_by_columns=self._order_by_columns(),
            limit=self.summary.limit,
            managers=self._managers,
            doomed_by=self._doomed_by,
        )

    def _order_by_columns(self) -> Iterable[OrderByColumn]:
        """Generate columns to be used for the ORDER BY clause.

        Returns
        -------
        order_by_columns : `Iterable` [ `OrderByColumn` ]
            Sequence of columns to appear in the ORDER BY clause.
        """
        order_by_columns: List[OrderByColumn] = []
        if not self.summary.order_by:
            return order_by_columns

        for order_by_column in self.summary.order_by.order_by_columns:

            column: sqlalchemy.sql.ColumnElement
            if order_by_column.column is None:
                # A dimension name; its key column has to be in the SELECT
                # list already, so only add it to ORDER BY.
                assert isinstance(order_by_column.element, Dimension), "expecting full Dimension"
                column = self._columns.getKeyColumn(order_by_column.element)
            else:
                table = self._elements[order_by_column.element]

                if order_by_column.column in ("timespan.begin", "timespan.end"):
                    TimespanReprClass = self._managers.TimespanReprClass
                    timespan_repr = TimespanReprClass.fromSelectable(table)
                    if order_by_column.column == "timespan.begin":
                        column = timespan_repr.lower()
                        label = f"{order_by_column.element.name}_timespan_begin"
                    else:
                        column = timespan_repr.upper()
                        label = f"{order_by_column.element.name}_timespan_end"
                else:
                    column = table.columns[order_by_column.column]
                    # Make a unique label for it.
                    label = f"{order_by_column.element.name}_{order_by_column.column}"

                column = column.label(label)

            order_by_columns.append(OrderByColumn(column=column, ordering=order_by_column.ordering))

        return order_by_columns
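
    # For example (hypothetical ORDER BY specification): ordering on
    # "visit.timespan.begin" yields a column labeled "visit_timespan_begin"
    # built from the timespan representation, while ordering on a plain
    # dimension such as "detector" reuses the key column already present in
    # the SELECT list.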