# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import (
    DimensionElement,
    SkyPixDimension,
    Dimension,
    DatasetType,
    SimpleQuery,
)

from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet

from .._collectionType import CollectionType
from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
from .expressions import convertExpressionToSql
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query
from ..wildcards import CollectionSearch, CollectionQuery


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the
        query system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions)
        that explain why the query is known to return no results even
        before it is executed. Queries with a non-empty list will never be
        executed.
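
    Examples
    --------
    A minimal sketch of typical use, assuming a `QuerySummary` and a
    `RegistryManagers` struct have already been constructed by
    higher-level registry code (the dataset type and collection names
    below are hypothetical)::

        builder = QueryBuilder(summary, managers)
        builder.joinDataset(rawDatasetType, collections=["HSC/raw/all"],
                            isResult=True, findFirst=True)
        query = builder.finish()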

    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
        self.summary = summary
        self._simpleQuery = SimpleQuery()
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        self._columns = QueryColumns()
        self._managers = managers
        self._doomed_by = list(doomed_by)

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on
        some other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in
        the query with which it is related, via both dimension keys and
        spatial and temporal relationships.

        External calls to this method should rarely be necessary; `finish`
        will automatically call it if the `DimensionElement` has been
        identified as one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added. The element must be
            associated with a database table (see
            `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or to
        constrain the query results based on the existence of datasets.
        However, all dimensions used to identify the dataset type must
        have already been included in `QuerySummary.requested` when
        initializing the `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the
            collections to search for datasets, such as a `str`,
            `re.Pattern`, or iterable thereof. ``...`` can be used to
            search all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this
            dataset type. If `False`, the existence of datasets of this
            type is used only to constrain the data IDs returned by the
            query. `joinDataset` may be called with ``isResult=True`` at
            most once on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match
            for each data ID, searching the given collections in order.
            Requires that all entries in ``collections`` be regular
            strings, so there is a clear search order. Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the
            query should proceed. If `False`, we were able to determine
            (from the combination of ``datasetType`` and ``collections``)
            that there would be no results joined in from this dataset,
            and hence (due to the inner join that would normally be
            present) the full query will return no results.
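
        Examples
        --------
        A sketch of a constraint-only join, assuming ``builder`` is a
        `QueryBuilder` whose `QuerySummary` already covers the dimensions
        of a hypothetical ``biasDatasetType`` (``...`` means all
        collections)::

            anyRecords = builder.joinDataset(biasDatasetType,
                                             collections=...,
                                             isResult=False)
            if not anyRecords:
                # The query is now known to yield no results.
                pass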

        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results. It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in
            # pipe_base.
            self._doomed_by.append(
                f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
                "any collection."
            )
            return False
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        if not findFirst:
            calibration_collections = []
            other_collections = []
        rejections: List[str] = []
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            # Only include collections that (according to collection
            # summaries) might have datasets of this type and governor
            # dimensions consistent with the query's WHERE clause.
            collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
            if not collection_summary.is_compatible_with(
                datasetType,
                self.summary.where.restriction,
                rejections=rejections,
                name=collectionRecord.name,
            ):
                continue
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If the collection name was provided explicitly, raise for
                # query modes we do not support yet; otherwise the
                # collection is part of a CHAINED collection and we skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
                        raise NotImplementedError(
                            f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    elif findFirst:
                        raise NotImplementedError(
                            f"Find-first query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    else:
                        calibration_collections.append(collectionRecord)
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    rejections.append(
                        f"Not searching for non-calibration dataset {datasetType.name!r} "
                        f"in CALIBRATION collection {collectionRecord.name!r}."
                    )
                    continue
            elif findFirst:
                # If findFirst=True, each collection gets its own subquery
                # so we can add a literal rank for it.
                ssq = datasetRecordStorage.select(
                    collectionRecord,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                assert {c.name for c in ssq.columns} == baseColumnNames
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
                subsubqueries.append(ssq.combine())
            else:
                # If findFirst=False, we use one subquery for all
                # CALIBRATION collections and one subquery for all other
                # collections; we'll assemble those later after grouping by
                # collection type.
                other_collections.append(collectionRecord)
        if not findFirst:
            if other_collections:
                ssq = datasetRecordStorage.select(
                    *other_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
            if calibration_collections:
                ssq = datasetRecordStorage.select(
                    *calibration_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
        if not subsubqueries:
            if rejections:
                self._doomed_by.extend(rejections)
            else:
                self._doomed_by.append(f"No collections to search matching expression {collections}.")
            # Make a single subquery with no collections that never yields
            # results; this should never get executed, but downstream code
            # still needs to access the SQLAlchemy column objects.
            ssq = datasetRecordStorage.select(
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select if isResult else None,
                run=SimpleQuery.Select if isResult else None,
                ingestDate=SimpleQuery.Select if isResult else None,
            )
            if findFirst:
                # Use a constant rank here; the loop variable ``rank`` may
                # be undefined if no collections were iterated above.
                ssq.columns.append(sqlalchemy.sql.literal(0).label("rank"))
            subsubqueries.append(ssq.combine())
        # Although one would expect that these subqueries could be combined
        # with UNION ALL instead of UNION, because each subquery is already
        # distinct, it turns out that with many subqueries this causes
        # catastrophic performance problems with both SQLite and
        # PostgreSQL. Using UNION may require more table scans, but it
        # yields a much simpler query plan given our table structures. See
        # DM-31429.
        subquery = sqlalchemy.sql.union(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID. The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst}_search.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                # ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    *windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    *[window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return not self._doomed_by

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables
        whose records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be
            a join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query. The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query
            results.
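
        Examples
        --------
        A sketch joining a hypothetical table keyed by the ``instrument``
        and ``detector`` dimensions; ``extraTable`` is assumed to be a
        `sqlalchemy.sql.FromClause` with columns of those names, and the
        dimensions must already be in the builder's `QuerySummary`::

            dims = NamedValueSet(
                builder.summary.universe[name]
                for name in ("instrument", "detector")
            )
            builder.joinTable(extraTable, dims)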

        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by a call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be
            a join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query. The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values;
            must be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with
            AND to form (part of) the ON expression for this JOIN.
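
        Examples
        --------
        `startJoin` and `finishJoin` split the join in two so the caller
        can add its own terms to the ON expression. A sketch, where
        ``table`` and ``extraCondition`` are hypothetical and ``dims`` is
        a `NamedValueSet` of `Dimension` objects::

            joinOn = builder.startJoin(table, dims, dims.names)
            joinOn.append(extraCondition)
            builder.finishJoin(table, joinOn)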

        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by a call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be
            a join or subquery expression) to be joined. Must be the same
            object passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with
            AND to form (part of) the ON expression for this JOIN. Should
            include at least the elements of the list returned by
            `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as
        necessary by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for
        # spatial/temporal joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize
        # the number of tables joined. For example, the "visit" table
        # provides the primary key value for the "instrument" table it
        # depends on, so we don't need to join "instrument" as well unless
        # we have a nontrivial expression on it (and hence included it
        # already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have
        # their primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting
        all joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key. This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a
                # chance of making things easier on the DB's query
                # optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a
                # skypix dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or
        # more timespans in the query that aren't given, add a WHERE
        # expression for each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False`
            should only be passed if the caller can independently guarantee
            that all dimension relationships are already captured in
            non-dimension tables that have been manually included in the
            query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
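
        Examples
        --------
        A sketch of the usual final step (``builder`` is assumed to be a
        fully populated `QueryBuilder`)::

            query = builder.finish()

        If the builder has already proven that the query can return no
        results (for example, an unregistered dataset type was requested),
        the returned `Query` carries the explanatory ``doomed_by`` messages
        and is never actually executed.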

        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers,
                              doomed_by=self._doomed_by)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers,
                           doomed_by=self._doomed_by)