Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import AbstractSet, Any, Iterable, List, Optional 

26 

27import sqlalchemy.sql 

28 

29from ...core import ( 

30 DimensionElement, 

31 SkyPixDimension, 

32 Dimension, 

33 DatasetType, 

34 SimpleQuery, 

35) 

36 

37from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet 

38 

39from .._collectionType import CollectionType 

40from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers 

41from .expressions import convertExpressionToSql 

42from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query 

43from ..wildcards import CollectionSearch, CollectionQuery 

44 

45 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers):
        # Categorized description of the dimensions/constraints requested.
        self.summary = summary
        # Accumulates the FROM/WHERE clauses as tables are joined in.
        self._simpleQuery = SimpleQuery()
        # Logical tables (joins or subqueries) already added to the query,
        # keyed by the dimension element they represent.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Tracks the SQLAlchemy columns available for dimension keys,
        # regions, timespans, and (at most one) result dataset.
        self._columns = QueryColumns()
        # Registry manager structs used to locate storage for dimensions,
        # datasets, and collections.
        self._managers = managers

64 

65 def hasDimensionKey(self, dimension: Dimension) -> bool: 

66 """Return `True` if the given dimension's primary key column has 

67 been included in the query (possibly via a foreign key column on some 

68 other table). 

69 """ 

70 return dimension in self._columns.keys 

71 

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        # Look up the storage object responsible for this element's table.
        storage = self._managers.dimensions[element]
        # Pass the region/timespan column mappings only when the summary
        # identified this element as participating in spatial/temporal joins;
        # presumably storage.join populates those mappings as a side effect —
        # confirm against the storage implementation.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        # Remember the joined table so expressions can reference its columns
        # and so we never join the same element twice.
        self._elements[element] = fromClause

97 

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one time
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # A find-first result search needs an ordered collection expression;
        # anything else accepts the more general (possibly unordered) form.
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Collection names the caller spelled out literally (as opposed to
        # those matched by patterns or reached through chains); used below to
        # decide whether a CALIBRATION collection should raise or be skipped.
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per collection; these are unioned together below.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # Columns every per-collection subquery must produce (checked by the
        # asserts below so the UNION branches line up).
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        # ``rank`` is the 0-based position of the collection in the search
        # order; it drives the find-first window function below.
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If collection name was provided explicitly then say sorry,
                # otherwise collection is a part of chained one and we skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    raise NotImplementedError(
                        f"Query for dataset type '{datasetType.name}' in CALIBRATION-type collection "
                        f"'{collectionRecord.name}' is not yet supported."
                    )
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    continue
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None,
                                              ingestDate=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # This collection has no datasets of this type; skip it.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if findFirst:
                # Tag each row with its collection's search rank so the
                # window function below can pick the best match per data ID.
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could contain matching datasets.
            return False
        # Although one would expect that these subqueries can be
        # UNION ALL instead of UNION because each subquery is already
        # distinct, it turns out that with many
        # subqueries this causes catastrophic performance problems
        # with both sqlite and postgres.  Using UNION may require
        # more table scans, but a much simpler query plan given our
        # table structures.  See DM-31429.
        subquery = sqlalchemy.sql.union(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 0 AS rank
                #         FROM <collection1>
                #     UNION
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection2>
                #     UNION
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst_search}.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     FROM {dst}_search
                # ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                # ROW_NUMBER() partitioned by data ID and ordered by rank
                # numbers each data ID's matches in collection-search order.
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    *windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                # Keep only the best-ranked row for each data ID.
                subquery = sqlalchemy.sql.select(
                    *[window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            # Expose dataset-identifying columns so result rows can be turned
            # into DatasetRef instances.
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            # Existence-only constraint: the inner join alone filters data
            # IDs, so no dataset columns are exposed.
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

274 

275 def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *, 

276 datasets: Optional[DatasetQueryColumns] = None) -> None: 

277 """Join an arbitrary table to the query via dimension relationships. 

278 

279 External calls to this method should only be necessary for tables whose 

280 records represent neither datasets nor dimension elements. 

281 

282 Parameters 

283 ---------- 

284 table : `sqlalchemy.sql.FromClause` 

285 SQLAlchemy object representing the logical table (which may be a 

286 join or subquery expression) to be joined. 

287 dimensions : iterable of `Dimension` 

288 The dimensions that relate this table to others that may be in the 

289 query. The table must have columns with the names of the 

290 dimensions. 

291 datasets : `DatasetQueryColumns`, optional 

292 Columns that identify a dataset that is part of the query results. 

293 """ 

294 unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions) 

295 unexpectedDimensions.discard(self.summary.universe.commonSkyPix) 

296 if unexpectedDimensions: 

297 raise NotImplementedError( 

298 f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that " 

299 f"were not provided originally to the QuerySummary object passed at construction." 

300 ) 

301 joinOn = self.startJoin(table, dimensions, dimensions.names) 

302 self.finishJoin(table, joinOn) 

303 if datasets is not None: 

304 assert self._columns.datasets is None, \ 

305 "At most one result dataset type can be returned by a query." 

306 self._columns.datasets = datasets 

307 

308 def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension], 

309 columnNames: Iterable[str] 

310 ) -> List[sqlalchemy.sql.ColumnElement]: 

311 """Begin a join on dimensions. 

312 

313 Must be followed by call to `finishJoin`. 

314 

315 Parameters 

316 ---------- 

317 table : `sqlalchemy.sql.FromClause` 

318 SQLAlchemy object representing the logical table (which may be a 

319 join or subquery expression) to be joined. 

320 dimensions : iterable of `Dimension` 

321 The dimensions that relate this table to others that may be in the 

322 query. The table must have columns with the names of the 

323 dimensions. 

324 columnNames : iterable of `str` 

325 Names of the columns that correspond to dimension key values; must 

326 be `zip` iterable with ``dimensions``. 

327 

328 Returns 

329 ------- 

330 joinOn : `list` of `sqlalchemy.sql.ColumnElement` 

331 Sequence of boolean expressions that should be combined with AND 

332 to form (part of) the ON expression for this JOIN. 

333 """ 

334 joinOn = [] 

335 for dimension, columnName in zip(dimensions, columnNames): 

336 columnInTable = table.columns[columnName] 

337 columnsInQuery = self._columns.keys.setdefault(dimension, []) 

338 for columnInQuery in columnsInQuery: 

339 joinOn.append(columnInQuery == columnInTable) 

340 columnsInQuery.append(columnInTable) 

341 return joinOn 

342 

343 def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement] 

344 ) -> None: 

345 """Complete a join on dimensions. 

346 

347 Must be preceded by call to `startJoin`. 

348 

349 Parameters 

350 ---------- 

351 table : `sqlalchemy.sql.FromClause` 

352 SQLAlchemy object representing the logical table (which may be a 

353 join or subquery expression) to be joined. Must be the same object 

354 passed to `startJoin`. 

355 joinOn : `list` of `sqlalchemy.sql.ColumnElement` 

356 Sequence of boolean expressions that should be combined with AND 

357 to form (part of) the ON expression for this JOIN. Should include 

358 at least the elements of the list returned by `startJoin`. 

359 """ 

360 onclause: Optional[sqlalchemy.sql.ColumnElement] 

361 if len(joinOn) == 0: 

362 onclause = None 

363 elif len(joinOn) == 1: 

364 onclause = joinOn[0] 

365 else: 

366 onclause = sqlalchemy.sql.and_(*joinOn) 

367 self._simpleQuery.join(table, onclause=onclause) 

368 

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

390 

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            # Translate the parsed user-expression tree into a SQLAlchemy
            # boolean expression over the columns/elements joined so far.
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now; constrain the skypix columns to
                    # the pixel IDs overlapping it.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

441 

442 def finish(self, joinMissing: bool = True) -> Query: 

443 """Finish query constructing, returning a new `Query` instance. 

444 

445 Parameters 

446 ---------- 

447 joinMissing : `bool`, optional 

448 If `True` (default), automatically join any missing dimension 

449 element tables (according to the categorization of the 

450 `QuerySummary` the builder was constructed with). `False` should 

451 only be passed if the caller can independently guarantee that all 

452 dimension relationships are already captured in non-dimension 

453 tables that have been manually included in the query. 

454 

455 Returns 

456 ------- 

457 query : `Query` 

458 A `Query` object that can be executed and used to interpret result 

459 rows. 

460 """ 

461 if joinMissing: 

462 self._joinMissingDimensionElements() 

463 self._addWhereClause() 

464 if self._columns.isEmpty(): 

465 return EmptyQuery(self.summary.requested.universe, managers=self._managers) 

466 return DirectQuery(graph=self.summary.requested, 

467 uniqueness=DirectQueryUniqueness.NOT_UNIQUE, 

468 whereRegion=self.summary.where.dataId.region, 

469 simpleQuery=self._simpleQuery, 

470 columns=self._columns, 

471 managers=self._managers)