Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import Any, Iterable, List, Optional 

26 

27import sqlalchemy.sql 

28 

29from ...core import ( 

30 DimensionElement, 

31 SkyPixDimension, 

32 Dimension, 

33 DatasetType, 

34 NamedKeyDict, 

35 NamedValueSet, 

36 SimpleQuery, 

37) 

38 

39from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers 

40from .expressions import ClauseVisitor 

41from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query 

42from ..wildcards import CollectionSearch, CollectionQuery 

43 

44 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers) -> None:
        # Struct describing the requested dimensions, expression, and data ID.
        self.summary = summary
        # Accumulates FROM/JOIN/WHERE pieces as tables are joined in.
        self._simpleQuery = SimpleQuery()
        # Maps each DimensionElement already joined to its FROM clause.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Tracks dimension-key, region, timespan, and dataset columns
        # available in the query so far.
        self._columns = QueryColumns()
        # Registry manager structs used to locate dimension/dataset/collection
        # storage.
        self._managers = managers

63 

64 def hasDimensionKey(self, dimension: Dimension) -> bool: 

65 """Return `True` if the given dimension's primary key column has 

66 been included in the query (possibly via a foreign key column on some 

67 other table). 

68 """ 

69 return dimension in self._columns.keys 

70 

71 def joinDimensionElement(self, element: DimensionElement) -> None: 

72 """Add the table for a `DimensionElement` to the query. 

73 

74 This automatically joins the element table to all other tables in the 

75 query with which it is related, via both dimension keys and spatial 

76 and temporal relationships. 

77 

78 External calls to this method should rarely be necessary; `finish` will 

79 automatically call it if the `DimensionElement` has been identified as 

80 one that must be included. 

81 

82 Parameters 

83 ---------- 

84 element : `DimensionElement` 

85 Element for which a table should be added. The element must be 

86 associated with a database table (see `DimensionElement.hasTable`). 

87 """ 

88 assert element not in self._elements, "Element already included in query." 

89 storage = self._managers.dimensions[element] 

90 fromClause = storage.join( 

91 self, 

92 regions=self._columns.regions if element in self.summary.spatial else None, 

93 timespans=self._columns.timespans if element in self.summary.temporal else None, 

94 ) 

95 self._elements[element] = fromClause 

96 

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, deduplicate: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one
            time on a particular `QueryBuilder` instance.
        deduplicate : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # Deduplication needs an ordered search, so it requires the stricter
        # CollectionSearch expression form; otherwise any collection
        # expression is acceptable.
        if isResult and deduplicate:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One subquery per collection searched; these are combined below with
        # UNION ALL.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # Column names every per-collection subquery must produce.
        baseColumnNames = {"id", runKeyName} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 datasetType=datasetType)):
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # This collection cannot contain this dataset type.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if deduplicate:
                # Tag each row with the position of its collection in the
                # search order so the window function below can pick winners.
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could hold this dataset type: no results.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if deduplicate:
                # Rewrite the subquery (currently a UNION ALL over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION ALL
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION ALL
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst_search}.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName)
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    [window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

243 

244 def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueSet[Dimension], *, 

245 datasets: Optional[DatasetQueryColumns] = None) -> None: 

246 """Join an arbitrary table to the query via dimension relationships. 

247 

248 External calls to this method should only be necessary for tables whose 

249 records represent neither datasets nor dimension elements. 

250 

251 Parameters 

252 ---------- 

253 table : `sqlalchemy.sql.FromClause` 

254 SQLAlchemy object representing the logical table (which may be a 

255 join or subquery expression) to be joined. 

256 dimensions : iterable of `Dimension` 

257 The dimensions that relate this table to others that may be in the 

258 query. The table must have columns with the names of the 

259 dimensions. 

260 datasets : `DatasetQueryColumns`, optional 

261 Columns that identify a dataset that is part of the query results. 

262 """ 

263 unexpectedDimensions = NamedValueSet(dimensions - self.summary.requested.dimensions) 

264 unexpectedDimensions.discard(self.summary.universe.commonSkyPix) 

265 if unexpectedDimensions: 

266 raise NotImplementedError( 

267 f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that " 

268 f"were not provided originally to the QuerySummary object passed at construction." 

269 ) 

270 joinOn = self.startJoin(table, dimensions, dimensions.names) 

271 self.finishJoin(table, joinOn) 

272 if datasets is not None: 

273 assert self._columns.datasets is None, \ 

274 "At most one result dataset type can be returned by a query." 

275 self._columns.datasets = datasets 

276 

277 def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension], 

278 columnNames: Iterable[str] 

279 ) -> List[sqlalchemy.sql.ColumnElement]: 

280 """Begin a join on dimensions. 

281 

282 Must be followed by call to `finishJoin`. 

283 

284 Parameters 

285 ---------- 

286 table : `sqlalchemy.sql.FromClause` 

287 SQLAlchemy object representing the logical table (which may be a 

288 join or subquery expression) to be joined. 

289 dimensions : iterable of `Dimension` 

290 The dimensions that relate this table to others that may be in the 

291 query. The table must have columns with the names of the 

292 dimensions. 

293 columnNames : iterable of `str` 

294 Names of the columns that correspond to dimension key values; must 

295 be `zip` iterable with ``dimensions``. 

296 

297 Returns 

298 ------- 

299 joinOn : `list` of `sqlalchemy.sql.ColumnElement` 

300 Sequence of boolean expressions that should be combined with AND 

301 to form (part of) the ON expression for this JOIN. 

302 """ 

303 joinOn = [] 

304 for dimension, columnName in zip(dimensions, columnNames): 

305 columnInTable = table.columns[columnName] 

306 columnsInQuery = self._columns.keys.setdefault(dimension, []) 

307 for columnInQuery in columnsInQuery: 

308 joinOn.append(columnInQuery == columnInTable) 

309 columnsInQuery.append(columnInTable) 

310 return joinOn 

311 

312 def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement] 

313 ) -> None: 

314 """Complete a join on dimensions. 

315 

316 Must be preceded by call to `startJoin`. 

317 

318 Parameters 

319 ---------- 

320 table : `sqlalchemy.sql.FromClause` 

321 SQLAlchemy object representing the logical table (which may be a 

322 join or subquery expression) to be joined. Must be the same object 

323 passed to `startJoin`. 

324 joinOn : `list` of `sqlalchemy.sql.ColumnElement` 

325 Sequence of boolean expressions that should be combined with AND 

326 to form (part of) the ON expression for this JOIN. Should include 

327 at least the elements of the list returned by `startJoin`. 

328 """ 

329 onclause: Optional[sqlalchemy.sql.ColumnElement] 

330 if len(joinOn) == 0: 

331 onclause = None 

332 elif len(joinOn) == 1: 

333 onclause = joinOn[0] 

334 else: 

335 onclause = sqlalchemy.sql.and_(*joinOn) 

336 self._simpleQuery.join(table, onclause=onclause) 

337 

338 def _joinMissingDimensionElements(self) -> None: 

339 """Join all dimension element tables that were identified as necessary 

340 by `QuerySummary` and have not yet been joined. 

341 

342 For internal use by `QueryBuilder` only; will be called (and should 

343 only by called) by `finish`. 

344 """ 

345 # Join all DimensionElement tables that we need for spatial/temporal 

346 # joins/filters or a nontrivial WHERE expression. 

347 # We iterate over these in *reverse* topological order to minimize the 

348 # number of tables joined. For example, the "visit" table provides 

349 # the primary key value for the "instrument" table it depends on, so we 

350 # don't need to join "instrument" as well unless we had a nontrivial 

351 # expression on it (and hence included it already above). 

352 for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True): 

353 self.joinDimensionElement(element) 

354 # Join in any requested Dimension tables that don't already have their 

355 # primary keys identified by the query. 

356 for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True): 

357 if dimension not in self._columns.keys: 

358 self.joinDimensionElement(dimension) 

359 

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.expression.tree is not None:
            # Translate the user expression tree into SQLAlchemy boolean
            # expressions over the columns already in the query.
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            self._simpleQuery.where.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.dataId.graph:
                givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.whereRegion is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.whereRegion):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.dataId.graph.elements
                self._simpleQuery.where.append(intervalInQuery.overlaps(givenInterval))

400 

401 def finish(self, joinMissing: bool = True) -> Query: 

402 """Finish query constructing, returning a new `Query` instance. 

403 

404 Parameters 

405 ---------- 

406 joinMissing : `bool`, optional 

407 If `True` (default), automatically join any missing dimension 

408 element tables (according to the categorization of the 

409 `QuerySummary` the builder was constructed with). `False` should 

410 only be passed if the caller can independently guarantee that all 

411 dimension relationships are already captured in non-dimension 

412 tables that have been manually included in the query. 

413 

414 Returns 

415 ------- 

416 query : `Query` 

417 A `Query` object that can be executed and used to interpret result 

418 rows. 

419 """ 

420 if joinMissing: 

421 self._joinMissingDimensionElements() 

422 self._addWhereClause() 

423 if self._columns.isEmpty(): 

424 return EmptyQuery(self.summary.requested.universe, managers=self._managers) 

425 return DirectQuery(graph=self.summary.requested, 

426 uniqueness=DirectQueryUniqueness.NOT_UNIQUE, 

427 whereRegion=self.summary.dataId.region, 

428 simpleQuery=self._simpleQuery, 

429 columns=self._columns, 

430 managers=self._managers)