Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import AbstractSet, Any, Iterable, List, Optional 

26 

27import sqlalchemy.sql 

28 

29from ...core import ( 

30 DimensionElement, 

31 SkyPixDimension, 

32 Dimension, 

33 DatasetType, 

34 NamedKeyDict, 

35 NamedValueSet, 

36 SimpleQuery, 

37) 

38 

39from .._collectionType import CollectionType 

40from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers 

41from .expressions import ClauseVisitor 

42from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query 

43from ..wildcards import CollectionSearch, CollectionQuery 

44 

45 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers) -> None:
        # Struct describing the dimensions, expression, and data ID that
        # define this query.
        self.summary = summary
        # Accumulates the FROM/JOIN/WHERE pieces as tables are joined in.
        self._simpleQuery = SimpleQuery()
        # Tables already joined for dimension elements, keyed by element.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Bookkeeping for dimension-key, region, timespan, and dataset
        # columns currently present in the query.
        self._columns = QueryColumns()
        self._managers = managers

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        # Delegate the actual join to the element's storage object; pass the
        # region/timespan bookkeeping dicts only when this element actually
        # participates in the query's spatial/temporal relationships.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one time
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # findFirst needs an ordered search (CollectionSearch); otherwise any
        # collection expression (CollectionQuery) is acceptable.
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery.any:
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per collection; these are combined with UNION ALL below.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # Column names every per-collection subquery must produce (checked by
        # the asserts below).
        baseColumnNames = {"id", runKeyName} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        # ``rank`` is the position in the collection search path; used by the
        # window-function rewrite when findFirst is requested.
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 datasetType=datasetType,
                                                                 collectionTypes=collectionTypes)):
            if collectionRecord.type is CollectionType.CALIBRATION:
                if datasetType.isCalibration():
                    raise NotImplementedError(
                        f"Query for dataset type '{datasetType.name}' in CALIBRATION-type collection "
                        f"'{collectionRecord.name}' is not yet supported."
                    )
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    continue
            # ``None`` for id/run suppresses those columns when this dataset
            # is only a constraint, not a result.
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # Storage reports this collection cannot contain matches.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if findFirst:
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could contain this dataset type; the overall
            # (inner-joined) query can therefore return nothing.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION ALL over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                #     WITH {dst}_search AS (
                #         SELECT {data-id-cols}, id, run_id, 1 AS rank
                #             FROM <collection1>
                #         UNION ALL
                #         SELECT {data-id-cols}, id, run_id, 2 AS rank
                #             FROM <collection2>
                #         UNION ALL
                #         ...
                #     )
                #     SELECT
                #         {dst}_window.{data-id-cols},
                #         {dst}_window.id,
                #         {dst}_window.run_id
                #     FROM (
                #         SELECT
                #             {dst}_search.{data-id-cols},
                #             {dst}_search.id,
                #             {dst}_search.run_id,
                #             ROW_NUMBER() OVER (
                #                 PARTITION BY {dst_search}.{data-id-cols}
                #                 ORDER BY rank
                #             ) AS rownum
                #     ) {dst}_window
                #     WHERE
                #         {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName)
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                # ROW_NUMBER() numbers rows within each data ID by collection
                # search order; rownum == 1 is the first match.
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    [window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.

        Raises
        ------
        NotImplementedError
            Raised if ``dimensions`` includes any dimension (other than the
            universe's common skypix dimension) not already present in
            `QuerySummary.requested`.
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.requested.dimensions)
        # commonSkyPix is implicitly available as the mediator for spatial
        # joins, so it is never "unexpected".
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already in the query
            # that carries the same dimension key.
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            # Record the new column so future joins (and the WHERE clause)
            # can constrain it as well.
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            # No shared dimension keys: an unconstrained (cross) join.
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Translate the user expression tree (if any) into SQLAlchemy terms.
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            self._simpleQuery.where.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.dataId.graph:
                givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.whereRegion is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    # Expand the region's pixel envelope into explicit ID
                    # lists for an IN constraint.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.whereRegion):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.dataId.graph.elements
                self._simpleQuery.where.append(intervalInQuery.overlaps(givenInterval))

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        # No columns at all means nothing was joined; return the trivial
        # query rather than emitting invalid SQL.
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers)