Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import AbstractSet, Any, Iterable, List, Optional 

26 

27import sqlalchemy.sql 

28 

29from ...core import ( 

30 DimensionElement, 

31 SkyPixDimension, 

32 Dimension, 

33 DatasetType, 

34 SimpleQuery, 

35) 

36 

37from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet 

38 

39from .._collectionType import CollectionType 

40from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers 

41from .expressions import ClauseVisitor 

42from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query 

43from ..wildcards import CollectionSearch, CollectionQuery 

44 

45 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers):
        # Struct describing the dimensions, expression, and data ID for this
        # query; consulted throughout construction.
        self.summary = summary
        # Accumulates FROM/JOIN/WHERE pieces as tables are joined in.
        self._simpleQuery = SimpleQuery()
        # Maps each joined DimensionElement to the FROM clause its storage
        # provided; used later by the WHERE-clause expression visitor.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Tracks the dimension-key, region, timespan, and dataset columns
        # available in the query so far.
        self._columns = QueryColumns()
        self._managers = managers

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).

        Parameters
        ----------
        dimension : `Dimension`
            Dimension to check for.

        Returns
        -------
        hasKey : `bool`
            Whether a key column for ``dimension`` is present in the query.
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        # Delegate the join to the element's storage object; pass the region/
        # timespan column mappings only when the summary says this element
        # participates in spatial/temporal joins, so storage can add the
        # appropriate columns.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        # Remember the FROM clause so the WHERE-expression visitor can
        # reference this element's non-key columns later.
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one time
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.

        Raises
        ------
        NotImplementedError
            Raised if a calibration dataset type is searched for in a
            CALIBRATION-type collection, which is not yet supported.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # find-first searches need an ordered CollectionSearch; otherwise an
        # unordered CollectionQuery suffices.
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per collection; these are combined with UNION ALL below.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # Names of the columns every per-collection subquery must produce;
        # used below as a sanity check and to build the final SELECT list.
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        # ``rank`` records the collection's position in the search order; it
        # is only meaningful (and only selected) when findFirst is in play.
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            if collectionRecord.type is CollectionType.CALIBRATION:
                if datasetType.isCalibration():
                    raise NotImplementedError(
                        f"Query for dataset type '{datasetType.name}' in CALIBRATION-type collection "
                        f"'{collectionRecord.name}' is not yet supported."
                    )
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    continue
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None,
                                              ingestDate=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # Storage reported no possible matches for this collection;
                # skip it rather than adding an empty branch to the union.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if findFirst:
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could contain this dataset type, so the overall
            # query (which would inner-join this subquery) has no results.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION ALL over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION ALL
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION ALL
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst_search}.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                # ROW_NUMBER() partitioned by data ID and ordered by rank
                # numbers each data ID's matches by collection search order;
                # the outer WHERE rownum = 1 then keeps only the first match.
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    [window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            # Expose the dataset-identifying columns so query results can be
            # turned into DatasetRef instances.
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            # Existence-only constraint: join the union, but expose no dataset
            # columns in the results.
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.

        Raises
        ------
        NotImplementedError
            Raised if ``dimensions`` includes dimensions not originally
            requested via the `QuerySummary` (other than the common skypix
            dimension, which is tolerated).
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.requested.dimensions)
        # The common skypix dimension is always permitted, since spatial join
        # tables reference it even when it was not explicitly requested.
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        # The table's dimension-key column names match the dimension names,
        # so the same iterable serves as both here.
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every existing column for the same
            # dimension, then record it so future joins equate against it too.
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        # An empty list means there is no relationship to existing tables
        # (None lets SimpleQuery decide how to combine); a single term needs
        # no AND wrapper.
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            # Translate the parsed user expression tree into a SQLAlchemy
            # boolean expression using the columns/elements joined so far.
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            self._simpleQuery.where.append(self.summary.where.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    # The pixelization envelope yields [begin, end) index
                    # ranges covering the region; expand them to explicit IDs
                    # for an IN constraint.
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(intervalInQuery.overlaps(givenInterval))

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        # A query with no columns at all degenerates to an EmptyQuery, which
        # avoids emitting SQL entirely.
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers)