Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import Any, List, Iterable, Optional, TYPE_CHECKING 

26 

27from sqlalchemy.sql import ColumnElement, and_, literal, select, FromClause 

28import sqlalchemy.sql 

29 

30from ...core import ( 

31 DimensionElement, 

32 SkyPixDimension, 

33 Dimension, 

34 DatasetType, 

35 NamedKeyDict, 

36 NamedValueSet, 

37) 

38 

39from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns 

40from .expressions import ClauseVisitor 

41from ._query import Query 

42from ..simpleQuery import Select 

43from ..wildcards import CollectionSearch, CollectionQuery 

44 

45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true

46 from ..interfaces import CollectionManager, DimensionRecordStorageManager, DatasetRecordStorageManager 

47 

48 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Typical usage is to construct the builder from a `QuerySummary`, call
    `joinDataset` (and, rarely, `joinDimensionElement` or `joinTable`) zero or
    more times, and then call `finish` exactly once to obtain a `Query`.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    collections : `CollectionManager`
        Manager object for collection tables.
    dimensions : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasets : `DatasetRecordStorageManager`
        Manager for storage backend objects that abstract access to dataset
        tables.
    """

    def __init__(self, summary: QuerySummary, *,
                 collections: CollectionManager,
                 dimensions: DimensionRecordStorageManager,
                 datasets: DatasetRecordStorageManager):
        self.summary = summary
        self._collections = collections
        self._dimensions = dimensions
        self._datasets = datasets
        # FROM clause (and, once `finish` runs, the full SELECT) under
        # construction; `None` until the first table is joined.
        self._sql: Optional[FromClause] = None
        # FROM-clause object returned by each dimension element's storage
        # when it was joined, keyed by element.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        # Bookkeeping for the key, region, timespan and dataset columns that
        # have been made available by the joins performed so far.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).

        Parameters
        ----------
        dimension : `Dimension`
            Dimension to look up.

        Returns
        -------
        included : `bool`
            Whether at least one join has registered a key column for
            ``dimension``.
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`)
            and must not have been joined already.
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensions[element]
        # Only hand the storage object the region/timespan column maps when
        # the summary says this element takes part in a spatial/temporal
        # relationship; otherwise pass `None` for that argument.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # The rank column is only ever exposed through the result columns, so
        # honor the documented contract and ignore ``addRank`` entirely for
        # constraint-only joins instead of adding an unused column.
        addRank = addRank and isResult
        if addRank:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        datasetRecordStorage = self._datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per collection; they are combined with UNION ALL below.
        subsubqueries = []
        for rank, collectionRecord in enumerate(collections.iter(self._collections, datasetType=datasetType)):
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=Select,
                                              id=Select if isResult else None,
                                              run=Select if isResult else None)
            if ssq is None:
                # Storage reported nothing to select for this collection.
                continue
            if addRank:
                # The position of the collection in the search order is the
                # rank of any dataset found in it.
                ssq.columns.append(literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries).alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required)
        if isResult:
            self._columns.datasets[datasetType] = DatasetQueryColumns(
                id=subquery.columns["id"],
                runKey=subquery.columns[self._collections.getRunForeignKeyName()],
                rank=subquery.columns["rank"] if addRank else None
            )
        return True

    def joinTable(self, table: FromClause, dimensions: NamedValueSet[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : `NamedValueSet` of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        """
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Constrain the new column to equal every column already in the
            # query that holds the same dimension key...
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            # ...and only then register it, so it is never compared to itself.
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            # A non-empty ON expression implies an earlier table supplied the
            # columns being compared against.
            assert self._sql is not None
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query; nothing to join against yet.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that.  Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip elements that were already joined through an explicit
            # external call to `joinDimensionElement`; joining them again
            # would trip that method's "already included" assertion.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        If there turn out to be no constraints at all, the query is left
        without a WHERE clause.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        whereTerms = []
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.dataId.graph:
                givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    whereTerms.append(columnInQuery == givenKey)
            elif self.summary.dataId.graph.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension that's constrained by a given region, which we
                # know now.  Expand each [begin, end) range of the region's
                # envelope into explicit pixel IDs.
                givenSkyPixIds: List[int] = []
                for begin, end in dimension.pixelization.envelope(self.summary.dataId.region):
                    givenSkyPixIds.extend(range(begin, end))
                for columnInQuery in columnsInQuery:
                    whereTerms.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.dataId.graph.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        assert self._sql is not None
        if not whereTerms:
            # Nothing to constrain on.  Calling ``and_()`` with no arguments
            # is deprecated in SQLAlchemy and would contribute nothing to the
            # query, so leave it without a WHERE clause.
            return
        # AND-together the full WHERE clause, and combine it with the FROM
        # clause.
        self._sql = self._sql.where(and_(*whereTerms))

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Column order: one key column per requested dimension, then the
        # columns of every result dataset type, then all region columns.
        columns = []
        for dimension in self.summary.requested:
            columns.append(self._columns.getKeyColumn(dimension))
        for datasetColumns in self._columns.datasets.values():
            columns.extend(datasetColumns)
        for regionColumn in self._columns.regions.values():
            columns.append(regionColumn)
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be built before the WHERE clause: `_addWhereClause`
        # applies ``.where`` to the select construct created here.
        self._addSelectClause()
        self._addWhereClause()
        return Query(summary=self.summary, sql=self._sql, columns=self._columns,
                     collections=self._collections)