Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import Any, List, Iterable, TYPE_CHECKING 

26 

27from sqlalchemy.sql import ColumnElement, and_, literal, select, FromClause 

28import sqlalchemy.sql 

29 

30from ...core import ( 

31 DimensionElement, 

32 SkyPixDimension, 

33 Dimension, 

34 DatasetType, 

35 NamedKeyDict, 

36) 

37 

38from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns 

39from .expressions import ClauseVisitor 

40from ._query import Query 

41from ..simpleQuery import Select 

42from ..wildcards import CollectionSearch, CollectionQuery 

43 

44if TYPE_CHECKING: 44 ↛ 45line 44 didn't jump to line 45, because the condition on line 44 was never true

45 from ..interfaces import CollectionsManager, DimensionRecordStorageManager, DatasetRecordStorageManager 

46 

47 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    collections : `CollectionsManager`
        Manager object for collection tables.
    dimensions : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasets : `DatasetRecordStorageManager`
        Manager for storage backend objects that abstract access to dataset
        tables.
    """

    def __init__(self, summary: QuerySummary, *,
                 collections: CollectionsManager,
                 dimensions: DimensionRecordStorageManager,
                 datasets: DatasetRecordStorageManager):
        self.summary = summary
        self._collections = collections
        self._dimensions = dimensions
        self._datasets = datasets
        # FROM clause (and, once `finish` has run, the full SELECT) under
        # construction; `None` until the first table is joined.
        self._sql = None
        # Dimension element tables that have already been joined.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensions[element]
        # Region/timespan column registries are only handed to the storage
        # object when the summary says this element takes part in spatial or
        # temporal relationships.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # A rank is only meaningful for result datasets; normalizing the flag
        # once keeps the per-collection subqueries consistent with the
        # documented "ignored if ``isResult`` is `False`" contract.
        addRank = addRank and isResult
        if addRank:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        datasetRecordStorage = self._datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per matching collection, later combined with UNION ALL.
        subsubqueries = []
        for rank, collectionRecord in enumerate(collections.iter(self._collections, datasetType=datasetType)):
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=Select,
                                              id=Select if isResult else None,
                                              run=Select if isResult else None)
            if addRank:
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection matched, so the inner join could never produce
            # any rows.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries).alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required)
        if isResult:
            self._columns.datasets[datasetType] = DatasetQueryColumns(
                id=subquery.columns["id"],
                runKey=subquery.columns[self._collections.getRunForeignKeyName()],
                rank=subquery.columns["rank"] if addRank else None
            )
        return True

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  Containers that expose a ``names`` attribute are used
            as-is; any other iterable (including one-shot iterators) is
            materialized and the column names are taken from each dimension's
            ``name``.
        """
        columnNames = getattr(dimensions, "names", None)
        if columnNames is None:
            # Plain iterable: materialize it so that it can be traversed both
            # for the names and by `startJoin`.
            dimensions = list(dimensions)
            columnNames = [dimension.name for dimension in dimensions]
        joinOn = self.startJoin(table, dimensions, columnNames)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already known to hold
            # this dimension's key, then register the new column as well.
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query: it simply becomes the FROM clause.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that.  Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip anything a caller already joined explicitly through
            # `joinDimensionElement`; joining it again would be an error.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`, after `_addSelectClause` has turned the
        FROM clause into a SELECT.  If there are no constraints to apply the
        query is left untouched.
        """
        whereTerms = []
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.dataId.graph:
                givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    whereTerms.append(columnInQuery == givenKey)
            elif self.summary.dataId.graph.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension that's constrained by a given region, and we know
                # the region now.
                givenSkyPixIds = []
                for begin, end in dimension.pixelization.envelope(self.summary.dataId.region):
                    givenSkyPixIds.extend(range(begin, end))
                for columnInQuery in columnsInQuery:
                    whereTerms.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.dataId.timespan
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.dataId.graph.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause, and combine it with the FROM
        # clause.  An empty ``and_()`` renders no WHERE at all and is
        # deprecated by newer SQLAlchemy releases, so skip it entirely.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Column order is: requested dimension keys, then the columns of each
        # result dataset type, then region columns.
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        for datasetColumns in self._columns.datasets.values():
            columns.extend(datasetColumns)
        columns.extend(self._columns.regions.values())
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be built before the WHERE clause is attached,
        # because only the SELECT construct (not the bare FROM clause) is
        # given a WHERE.
        self._addSelectClause()
        self._addWhereClause()
        return Query(summary=self.summary, sql=self._sql, columns=self._columns,
                     collections=self._collections)