Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import Any, List, Iterable, TYPE_CHECKING 

26 

27from sqlalchemy.sql import ColumnElement, and_, literal, bindparam, select, FromClause 

28import sqlalchemy.sql 

29from sqlalchemy.engine import Connection 

30 

31from ...core import ( 

32 DimensionElement, 

33 SkyPixDimension, 

34 Dimension, 

35 DatasetType, 

36 Timespan, 

37) 

38from ...core.utils import NamedKeyDict 

39 

40from ._structs import QuerySummary, QueryColumns, QueryParameters, GivenTime 

41from ._datasets import DatasetRegistryStorage 

42from .expressions import ClauseVisitor 

43from ._query import Query 

44 

45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true

46 from ..interfaces import DimensionRecordStorageManager 

47 

48 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection object.  This is only used to pass through
        to the `Query` object returned by `finish`.
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    dimensionStorage : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasetStorage : `DatasetRegistryStorage`
        Storage backend object that abstracts access to dataset tables.
    """

    def __init__(self, connection: Connection, summary: QuerySummary,
                 dimensionStorage: DimensionRecordStorageManager,
                 datasetStorage: DatasetRegistryStorage):
        self.summary = summary
        self._connection = connection
        self._dimensionStorage = dimensionStorage
        self._datasetStorage = datasetStorage
        # The FROM clause (and, once `finish` runs, the full SELECT) under
        # construction; `None` until the first table has been joined.
        self._sql = None
        # FROM-clause object for every dimension element joined so far.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        # Columns (dimension keys, regions, timespans, datasets) that the
        # joined tables have made available to the query.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensionStorage[element]
        # Region/timespan column mappings are handed to the storage object
        # only when the summary marks this element as participating in
        # spatial/temporal relationships.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> None:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  This may be a single instance of or an iterable of
            any of the following:

             - a `str` collection name;
             - a `Like` pattern to match against collection names;
             - `...`, indicating all collections.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.
        """
        assert datasetType.dimensions.issubset(self.summary.requested), \
            "Dataset type dimensions must be a subset of the requested dimensions."
        table = self._datasetStorage.getDatasetSubquery(datasetType, collections=collections,
                                                        isResult=isResult, addRank=addRank)
        self.joinTable(table, datasetType.dimensions)
        if isResult:
            # Stored as a (dataset_id, rank) pair; rank is `None` when no
            # rank column was requested.
            self._columns.datasets[datasetType] = (table.columns["dataset_id"],
                                                   table.columns["rank"] if addRank else None)

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  If the object has a ``names`` attribute it is used
            for the column names; otherwise the names are taken from each
            dimension's ``name``.
        """
        columnNames = getattr(dimensions, "names", None)
        if columnNames is None:
            # A plain iterable (possibly a one-shot iterator): materialize it
            # so it can be traversed once for names and once for the join.
            dimensions = tuple(dimensions)
            columnNames = [dimension.name for dimension in dimensions]
        joinOn = self.startJoin(table, dimensions, columnNames)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already in the query
            # for this dimension *before* registering the new column, so it
            # is never compared with itself.
            joinOn.extend(columnInQuery == columnInTable for columnInQuery in columnsInQuery)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query: it becomes the FROM clause as-is.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that. Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self):
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and given dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`, after `_addSelectClause`.

        Returns
        -------
        parameters : `QueryParameters`
            Struct holding the bind parameters created for values that will
            only be supplied when the query is executed.
        """
        parameters = QueryParameters()
        whereTerms = []
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.given:
                if self.summary.whenIsDimensionGiven(dimension) == GivenTime.AT_EXECUTION:
                    givenKey = bindparam(f"_given_later_{dimension.name}")
                    parameters.keys[dimension] = givenKey
                else:
                    givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                whereTerms.extend(columnInQuery == givenKey for columnInQuery in columnsInQuery)
            elif self.summary.given.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension that's constrained by a given region.
                if self.summary.whenIsRegionGiven() == GivenTime.AT_CONSTRUCTION:
                    # We know the region now.
                    givenSkyPixIds = []
                    for begin, end in dimension.pixelization.envelope(self.summary.dataId.region):
                        givenSkyPixIds.extend(range(begin, end))
                else:
                    # We'll know the region later (there might be a region
                    # now, too, but we'll know a more precise one later,
                    # and hence we'll ignore the one we know now).
                    # The parameter will be bound to a *list* of IDs and used
                    # with IN, which requires an "expanding" bind parameter.
                    givenSkyPixIds = bindparam(f"_given_later_{dimension.name}", expanding=True)
                    parameters.skypix[dimension] = givenSkyPixIds
                whereTerms.extend(columnInQuery.in_(givenSkyPixIds) for columnInQuery in columnsInQuery)
        # If we are [to be] given a dataId with a timespan, and there are
        # one or more timespans in the query that aren't given, add a WHERE
        # expression for each of them.
        if self.summary.given.temporal and self.summary.temporal:
            if self.summary.whenIsTimespanGiven() == GivenTime.AT_CONSTRUCTION:
                # Timespan is known now.
                givenInterval = self.summary.dataId.timespan
            else:
                # We'll know the timespan later (there might be a timespan now,
                # too, but we'll know a more precise one later, and hence we'll
                # ignore the one we know now).
                givenInterval = Timespan(
                    begin=bindparam("_given_later_timespan_begin"),
                    end=bindparam("_given_later_timespan_end"),
                )
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.given.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause, and combine it with the FROM
        # clause.  An and_() with no arguments is ambiguous (and deprecated in
        # SQLAlchemy), so leave the query unfiltered when there are no terms.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))
        return parameters

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Column order: requested dimension keys, then each dataset's
        # (dataset_id, rank) pair, then region columns.
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        for columnPair in self._columns.datasets.values():
            columns.extend(columnPair)
        columns.extend(self._columns.regions.values())
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be built first: the WHERE clause is attached to the
        # resulting select object rather than to the bare FROM clause.
        self._addSelectClause()
        parameters = self._addWhereClause()
        return Query(summary=self.summary, connection=self._connection,
                     sql=self._sql, columns=self._columns, parameters=parameters)