Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("QueryBuilder",) 

24 

25from typing import Any, List, Iterable, TYPE_CHECKING 

26 

27from sqlalchemy.sql import ColumnElement, and_, literal, select, FromClause 

28import sqlalchemy.sql 

29from sqlalchemy.engine import Connection 

30 

31from ...core import ( 

32 DimensionElement, 

33 SkyPixDimension, 

34 Dimension, 

35 DatasetType, 

36) 

37from ...core.utils import NamedKeyDict 

38 

39from ._structs import QuerySummary, QueryColumns 

40from ._datasets import DatasetRegistryStorage 

41from .expressions import ClauseVisitor 

42from ._query import Query 

43 

44if TYPE_CHECKING: 44 ↛ 45line 44 didn't jump to line 45, because the condition on line 44 was never true

45 from ..interfaces import DimensionRecordStorageManager 

46 

47 

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Typical use is to construct the builder, optionally call `joinDataset`
    and/or `joinTable`, and then call `finish` exactly once to obtain the
    `Query`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection object.  This is only used to pass through
        to the `Query` object returned by `finish`.
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    dimensionStorage : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasetStorage : `DatasetRegistryStorage`
        Storage backend object that abstracts access to dataset tables.
    """

    def __init__(self, connection: Connection, summary: QuerySummary,
                 dimensionStorage: DimensionRecordStorageManager,
                 datasetStorage: DatasetRegistryStorage):
        self.summary = summary
        self._connection = connection
        self._dimensionStorage = dimensionStorage
        self._datasetStorage = datasetStorage
        # FROM clause under construction.  `None` until the first table is
        # added by `finishJoin`; replaced by the full SELECT in `finish`.
        self._sql = None
        # FROM-clause object returned by the storage backend for each
        # dimension element that has been joined so far.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        # Bookkeeping for the columns (keys, regions, timespans, datasets)
        # contributed by the tables joined so far.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`)
            and must not have been joined already.
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensionStorage[element]
        # Region/timespan column maps are handed to the storage backend only
        # when the summary lists this element as spatial/temporal.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  This may be a single instance of or an iterable of
            any of the following:

             - a `str` collection name;
             - a `Like` pattern to match against collection names;
             - `...`, indicating all collections.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        table = self._datasetStorage.getDatasetSubquery(datasetType, collections=collections,
                                                        isResult=isResult, addRank=addRank)
        if table is None:
            # The storage backend produced no subquery; nothing was joined.
            return False
        self.joinTable(table, datasetType.dimensions)
        if isResult:
            # The second entry is `None` when no rank column was requested.
            self._columns.datasets[datasetType] = (table.columns["dataset_id"],
                                                   table.columns["rank"] if addRank else None)
        return True

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  If the object has a ``names`` attribute it is used
            directly for the column names; otherwise the names are taken from
            each element.
        """
        columnNames = getattr(dimensions, "names", None)
        if columnNames is None:
            # Plain iterable without ``names``: materialize it (it may be a
            # one-shot iterator and we need two passes) and derive the column
            # names from the elements.
            # NOTE(review): assumes each `Dimension` exposes its name as
            # ``.name`` -- confirm against the `Dimension` class.
            dimensions = tuple(dimensions)
            columnNames = tuple(dimension.name for dimension in dimensions)
        joinOn = self.startJoin(table, dimensions, columnNames)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already registered for
            # this dimension, then register the new column itself.
            joinOn.extend(columnInQuery == columnInTable for columnInQuery in columnsInQuery)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query: it simply becomes the FROM clause.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that.  Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        universe = self.summary.universe
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip elements joined earlier through an external call to
            # `joinDimensionElement`; joining them again would trip its
            # "already included" assertion.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        If there are no constraints to apply, the query is left unchanged.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`, after the SELECT clause has been added.
        """
        dataId = self.summary.dataId
        whereTerms = []
        tree = self.summary.expression.tree
        if tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in dataId.graph:
                givenKey = dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                whereTerms.extend(columnInQuery == givenKey for columnInQuery in columnsInQuery)
            elif dataId.graph.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension that's constrained by the data ID's region, which
                # we know now: expand the envelope's ranges into explicit IDs.
                givenSkyPixIds = []
                for begin, end in dimension.pixelization.envelope(dataId.region):
                    givenSkyPixIds.extend(range(begin, end))
                whereTerms.extend(columnInQuery.in_(givenSkyPixIds) for columnInQuery in columnsInQuery)
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = dataId.timespan
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in dataId.graph.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause, and combine it with the rest of
        # the query.  With no terms there is nothing to add; calling `and_()`
        # with no arguments would render nothing anyway and is deprecated in
        # newer SQLAlchemy releases.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        Columns are added in this order: one key column per requested
        dimension, then the column pair recorded for each dataset type, then
        all region columns.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        # NOTE(review): the second entry of each pair is `None` when
        # `joinDataset` was called without ``addRank``, and is passed through
        # to `select` as-is -- confirm that `Query` expects this.
        for columnPair in self._columns.datasets.values():
            columns.extend(columnPair)
        columns.extend(self._columns.regions.values())
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be built before the WHERE clause: until then
        # `self._sql` is a bare FROM clause, and `_addWhereClause` relies on
        # the ``where`` method of the SELECT.
        self._addSelectClause()
        self._addWhereClause()
        return Query(summary=self.summary, connection=self._connection,
                     sql=self._sql, columns=self._columns)