Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import Iterator, List, Optional, Union

from sqlalchemy.sql import ColumnElement

from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedValueSet,
    SkyPixDimension,
    Timespan,
)
# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .exprParser import Node, ParserYacc  # type: ignore

45 

46 

@dataclass
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    expression : `str`, optional
        The string expression to parse.  If `None` or empty, nothing is
        parsed: ``tree`` is `None` and ``keys`` and ``metadata`` are empty.

    Raises
    ------
    RuntimeError
        Raised if the parser fails on ``expression``; the parser's own
        exception is chained as the cause.
    """
    def __init__(self, universe: DimensionUniverse, expression: Optional[str] = None):
        if expression:
            # Deferred import; presumably this avoids an import cycle with
            # the expressions module -- confirm before hoisting it.
            from .expressions import InspectionVisitor
            try:
                parser = ParserYacc()
                self.tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            visitor = InspectionVisitor(universe)
            # ``tree`` is declared Optional; this narrows it before use.
            assert self.tree is not None
            # Walk the tree once to collect what the expression references.
            self.tree.visit(visitor)
            self.keys = visitor.keys
            self.metadata = visitor.metadata
        else:
            self.tree = None
            self.keys = NamedValueSet()
            self.metadata = NamedKeyDict()

    tree: Optional[Node]
    """The parsed user expression tree, if present (`Node` or `None`).
    """

    keys: NamedValueSet[Dimension]
    """All dimensions whose keys are referenced by the expression
    (`NamedValueSet` of `Dimension`).
    """

    metadata: NamedKeyDict[DimensionElement, List[str]]
    """All dimension elements metadata fields referenced by the expression
    (`NamedKeyDict` mapping `DimensionElement` to a `list` of field names).
    """

89 

90 

@dataclass
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.  A `str` (or `None`) is
        wrapped in a new `QueryWhereExpression`; an existing
        `QueryWhereExpression` is stored as-is.
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None):
        self.requested = requested
        self.dataId = dataId if dataId is not None else DataCoordinate.makeEmpty(requested.universe)
        self.expression = (expression if isinstance(expression, QueryWhereExpression)
                           else QueryWhereExpression(requested.universe, expression))

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    expression: QueryWhereExpression
    """Information about any parsed user WHERE expression
    (`QueryWhereExpression`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property
    def spatial(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's a spatial element of the dimensions whose keys must be
        #   joined (i.e. in `self.mustHaveKeysJoined.spatial`);
        # - it isn't also given at query construction time.
        result = NamedValueSet(self.mustHaveKeysJoined.spatial - self.dataId.graph.elements)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result

    @property
    def temporal(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate temporally in the query if:
        # - it's a temporal element of the dimensions whose keys must be
        #   joined (i.e. in `self.mustHaveKeysJoined.temporal`);
        # - it isn't also given at query construction time.
        result = NamedValueSet(self.mustHaveKeysJoined.temporal - self.dataId.graph.elements)
        if len(result) == 1 and not self.dataId.graph.temporal:
            # No temporal join or filter.  Even if this element might be
            # associated with temporal information, we don't need it for this
            # query.
            return NamedValueSet()
        return result

    @property
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions and those the WHERE expression
        # refers to by key.
        names = set(self.requested.names | self.expression.keys.names)
        return DimensionGraph(self.universe, names=names)

    @property
    def mustHaveTableJoined(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # ``mustHaveKeysJoined`` builds a new graph on every access, so
        # compute it once for the two loops below.
        keysJoined = self.mustHaveKeysJoined
        result = NamedValueSet(self.spatial | self.temporal | self.expression.metadata.keys())
        for dimension in keysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in keysJoined.union(self.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result

215 

216 

@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.

    Iterating over an instance yields ``id``, then ``runKey``, then ``rank``
    if (and only if) ``rank`` is not `None`.
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    rank: Optional[ColumnElement] = None
    """Column containing the index into the ordered sequence of given
    collections for the collection in which this dataset was found.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        """Iterate over the columns that are actually set.

        Yields
        ------
        column : `sqlalchemy.sql.ColumnElement`
            ``id``, ``runKey``, and then ``rank`` when it is not `None`.
        """
        yield self.id
        yield self.runKey
        # Compare against None explicitly rather than relying on the
        # column's truth value.
        if self.rank is not None:
            yield self.rank

242 

243 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = NamedKeyDict()

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, Timespan[ColumnElement]]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `Timespan` of `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: NamedKeyDict[DatasetType, DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results, for each `DatasetType` included in the query
    (`NamedKeyDict` [ `DatasetType`, `DatasetQueryColumns` ] ).
    """

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]