Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import Iterator, List, Optional, Set, Union

from sqlalchemy.sql import ColumnElement

from ...core import (
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    SkyPixDimension,
    Timespan,
)
from ...core.utils import NamedValueSet, NamedKeyDict
from .exprParser import Node, ParserYacc

42 

43 

@dataclass
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    expression : `str`, optional
        The string expression to parse.  If `None` or empty, no parsing is
        done: ``tree`` is `None` and ``keys`` and ``metadata`` are empty.

    Raises
    ------
    RuntimeError
        Raised (chained to the original exception) if the expression parser
        fails on ``expression``.
    """
    def __init__(self, universe: DimensionUniverse, expression: Optional[str] = None):
        if not expression:
            # Nothing to parse; expose empty containers so callers can use
            # the attributes without special-casing a missing expression.
            self.tree = None
            self.keys = NamedValueSet()
            self.metadata = NamedKeyDict()
            return
        # Imported here rather than at module scope, as in the original code.
        from .expressions import InspectionVisitor
        try:
            parser = ParserYacc()
            self.tree = parser.parse(expression)
        except Exception as exc:
            raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
        # Walk the parsed tree once to collect what the expression references.
        visitor = InspectionVisitor(universe)
        self.tree.visit(visitor)
        self.keys = visitor.keys
        self.metadata = visitor.metadata

    tree: Optional[Node]
    """The parsed user expression tree, if present (`Node` or `None`).
    """

    keys: NamedValueSet[Dimension]
    """All dimensions whose keys are referenced by the expression
    (`NamedValueSet` of `Dimension`).
    """

    metadata: NamedKeyDict[DimensionElement, Set[str]]
    """All dimension elements metadata fields referenced by the expression
    (`NamedKeyDict` mapping `DimensionElement` to a `set` of field names).
    """

85 

86 

@dataclass
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `ExpandedDataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.  Anything that is not
        already a `QueryWhereExpression` (including `None`) is passed to the
        `QueryWhereExpression` constructor.
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[ExpandedDataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None):
        self.requested = requested
        if dataId is None:
            dataId = ExpandedDataCoordinate(requested.universe.empty, ())
        self.dataId = dataId
        if not isinstance(expression, QueryWhereExpression):
            expression = QueryWhereExpression(requested.universe, expression)
        self.expression = expression

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    dataId: ExpandedDataCoordinate
    """A data ID identifying dimensions known before query construction
    (`ExpandedDataCoordinate`).
    """

    expression: QueryWhereExpression
    """Information about any parsed user WHERE expression
    (`QueryWhereExpression`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property
    def spatial(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it is in `self.mustHaveKeysJoined.spatial`;
        # - it isn't also given at query construction time (i.e. it is not
        #   one of the elements of `self.dataId.graph`).
        result = self.mustHaveKeysJoined.spatial - self.dataId.graph.elements
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result

    @property
    def temporal(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate temporally in the query if:
        # - it is in `self.mustHaveKeysJoined.temporal`;
        # - it isn't also given at query construction time (i.e. it is not
        #   one of the elements of `self.dataId.graph`).
        result = self.mustHaveKeysJoined.temporal - self.dataId.graph.elements
        if len(result) == 1 and not self.dataId.graph.temporal:
            # No temporal join or filter.  Even if this element might be
            # associated with temporal information, we don't need it for this
            # query.
            return NamedValueSet()
        return result

    @property
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions and those whose keys are
        # referenced by the WHERE expression.
        names = set(self.requested.names | self.expression.keys.names)
        return DimensionGraph(self.universe, names=names)

    @property
    def mustHaveTableJoined(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = self.spatial | self.temporal | self.expression.metadata.keys()
        # `mustHaveKeysJoined` builds a new graph on every access, so compute
        # it once for both loops below.
        keysJoined = self.mustHaveKeysJoined
        for dimension in keysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in keysJoined.union(self.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result

208 

209 

@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.

    Iterating over an instance yields ``id``, then ``runKey``, then ``rank``
    if (and only if) ``rank`` is not `None`.
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    rank: Optional[ColumnElement] = None
    """Column containing the index into the ordered sequence of given
    collections for the collection in which this dataset was found.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey
        # ``rank`` is optional; skip it entirely rather than yielding `None`.
        if self.rank is not None:
            yield self.rank

235 

236 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        # All containers start empty and are filled in incrementally.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = NamedKeyDict()

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, Timespan[ColumnElement]]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `Timespan` of `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: NamedKeyDict[DatasetType, DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results, for each `DatasetType` included in the query
    (`NamedKeyDict` [ `DatasetType`, `DatasetQueryColumns` ] ).
    """

    def getKeyColumn(self, dimension: Dimension) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension`
            Element for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
310 return self.keys[dimension][-1]