Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage 

24 

25from dataclasses import dataclass 

26from typing import Iterator, List, Optional, Union 

27 

28from sqlalchemy.sql import ColumnElement 

29 

30from lsst.sphgeom import Region 

31from ...core import ( 

32 TimespanDatabaseRepresentation, 

33 DataCoordinate, 

34 DatasetType, 

35 Dimension, 

36 DimensionElement, 

37 DimensionGraph, 

38 DimensionUniverse, 

39 NamedKeyDict, 

40 NamedValueSet, 

41 SkyPixDimension, 

42) 

43from ..interfaces import ( 

44 CollectionManager, 

45 DatasetRecordStorageManager, 

46 DimensionRecordStorageManager, 

47) 

48# We're not trying to add typing to the lex/yacc parser code, so MyPy 

49# doesn't know about some of these imports. 

50from .exprParser import Node, ParserYacc # type: ignore 

51 

52 

@dataclass
class QueryWhereExpression:
    """Parsed form of a user-supplied WHERE expression string.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Description of all known dimensions; passed to the visitor that
        inspects the parsed expression tree.
    expression : `str`, optional
        Expression text to parse.  `None` or an empty string produces an
        instance with no tree and empty ``keys`` and ``metadata``.

    Raises
    ------
    RuntimeError
        Raised, chained to the original exception, if constructing the
        parser or parsing ``expression`` fails.
    """

    def __init__(self, universe: DimensionUniverse, expression: Optional[str] = None):
        if not expression:
            # Nothing to parse: every attribute is left empty.
            self.tree = None
            self.keys = NamedValueSet()
            self.metadata = NamedKeyDict()
            return
        # Imported at call time rather than at module scope.
        from .expressions import InspectionVisitor
        try:
            self.tree = ParserYacc().parse(expression)
        except Exception as exc:
            raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
        inspector = InspectionVisitor(universe)
        assert self.tree is not None
        # Walking the tree lets the visitor record which dimension keys and
        # metadata fields the expression refers to.
        self.tree.visit(inspector)
        self.keys = inspector.keys
        self.metadata = inspector.metadata

    tree: Optional[Node]
    """Root of the parsed expression, or `None` when no expression was given
    (`Node` or `None`).
    """

    keys: NamedValueSet[Dimension]
    """Dimensions whose keys the expression refers to
    (`NamedValueSet` of `Dimension`).
    """

    metadata: NamedKeyDict[DimensionElement, List[str]]
    """Metadata fields the expression refers to, grouped by dimension element
    (`NamedKeyDict` mapping `DimensionElement` to a `list` of field names).
    """

95 

96 

@dataclass
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`; this is not checked by the constructor.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression, or an already-parsed one.
        Anything that is not a `QueryWhereExpression` (including `None`) is
        passed to the `QueryWhereExpression` constructor.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None`, the
        ``region`` attribute of the (possibly defaulted) data ID is used.
    """

    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None):
        self.requested = requested
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(requested.universe)
        self.dataId = dataId
        if not isinstance(expression, QueryWhereExpression):
            expression = QueryWhereExpression(requested.universe, expression)
        self.expression = expression
        # ``self.dataId`` can no longer be `None` at this point, so the only
        # condition for falling back to its region is a missing ``whereRegion``.
        if whereRegion is None:
            whereRegion = self.dataId.region
        self.whereRegion = whereRegion

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is required to return `True`.
    """

    whereRegion: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    expression: QueryWhereExpression
    """Information about any parsed user WHERE expression
    (`QueryWhereExpression`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property
    def spatial(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it is the element chosen by its spatial family from the
        #   dimensions in `mustHaveKeysJoined`;
        # - it isn't also given at query construction time (via the data ID).
        #
        # `mustHaveKeysJoined` builds a new `DimensionGraph` on every access,
        # so it is read once here instead of once per loop iteration.
        keysJoined = self.mustHaveKeysJoined
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in keysJoined.spatial:
            element = family.choose(keysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.dataId.graph.elements:
                result.add(element)
        if len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        elif len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if not self.dataId.graph.spatial:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet()
            # We can only perform those filters against SkyPix dimensions,
            # so if what we have isn't one, add the common SkyPix dimension
            # to the query; the element we have will be joined to that.
            element, = result
            if not isinstance(element, SkyPixDimension):
                result.add(self.universe.commonSkyPix)
        return result

    @property
    def temporal(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate temporally in the query if:
        # - it is the element chosen by its temporal family from the
        #   dimensions in `mustHaveKeysJoined`;
        # - it isn't also given at query construction time (via the data ID).
        #
        # As in `spatial`, read `mustHaveKeysJoined` once rather than
        # rebuilding the graph on every iteration.
        keysJoined = self.mustHaveKeysJoined
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in keysJoined.temporal:
            element = family.choose(keysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.dataId.graph.elements:
                result.add(element)
        if len(result) == 1 and not self.dataId.graph.temporal:
            # No temporal join or filter.  Even if this element might be
            # associated with temporal information, we don't need it for this
            # query.
            return NamedValueSet()
        return result

    @property
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.

        A new graph is constructed on every access, from the names of the
        requested dimensions and of the dimensions referenced by the
        expression.
        """
        names = set(self.requested.names | self.expression.keys.names)
        return DimensionGraph(self.universe, names=names)

    @property
    def mustHaveTableJoined(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Start from everything that takes part spatially or temporally, plus
        # every element whose metadata fields the expression references.
        result = NamedValueSet(self.spatial | self.temporal | self.expression.metadata.keys())
        # Read the graph once; both loops below use the same one.
        keysJoined = self.mustHaveKeysJoined
        for dimension in keysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in keysJoined.union(self.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result

243 

244 

@dataclass
class DatasetQueryColumns:
    """Bundle of the query columns needed to rebuild `DatasetRef` objects
    from result rows.

    Iterating over an instance yields ``id`` and then ``runKey``;
    ``ingestDate`` is not part of the iteration.
    """

    datasetType: DatasetType
    """Dataset type that the query is searching for (`DatasetType`).
    """

    id: ColumnElement
    """Column holding the dataset's unique integer ID.
    """

    runKey: ColumnElement
    """Column holding the foreign key to the `~CollectionType.RUN` collection
    that contains the dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column holding the ingest timestamp, or `None`.  It is not part of
    `DatasetRef`, but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield from (self.id, self.runKey)

272 

273 

@dataclass
class QueryColumns:
    """Container for the columns of a query that is being built or executed.

    The constructor takes no arguments; the container attributes start out
    empty and are expected to be filled in incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Primary-key columns for each dimension
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Every list holds columns, taken from several tables, that all refer to
    the same dimension; the query should constrain them to have equal values.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Timespan columns for the elements taking part in a temporal join or
    filter (`NamedKeyDict` mapping `DimensionElement` to
    `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Region columns for the elements taking part in a spatial join or
    filter (`NamedKeyDict` mapping `DimensionElement` to `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns from which `DatasetRef` instances can be built out of query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.

        That is the case when ``keys``, ``timespans`` and ``regions`` are all
        empty and ``datasets`` is `None`.
        """
        return (
            not self.keys
            and not self.timespans
            and not self.regions
            and self.datasets is None
        )

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        Which column is picked is an implementation detail, but the choice is
        deterministic and consistent across calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        candidates = self.keys[dimension]
        # The last entry is returned purely for the benefit of people reading
        # the generated query (e.g. while debugging): it is more likely to
        # come from the dimension's own table, or failing that a closely
        # related dimension, than from something like a dataset subquery.
        # The database does not care which one is used, because the query
        # guarantees that all of them hold equal values.
        return candidates[-1]

353 

354 

@dataclass
class RegistryManagers:
    """Struct bundling the manager objects that back a `Registry`, so they
    can be handed around together inside the query system.
    """

    collections: CollectionManager
    """Manager object for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager object for datasets and dataset types
    (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager object for dimensions (`DimensionRecordStorageManager`).
    """