Coverage for python/lsst/daf/butler/direct_query_driver/_query_plan.py: 56%

126 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 02:46 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Public names exported by this module: the query-plan structs consumed by
# DirectQueryDriver.
__all__ = (
    "QueryPlan",
    "QueryJoinsPlan",
    "QueryProjectionPlan",
    "QueryFindFirstPlan",
    "ResolvedDatasetSearch",
)

37 

38import dataclasses 

39from collections.abc import Iterator 

40from typing import Any 

41 

42from ..dimensions import DataIdValue, DimensionElement, DimensionGroup 

43from ..queries import tree as qt 

44from ..queries.visitors import ColumnExpressionVisitor, PredicateVisitFlags, SimplePredicateVisitor 

45from ..registry.interfaces import CollectionRecord 

46 

47 

@dataclasses.dataclass
class ResolvedDatasetSearch:
    """A struct describing a dataset search joined into a query, after
    resolving its collection search path.
    """

    name: str
    """Name of the dataset type."""

    dimensions: DimensionGroup
    """Dimensions of the dataset type."""

    collection_records: list[CollectionRecord] = dataclasses.field(default_factory=list)
    """Records of the collections to search for this dataset, in order, after
    removing collections inconsistent with the dataset type or the query's
    data ID constraint.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages about collections that were filtered out of
    collection records.
    """

    is_calibration_search: bool = False
    """`True` if any of the collections to be searched is a
    `~CollectionType.CALIBRATION` collection, `False` otherwise.

    Since only calibration datasets can be present in
    `~CollectionType.CALIBRATION` collections, this also implies that the
    datasets found by such a search may be calibrations.

    .. note:: NOTE(review): the original docstring was truncated after "this
       also"; the sentence above is a best-effort completion — confirm the
       intended meaning against the upstream source.
    """

78 

79 

@dataclasses.dataclass
class QueryJoinsPlan:
    """A struct describing the "joins" section of a butler query.

    See `QueryPlan` and `QueryPlan.joins` for additional information.
    """

    predicate: qt.Predicate
    """Boolean expression to apply to rows."""

    columns: qt.ColumnSet
    """All columns whose tables need to be joined into the query.

    This is updated after construction to include all columns required by
    `predicate`.
    """

    materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
    """Materializations to join into the query."""

    datasets: dict[str, ResolvedDatasetSearch] = dataclasses.field(default_factory=dict)
    """Dataset searches to join into the query."""

    data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
        default_factory=dict
    )
    """Data coordinate uploads to join into the query."""

    constraint_data_id: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """A data ID that must be consistent with all result rows, extracted from
    `predicate` at construction.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages that report reasons the query may not return any
    rows.
    """

    def __post_init__(self) -> None:
        # Make sure `columns` includes everything the predicate needs.
        self.predicate.gather_required_columns(self.columns)
        # Extract the data ID implied by the predicate; we can use the governor
        # dimensions in that to constrain the collections we search for
        # datasets later.
        self.predicate.visit(_DataIdExtractionVisitor(self.constraint_data_id, self.messages))

    def iter_mandatory(self) -> Iterator[DimensionElement]:
        """Return an iterator over the dimension elements that must be joined
        into the query.

        These elements either provide "field" (non-key) columns or define
        relationships that result rows must be consistent with. They do not
        necessarily include all dimension keys in `columns`, since each of
        those can typically be included in a query in multiple different ways.
        """
        for element_name in self.columns.dimensions.elements:
            element = self.columns.dimensions.universe[element_name]
            if self.columns.dimension_fields[element_name]:
                # We need to get dimension record fields for this element, and
                # its table is the only place to get those.
                yield element
            elif element.defines_relationships:
                # We also need to join in DimensionElement tables that define
                # one-to-many and many-to-many relationships, but data
                # coordinate uploads, materializations, and datasets can also
                # provide these relationships. Data coordinate uploads and
                # dataset tables only have required dimensions, and can hence
                # only provide relationships involving those.
                if any(
                    element.minimal_group.names <= upload_dimensions.required
                    for upload_dimensions in self.data_coordinate_uploads.values()
                ):
                    continue
                if any(
                    element.minimal_group.names <= dataset_spec.dimensions.required
                    for dataset_spec in self.datasets.values()
                ):
                    continue
                # Materializations have all key columns for their dimensions.
                # BUG FIX: the original tested ``element in
                # materialization_dimensions.names``, comparing a
                # `DimensionElement` object against a set of `str` names, which
                # can never match; compare by name against the group's element
                # names instead, consistent with the name-based checks above.
                if any(
                    element.name in materialization_dimensions.elements
                    for materialization_dimensions in self.materializations.values()
                ):
                    continue
                yield element

164 

165 

@dataclasses.dataclass
class QueryProjectionPlan:
    """A struct describing the "projection" stage of a butler query.

    This struct evaluates to `True` in boolean contexts if either
    `needs_dimension_distinct` or `needs_dataset_distinct` are `True`. In
    other cases the projection is effectively a no-op, because the
    "joins"-stage rows are already unique.

    See `QueryPlan` and `QueryPlan.projection` for additional information.
    """

    columns: qt.ColumnSet
    """The columns present in the query after the projection is applied.

    This is always a subset of `QueryJoinsPlan.columns`.
    """

    datasets: dict[str, ResolvedDatasetSearch]
    """Dataset searches to join into the query."""

    needs_dimension_distinct: bool = False
    """If `True`, the projection's dimensions do not include all dimensions in
    the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
    used to make post-projection rows unique.
    """

    needs_dataset_distinct: bool = False
    """If `True`, the projection columns do not include collection-specific
    dataset fields that were present in the "joins" stage, and hence a SELECT
    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
    unique.
    """

    find_first_dataset: str | None = None
    """If not `None`, this is a find-first query for this dataset.

    This is set even if the find-first search is trivial because there is only
    one resolved collection.
    """

    region_aggregates: list[DimensionElement] = dataclasses.field(default_factory=list)
    """Dimension elements with spatial regions that must be aggregated by the
    projection, since their dimension keys are being dropped.

    This can only be non-empty if `needs_dimension_distinct` is `True`.
    """

    # Defined after the fields (the original interleaved it mid-class, which
    # obscured the field list); dataclass field order is unaffected.
    def __bool__(self) -> bool:
        # The projection is a real operation only when some DISTINCT/GROUP BY
        # is needed; otherwise it is a no-op.
        return self.needs_dimension_distinct or self.needs_dataset_distinct

217 

@dataclasses.dataclass
class QueryFindFirstPlan:
    """A struct describing the "find-first" stage of a butler query.

    See `QueryPlan` and `QueryPlan.find_first` for additional information.
    """

    search: ResolvedDatasetSearch
    """Information about the dataset being searched for."""

    @property
    def dataset_type(self) -> str:
        """Name of the dataset type."""
        return self.search.name

    def __bool__(self) -> bool:
        # The find-first stage only has work to do when more than one resolved
        # collection could hold the dataset; with a single collection there is
        # nothing to rank.
        n_collections = len(self.search.collection_records)
        return n_collections > 1

235 

236 

@dataclasses.dataclass
class QueryPlan:
    """A struct that aggregates information about a complete butler query.

    Notes
    -----
    Butler queries are transformed into a combination of SQL and Python-side
    postprocessing in three stages, with each corresponding to an attribute of
    this class and a method of `DirectQueryDriver`:

    - In the `joins` stage (`~DirectQueryDriver.apply_query_joins`), we define
      the main SQL FROM and WHERE clauses, by joining all tables needed to
      bring in any columns, or constrain the keys of its rows.

    - In the `projection` stage (`~DirectQueryDriver.apply_query_projection`),
      we select only the columns needed for the query's result rows (including
      columns needed only by postprocessing and ORDER BY, as well as those
      needed by the objects returned to users). If the result rows are not
      naturally unique given what went into the query in the "joins" stage,
      the projection involves a SELECT DISTINCT [ON] or GROUP BY to make them
      unique, and in a few rare cases uses aggregate functions with GROUP BY.

    - In the `find_first` stage (`~DirectQueryDriver.apply_query_find_first`),
      we use a window function (PARTITION BY) subquery to find only the first
      dataset in the collection search path for each data ID. This stage does
      nothing if there is no find-first dataset search, or if the search is
      trivial because there is only one collection.

    In `DirectQueryDriver.build_query`, a `QueryPlan` instance is constructed
    via `DirectQueryDriver.analyze_query`, which also returns an initial
    `QueryBuilder`. After this point the plans are considered frozen, and the
    nested plan attributes are then passed to each of the corresponding
    `DirectQueryDriver` methods along with the builder, which is mutated (and
    occasionally replaced) into the complete SQL/postprocessing form of the
    query.
    """

    joins: QueryJoinsPlan
    """Description of the "joins" stage of query construction."""

    projection: QueryProjectionPlan
    """Description of the "projection" stage of query construction."""

    find_first: QueryFindFirstPlan | None
    """Description of the "find_first" stage of query construction.

    This attribute is `None` if there is no find-first search at all, and
    `False` in boolean contexts if the search is trivial because there is only
    one collection after the collections have been resolved.
    """

    final_columns: qt.ColumnSet
    """The columns included in the SELECT clause of the complete SQL query
    that is actually executed.

    This is a subset of `QueryProjectionPlan.columns` that differs only in
    columns used by the `find_first` stage or an ORDER BY expression.

    Like all other `.queries.tree.ColumnSet` attributes, it does not include
    fields added directly to `QueryBuilder.special`, which may also be added
    to the SELECT clause.
    """

298 

299 

class _DataIdExtractionVisitor(
    SimplePredicateVisitor,
    ColumnExpressionVisitor[tuple[str, None] | tuple[None, Any] | tuple[None, None]],
):
    """A column-expression visitor that extracts equality constraints on
    dimensions that are not OR'd with anything else.

    Parameters
    ----------
    data_id : `dict`
        Dictionary to populate in place.
    messages : `list` [ `str` ]
        List of diagnostic messages to populate in place.

    Notes
    -----
    The column-expression visitation methods return a 2-tuple of
    ``(dimension_name, literal_value)``, with at most one entry not `None`:
    ``(name, None)`` for a dimension-key reference, ``(None, value)`` for a
    literal, and ``(None, None)`` for anything else.
    """

    def __init__(self, data_id: dict[str, DataIdValue], messages: list[str]):
        self.data_id = data_id
        self.messages = messages

    def visit_comparison(
        self,
        a: qt.ColumnExpression,
        operator: qt.ComparisonOperator,
        b: qt.ColumnExpression,
        flags: PredicateVisitFlags,
    ) -> None:
        # A comparison that is OR'd with other terms does not constrain every
        # result row, so it cannot contribute to the data ID.
        if flags & PredicateVisitFlags.HAS_OR_SIBLINGS:
            return None
        # Under a logical NOT, only "!=" flips into a usable equality; any
        # other inverted operator is not an equality constraint.
        if flags & PredicateVisitFlags.INVERTED:
            if operator == "!=":
                operator = "=="
            else:
                return None
        if operator != "==":
            return None
        # Classify each operand as a dimension key, a literal, or neither.
        k_a, v_a = a.visit(self)
        k_b, v_b = b.visit(self)
        # Accept "dimension == literal" in either operand order.
        if k_a is not None and v_b is not None:
            key = k_a
            value = v_b
        elif k_b is not None and v_a is not None:
            key = k_b
            value = v_a
        else:
            return None
        # Record the constraint; if a different value was already recorded for
        # this dimension, the predicate is self-contradictory — report it.
        if (old := self.data_id.setdefault(key, value)) != value:
            self.messages.append(f"'where' expression requires both {key}={value!r} and {key}={old!r}.")
        return None

    def visit_binary_expression(self, expression: qt.BinaryExpression) -> tuple[None, None]:
        # Computed expressions are neither a bare key nor a literal.
        return None, None

    def visit_unary_expression(self, expression: qt.UnaryExpression) -> tuple[None, None]:
        # Computed expressions are neither a bare key nor a literal.
        return None, None

    def visit_literal(self, expression: qt.ColumnLiteral) -> tuple[None, Any]:
        return None, expression.get_literal_value()

    def visit_dimension_key_reference(self, expression: qt.DimensionKeyReference) -> tuple[str, None]:
        return expression.dimension.name, None

    def visit_dimension_field_reference(self, expression: qt.DimensionFieldReference) -> tuple[None, None]:
        # Non-key dimension record fields are not data ID values.
        return None, None

    def visit_dataset_field_reference(self, expression: qt.DatasetFieldReference) -> tuple[None, None]:
        # Dataset fields are not data ID values.
        return None, None

    def visit_reversed(self, expression: qt.Reversed) -> tuple[None, None]:
        # Reversed expressions only appear in ORDER BY terms, never in the
        # predicates this visitor walks.
        raise AssertionError("No Reversed expressions in predicates.")