Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 30%

159 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-10-28 09:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

# Names exported by ``from <this module> import *``.  Only the two visitors
# and the summary base class are listed; the other summary classes defined
# in this file are not exported.
__all__ = (
    "CheckVisitor",
    "InspectionVisitor",
    "InspectionSummary",
)

28 

29import dataclasses 

30from collections.abc import Mapping, Sequence, Set 

31from typing import TYPE_CHECKING, Any, List, Optional, Tuple 

32 

33from ....core import ( 

34 DataCoordinate, 

35 DataIdValue, 

36 Dimension, 

37 DimensionElement, 

38 DimensionGraph, 

39 DimensionUniverse, 

40 NamedKeyDict, 

41 NamedValueSet, 

42) 

43from ..._exceptions import UserExpressionError 

44from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

45from .normalForm import NormalForm, NormalFormVisitor 

46from .parser import Node, TreeVisitor 

47 

48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true

49 import astropy.time 

50 

51 

@dataclasses.dataclass
class InspectionSummary:
    """Common state shared by the summary objects that `CheckVisitor` and
    `InspectionVisitor` build while walking a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Every dimension whose primary key or dependencies were referenced
    somewhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Names of the columns referenced in this branch, grouped by the
    dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            The summary whose contents are merged in.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Create the per-element set on first sight, then extend it.
            seen = self.columns.setdefault(element, set())
            seen.update(names)
        # Once the flag is set on ``self`` it stays set; otherwise adopt
        # whatever ``other`` reports.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

85 

86 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one parsed expression
    (sub)tree.

    Notes
    -----
    On top of what `InspectionSummary` tracks, this class carries the two
    attributes needed to recognize dimension equivalence expressions
    (e.g. "tract=4") that appear in simple contexts (surrounded only by ANDs
    and ORs). When `InspectionVisitor` runs on its own (i.e. when
    ``check=False`` in the query code) they are unused but cheap.
    `CheckVisitor` relies on them, via `InspectionVisitor`, to learn which
    governor dimension values are set in a branch of the normal-form
    expression.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            The summary of the other branch.
        isEq : `bool`, optional
            `True` when the two branches are the operands of the equality
            operator; `False` (the default) for any other combination.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        # The key/value pair survives only for "<key> = <value>" or
        # "<value> = <key>"; every other combination clears both.
        key: Optional[Dimension] = None
        value: Optional[str] = None
        if isEq:
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                key, value = self.dataIdKey, other.dataIdValue
            elif self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                key, value = other.dataIdKey, self.dataIdValue
        self.dataIdKey = key
        self.dataIdValue = value
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Report whether this branch holds a data ID key and no value."""
        has_key = self.dataIdKey is not None
        return has_key and self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Report whether this branch holds a candidate data ID value and no
        key.
        """
        lacks_key = self.dataIdKey is None
        return lacks_key and self.dataIdValue is not None

153 

154 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that walks a parsed expression and reports which
    dimension elements a query must include, before any SQLAlchemy WHERE
    clause is built from that expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe, self.bind = universe, bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A numeric literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A string literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals contribute nothing to the summary.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers take precedence over everything else.
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, (list, tuple, Set)):
                return TreeSummary(dataIdValue=bound)
            # A collection can show up on the rhs of the IN operator; when
            # it holds exactly one element, use that element as the value.
            if len(bound) == 1:
                return TreeSummary(dataIdValue=next(iter(bound)))
            return TreeSummary()

        # Special constants are handled next.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."

        # Anything else is a dimension element, optionally with a column.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operand's summary is passed through unchanged.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Each rhs value is merged into lhs as a non-equality combination.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        # Parentheses do not change the summary.
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # Accumulate every item into a fresh summary.
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        # Range literals contribute nothing to the summary.
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The ra/dec summaries are not propagated; an empty one is returned.
        return TreeSummary()

248 

249 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of AND'd
    together expression branches: the dimensions and tables the group
    references, checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of the dimensions that are equated with literal values in this
    expression branch, keyed by dimension name.

    `CheckVisitor.visitInner` also seeds this mapping with the values from
    the data ID the visitor was constructed with.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

269 

270 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the entire expression: the
    dimensions and tables it references, plus the governor dimension values
    it involves.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    A dimension that is missing from this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

292 

293 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    Visiting an expression may replace the ``dataId`` and ``graph``
    attributes: `visitOuter` extends ``graph`` with any extra dimensions the
    expression references and merges any needed default values into
    ``dataId``.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').  If so,
            # remember that we've constrained it on this branch to later make
            # sure it's consistent with any other constraints on any other
            # branches it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                # Coerce the literal to the Python type of the dimension's
                # primary key before comparing.
                new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if branch.dataIdKey in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                            f"and data ID with {branch.dataIdKey.name}={value!r}."
                        )
                    else:
                        raise UserExpressionError(
                            f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                            f"{value!r} != {branch.dataIdValue!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        if not governorsNeededInBranch.issubset(summary.dimension_values.keys()):
            missing = governorsNeededInBranch - summary.dimension_values.keys()
            if missing <= self.defaults.names:
                # Everything missing can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            else:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds.
            summary.dimension_constraints.update(
                {dimension: set() for dimension in dimensions_in_all_branches}
            )
            for dim in dimensions_in_all_branches:
                for branch in branches:
                    summary.dimension_constraints[dim].add(branch.dimension_values[dim])
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Reject expressions in which one branch contains an explicit value
        # for a governor dimension while another needs to refer to the default
        # data ID.  Even if these refer to the same value, that inconsistency
        # probably indicates user error.
        #
        # This has to look at the per-branch values directly: a dimension that
        # needs a default in some branch is by construction absent from that
        # branch's ``dimension_values``, so it can never be a key of
        # ``summary.dimension_constraints`` (which only holds dimensions
        # present in every branch).
        for dimension in sorted(summary.defaultsNeeded):
            values = {
                branch.dimension_values[dimension]
                for branch in branches
                if dimension in branch.dimension_values
            }
            if values:
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary