Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 34%

149 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-08-31 04:05 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CheckVisitor", 

25 "InspectionVisitor", 

26 "InspectionSummary", 

27) 

28 

29import dataclasses 

30from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Sequence, Set, Tuple 

31 

32from ....core import ( 

33 DataCoordinate, 

34 Dimension, 

35 DimensionElement, 

36 DimensionGraph, 

37 DimensionUniverse, 

38 GovernorDimension, 

39 NamedKeyDict, 

40 NamedValueSet, 

41) 

42from ..._exceptions import UserExpressionError 

43from ...summaries import GovernorDimensionRestriction 

44from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

45from .normalForm import NormalForm, NormalFormVisitor 

46from .parser import Node, TreeVisitor 

47 

48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true

49 import astropy.time 

50 

51 

@dataclasses.dataclass
class InspectionSummary:
    """Shared base for the result objects that `CheckVisitor` and
    `InspectionVisitor` use to record what a parsed expression references.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Every dimension whose primary key or dependencies are referenced
    anywhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Names of the columns referenced anywhere in this branch, grouped by
    the dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``, in place.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one.  It is only read,
            never modified.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Create the per-element set on first sight, then extend it.
            known = self.columns.setdefault(element, set())
            known.update(names)
        # Once the flag is set on ``self`` it stays as it is; otherwise it
        # takes whatever value ``other`` carries.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

85 

86 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one branch of a parsed
    expression tree.

    Notes
    -----
    On top of the base-class bookkeeping, this class tracks whether the
    branch is a plain dimension equivalence such as ``tract=4``: a data ID
    key on one side of ``=`` and a literal on the other.  `CheckVisitor`
    reads `dataIdKey` and `dataIdValue` from the branches it receives to
    learn which governor dimension values a branch pins down.  When
    `InspectionVisitor` is used on its own the two attributes are maintained
    but nothing consumes them.
    """

    dataIdKey: Optional[Dimension] = None
    """The `Dimension` that this branch identifies with a literal value (when
    `dataIdValue` is also set) or that it may turn out to identify (when
    `dataIdValue` is `None`).
    """

    dataIdValue: Optional[str] = None
    """The literal that constrains a dimension in this branch (when
    `dataIdKey` is also set) or that may turn out to constrain one (when
    `dataIdKey` is `None`).

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Report whether this branch is a bare data ID key identifier, with
        no literal value attached yet.
        """
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Report whether this branch is a bare literal that could serve as
        the value of a data ID key-value pair, with no key attached yet.
        """
        return self.dataIdValue is not None and self.dataIdKey is None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            Set to `True` when the two branches are the operands of the
            equality operator.  Defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <literal>": adopt the literal as this key's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<literal> = <key>": the same pairing, written the other way.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value pair, so the
        # merged branch carries neither a key nor a value.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

153 

154 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """`TreeVisitor` implementation that walks a parsed expression and
    reports the dimension elements a query has to include, before any
    SQLAlchemy WHERE clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they stand in for.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.bind = bind
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A numeric literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A string literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals are never recorded as data ID values.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers behave like literals carrying the bound value.
        if name in self.bind:
            return TreeSummary(dataIdValue=self.bind[name])
        # Next, the special constants recognized by categorizeConstant.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        # Anything else must name a dimension element, optionally with one
        # of its columns.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier is a dimension key, which may later be
        # paired with a literal to form a data ID key-value pair.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operand's summary is passed through unchanged.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" is allowed to pair a data ID key with a literal value.
        equating = operator == "="
        return lhs.merge(rhs, isEq=equating)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Fold every candidate value into the left-hand summary in place.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        # The inner expression's summary is passed through unchanged.
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # Start from an empty summary and accumulate each item into it.
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        # A range literal contributes nothing to the summary.
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of the two coordinates are discarded.
        return TreeSummary()

239 

240 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of expression
    branches that are AND'd together, collecting the dimensions and tables
    they reference so the group can be checked for consistency and
    completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Value of each governor dimension that this expression branch equates
    with a literal value.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions that the query needs a value for, that the query
    itself does not provide, and that the default data ID does provide.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

260 

261 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole,
    collecting the dimensions, tables and governor dimension values that it
    references.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(
        default_factory=GovernorDimensionRestriction.makeFull,
    )
    """All values that appear in this expression for the governor dimensions
    relevant to the query.

    A governor dimension that is absent from this mapping is not constrained
    by this expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions that the query needs a value for, that the query
    itself does not provide, and that the default data ID does provide.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

285 

286 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """`NormalFormVisitor` implementation that works out which dimensions and
    tables a query must include, checking the expression for completeness
    and consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they stand in for.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    `visitOuter` may replace both the ``graph`` and ``dataId`` attributes
    with expanded versions, so callers should read them back after the
    visit rather than relying on the objects passed in.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.defaults = defaults
        self.graph = graph
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch here constrains all the others and they have to
        # agree with each other.
        assert form is NormalForm.DISJUNCTIVE
        # The outer branches are OR'd together, so no other inner group can
        # supply something this group lacks: a governor dimension value
        # (such as the instrument or skymap needed to interpret a visit or
        # tract number) that is missing here really is missing.
        #
        # The exception is the data ID given to the constructor.  It is
        # AND'd to the whole expression later and therefore applies to every
        # branch, so its governor values are recorded up front.
        summary = InnerSummary()
        fromDataId = ((k, self.dataId[k]) for k in self.dataId.graph.governors)
        summary.governors.update(fromDataId)  # type: ignore
        for branch in branches:
            # Accumulate every dimension and column seen anywhere in the
            # group, whatever context it appeared in.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or an
            # equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4') that name
            # a governor dimension need further checking.
            if branch.dataIdValue is None or not isinstance(branch.dataIdKey, GovernorDimension):
                continue
            governor = branch.dataIdKey
            # Record the constraint, or fetch the one already recorded so
            # that it can be compared with what this branch says.
            value = summary.governors.setdefault(governor, branch.dataIdValue)
            if value != branch.dataIdValue:
                # The expression says something like "instrument='HSC' AND
                # instrument='DECam'", or the data ID has one value and the
                # expression has the other.
                if governor in self.dataId:
                    raise UserExpressionError(
                        f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                        f"and data ID with {governor.name}={value!r}."
                    )
                raise UserExpressionError(
                    f"Conflicting literal values for {governor.name} in expression: "
                    f"{value!r} != {branch.dataIdValue!r}."
                )
        # With the constrained governors known, look for missing ones, i.e.
        # an expression containing something like "visit=X" that never says
        # which instrument the visit belongs to.  This catches many
        # accidents, though it also rejects possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  Asking the
        # user to name the instruments explicitly is a reasonable way
        # around that restriction.  Note that an expression like
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #         OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and the logic here
        # still works (which is why the expression is converted to normal
        # form in the first place).
        governorsNeededInBranch: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors)
        if governorsNeededInBranch.issubset(summary.governors.keys()):
            # Every governor this group depends on is already constrained.
            return summary
        missing = NamedValueSet(governorsNeededInBranch - summary.governors.keys())
        if not (missing <= self.defaults.keys()):
            raise UserExpressionError(
                f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                "that references dependent dimensions. 'Governor' dimensions must always be specified "
                "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                "terms) or in a data ID passed to the query method."
            )
        # Everything that is missing can be filled in from the defaults.
        summary.defaultsNeeded.update(missing)
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # First pass: gather every dimension and column referenced.  This
        # aggregate describes the full query, so it does not matter whether
        # things were joined by AND or OR (or + or -, etc).
        summary = OuterSummary()
        if branches:
            # An OR of branch constraints is built up from an empty
            # selection.
            summary.governors = GovernorDimensionRestriction.makeEmpty(self.graph.universe)
            for branch in branches:
                summary.update(branch)
                summary.governors = summary.governors.union(branch.governors)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
        # If the expression references dimensions outside the original query
        # graph, widen the graph to include them.  This is what lets a user
        # say "tract=X" on the command line (well, "skymap=Y AND tract=X" -
        # logic in visitInner checks for that) when running a task like ISR
        # that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            widened = summary.dimensions | self.graph.dimensions
            self.graph = DimensionGraph(self.graph.universe, dimensions=widened)
        for governor, values in summary.governors.items():
            if governor not in summary.defaultsNeeded:
                continue
            # One branch gave an explicit value for this dimension while
            # another had to fall back on the default data ID.  Even if
            # both refer to the same value, that inconsistency probably
            # indicates user error.
            raise UserExpressionError(
                f"Governor dimension {governor.name} is explicitly "
                f"constrained to {values} in one or more branches of "
                "this query where expression, but is left to default "
                f"to {self.defaults[governor]!r} in another branch. "
                "Defaults and explicit constraints cannot be mixed."
            )
        if not summary.defaultsNeeded:
            return summary
        # Default data ID values were needed: merge them into self.dataId,
        # then narrow the governor restriction to those same values.
        defaultsNeededGraph = DimensionGraph(self.graph.universe, summary.defaultsNeeded)
        self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
        assert self.dataId.hasRecords(), (
            "Should be a union of two data IDs with records, "
            "in which one only adds governor dimension values."
        )
        # We know the value for a governor dimension is always a str, and
        # that's all self.defaults should contain, but MyPy doesn't know
        # that.
        defaultValues = {dimension: self.defaults[dimension] for dimension in summary.defaultsNeeded}
        summary.governors.intersection_update(defaultValues)  # type: ignore
        return summary
456 return summary