Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 31%

153 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-10-07 09:47 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CheckVisitor", 

25 "InspectionVisitor", 

26 "InspectionSummary", 

27) 

28 

29import dataclasses 

30from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Sequence, Set, Tuple 

31 

32from ....core import ( 

33 DataCoordinate, 

34 DataIdValue, 

35 Dimension, 

36 DimensionElement, 

37 DimensionGraph, 

38 DimensionUniverse, 

39 NamedKeyDict, 

40 NamedValueSet, 

41) 

42from ..._exceptions import UserExpressionError 

43from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

44from .normalForm import NormalForm, NormalFormVisitor 

45from .parser import Node, TreeVisitor 

46 

47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 import astropy.time 

49 

50 

@dataclasses.dataclass
class InspectionSummary:
    """Common state accumulated by `CheckVisitor` and `InspectionVisitor`
    while they walk a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions referenced in this branch, either directly through their
    primary keys or through their dependencies
    (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Names of the columns referenced in this branch, grouped by the
    dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the special dataset ingest date identifier appears in this
    expression (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Create the per-element set on first sight, then extend it.
            known = self.columns.setdefault(element, set())
            known.update(names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

84 

85 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one branch of a parsed
    expression.

    Notes
    -----
    On top of the base summary this tracks whether the branch is a simple
    dimension equivalence (e.g. "tract=4").  `InspectionVisitor` itself does
    nothing with that information, and maintaining it is cheap.  `CheckVisitor`
    relies on it when it delegates to `InspectionVisitor`, to learn which
    dimension values a branch of the normal-form expression pins down.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is fully identified by a literal value in this
    branch (when `dataIdValue` is not `None`), or that could be (when
    `dataIdValue` is `None`).
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains a dimension in this branch (when
    `dataIdKey` is not `None`), or that could (when `dataIdKey` is `None`).

    This is always a `str` or `None`; it may have to be coerced to `int` to
    reflect what the user actually meant.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` into ``self`` so that ``self`` summarizes both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            `True` when the two branches are the operands of an equality
            operator; `False` (the default) for any other combination.

        Returns
        -------
        self : `TreeSummary`
            This object, updated in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": complete the pair with the value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": complete the pair with the key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Anything else is not a simple key-value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may serve
        as the value of a data ID key-value pair.
        """
        return self.dataIdValue is not None and self.dataIdKey is None

152 

153 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Notes
    -----
    Several ``visit*`` methods update one of the summaries they are given in
    place and return it, rather than building a new object.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal may turn out to be the value of a data ID pair.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal may turn out to be the value of a data ID pair.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers stand in for literal values and take precedence
        # over everything else.
        if name in self.bind:
            return TreeSummary(dataIdValue=self.bind[name])
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # The identifier names a dimension itself, so it may be the key
            # of a data ID key-value pair.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            # The identifier names a column of a dimension element table.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # A unary operator changes the meaning of its operand: "NOT
        # instrument='HSC'" does not constrain instrument to 'HSC', and "-4"
        # is not the literal "4".  The result therefore can no longer be
        # treated as a data ID key, value, or key-value pair; only the
        # referenced dimensions and columns carry over.
        operand.dataIdKey = None
        operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can form a data ID key-value pair; merge() clears the
        # key/value for every other operator.
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # IN terms never count as data ID constraints, so merge without isEq.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()

238 

239 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together, used to collect the dimensions and
    tables they reference and to check them for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of every dimension that this expression branch equates with a
    literal value, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

259 

260 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the expression as a whole,
    collecting the dimensions, tables, and governor dimension values it
    references.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    A dimension with no entry in this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

282 

283 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``dataId`` and ``graph`` attributes: the
    former gains any default governor values the expression needed, and the
    latter is expanded to cover dimensions referenced only by the expression.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').  If so,
            # remember that we've constrained it on this branch to later make
            # sure it's consistent with any other constraints on any other
            # branches it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                # Literals arrive as strings; coerce to the dimension's
                # primary key type so comparisons are meaningful.
                new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if branch.dataIdKey in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                            f"and data ID with {branch.dataIdKey.name}={value!r}."
                        )
                    else:
                        # Report the coerced value so both sides of the
                        # message have the same type as what was compared.
                        raise UserExpressionError(
                            f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                            f"{value!r} != {new_value!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        if not governorsNeededInBranch.issubset(summary.dimension_values.keys()):
            missing = governorsNeededInBranch - summary.dimension_values.keys()
            if missing <= self.defaults.names:
                summary.defaultsNeeded.update(missing)
            else:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds.
            summary.dimension_constraints.update(
                {dimension: set() for dimension in dimensions_in_all_branches}
            )
            for dim in dimensions_in_all_branches:
                for branch in branches:
                    summary.dimension_constraints[dim].add(branch.dimension_values[dim])
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Reject expressions where one branch contained an explicit value for
        # a governor dimension while another needed to refer to the default
        # data ID.  Even if these refer to the same value, that inconsistency
        # probably indicates user error.
        #
        # A dimension that needs a default is by construction absent from at
        # least one branch's dimension_values, so it can never appear in
        # dimension_constraints (which only holds dimensions present in
        # _every_ branch); we have to look at the individual branches instead.
        for dimension in sorted(summary.defaultsNeeded):
            explicit_values = {
                branch.dimension_values[dimension]
                for branch in branches
                if dimension in branch.dimension_values
            }
            if explicit_values:
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {explicit_values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary