Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 33%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

142 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CheckVisitor", 

25 "InspectionVisitor", 

26 "InspectionSummary", 

27) 

28 

29import dataclasses 

30from typing import TYPE_CHECKING, AbstractSet, List, Optional, Sequence, Set, Tuple 

31 

32from ....core import ( 

33 DataCoordinate, 

34 Dimension, 

35 DimensionElement, 

36 DimensionGraph, 

37 DimensionUniverse, 

38 GovernorDimension, 

39 NamedKeyDict, 

40 NamedValueSet, 

41) 

42from ...summaries import GovernorDimensionRestriction 

43from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

44from .normalForm import NormalForm, NormalFormVisitor 

45from .parser import Node, TreeVisitor 

46 

47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 import astropy.time 

49 

50 

@dataclasses.dataclass
class InspectionSummary:
    """Common base for the result objects that `CheckVisitor` and
    `InspectionVisitor` fill in while walking a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self`` (in place).

        Parameters
        ----------
        other : `InspectionSummary`
            The summary whose contents are absorbed; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for table, referenced in other.columns.items():
            # Create the per-table column set on first sight, then extend it.
            known = self.columns.setdefault(table, set())
            known.update(referenced)
        # Once the flag is set it stays set; otherwise adopt the other's flag.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

84 

85 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object used by `InspectionVisitor` to gather information about
    a parsed expression.

    Notes
    -----
    On top of what `InspectionSummary` tracks, this class carries the
    attributes needed to recognize dimension equivalence expressions
    (e.g. "tract=4") when they appear in simple contexts (surrounded only by
    ANDs and ORs).  When `InspectionVisitor` is used on its own (i.e. when
    ``check=False`` in the query code) these attributes do nothing, but they
    are cheap.  `CheckVisitor` relies on them when it delegates to
    `InspectionVisitor` to learn which governor dimension values are set in
    a branch of the normal-form expression.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being combined
            via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": complete the pair with the value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": complete the pair with the key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value pair.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        hasKey = self.dataIdKey is not None
        hasValue = self.dataIdValue is not None
        return hasKey and not hasValue

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        hasKey = self.dataIdKey is not None
        hasValue = self.dataIdValue is not None
        return hasValue and not hasKey

152 

153 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before any SQLAlchemy WHERE clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    """

    def __init__(self, universe: DimensionUniverse, bindKeys: AbstractSet[str]):
        self.bindKeys = bindKeys
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal may turn out to be the value half of "<key>=<value>".
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal may turn out to be the value half of "<key>=<value>".
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound parameters reference nothing in the database.
        if name in self.bindKeys:
            return TreeSummary()
        # Special constants come next; neither references a dimension.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        # Everything else must name a dimension element, optionally with a
        # column.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier stands for the dimension's own key, so it
        # is a candidate data ID key.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" is allowed to form a "<key>=<value>" pair.
        isEquality = operator == "="
        return lhs.merge(rhs, isEq=isEquality)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merged without isEq, so an IN term never yields a key-value pair.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()

238 

239 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Result object `CheckVisitor` uses for one inner group of AND'd
    together expression branches: it collects the dimensions and tables they
    reference so they can be checked for consistency and completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Mapping containing the values of all governor dimensions that are
    equated with literal values in this expression branch.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

259 

260 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Result object `CheckVisitor` uses for the entire expression: it
    collects referenced dimensions, tables, and governor dimension values.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(
        default_factory=GovernorDimensionRestriction.makeFull,
    )
    """Mapping containing all values that appear in this expression for
    governor dimension relevant to the query.

    Governor dimensions that are absent from this dict are not constrained by
    this expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

284 

285 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    defaults : `DataCoordinate`
        A data ID containing default for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    former is expanded to cover every dimension the expression references,
    the latter is extended with any default governor values that were needed.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bindKeys: AbstractSet[str],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.bindKeys = bindKeys
        self.defaults = defaults
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bindKeys)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.governors.update((k, self.dataId[k]) for k in self.dataId.graph.governors)  # type: ignore
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; identifiers are resolved by categorizeElementId
            # in InspectionVisitor.visitIdentifier, which can report a
            # dimension with no column).
            # If so, and it's a governor dimension, remember that we've
            # constrained it on this branch, and make sure it's consistent
            # with any other constraints on any other branches it's AND'd with.
            if isinstance(branch.dataIdKey, GovernorDimension) and branch.dataIdValue is not None:
                governor = branch.dataIdKey
                # setdefault returns the previously recorded value, if any.
                value = summary.governors.setdefault(governor, branch.dataIdValue)
                if value != branch.dataIdValue:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if governor in self.dataId:
                        raise RuntimeError(
                            f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                            f"and data ID with {governor.name}={value!r}."
                        )
                    else:
                        raise RuntimeError(
                            f"Conflicting literal values for {governor.name} in expression: "
                            f"{value!r} != {branch.dataIdValue!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors)
        if not governorsNeededInBranch.issubset(summary.governors.keys()):
            missing = NamedValueSet(governorsNeededInBranch - summary.governors.keys())
            if missing <= self.defaults.keys():
                # Everything missing can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            else:
                raise RuntimeError(
                    f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # Iterate over branches in first pass to gather all dimensions and
        # columns referenced.  This aggregation is for the full query, so we
        # don't care whether things are joined by AND or OR (or + or -, etc).
        summary = OuterSummary()
        for branch in branches:
            summary.update(branch)
            summary.governors.update(branch.governors)
            summary.defaultsNeeded.update(branch.defaultsNeeded)
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Second pass: compare the explicit governor values of *every* branch
        # against the defaults needed by any branch.  This has to be a real
        # loop over the branches; reusing the loop variable left over from the
        # first pass would look only at the last branch (making the check
        # depend on branch order) and would fail outright when there are no
        # branches at all.
        for branch in branches:
            for governor, value in branch.governors.items():
                if governor in summary.defaultsNeeded:
                    # One branch contained an explicit value for this
                    # dimension while another needed to refer to the default
                    # data ID.  Even if these refer to the same value, that
                    # inconsistency probably indicates user error.
                    raise RuntimeError(
                        f"Governor dimension {governor.name} is explicitly "
                        f"constrained to {value} in one or more branches of "
                        "this query where expression, but is left to default "
                        f"to {self.defaults[governor]!r} in another branch.  "
                        "Defaults and explicit constraints cannot be mixed."
                    )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            assert self.dataId.hasRecords(), (
                "Should be a union of two data IDs with records, "
                "in which one only adds governor dimension values."
            )
            summary.governors.intersection_update(
                # We know the value for a governor dimension is always a str,
                # and that's all self.defaults should contain, but MyPy doesn't
                # know that.
                {dimension: self.defaults[dimension] for dimension in summary.defaultsNeeded}  # type: ignore
            )
        return summary