Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 33%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

142 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CheckVisitor", 

25 "InspectionVisitor", 

26 "InspectionSummary", 

27) 

28 

29import dataclasses 

30from typing import ( 

31 AbstractSet, 

32 List, 

33 Optional, 

34 Sequence, 

35 Set, 

36 Tuple, 

37 TYPE_CHECKING, 

38) 

39 

40from ....core import ( 

41 DataCoordinate, 

42 DimensionUniverse, 

43 Dimension, 

44 DimensionElement, 

45 DimensionGraph, 

46 GovernorDimension, 

47 NamedKeyDict, 

48 NamedValueSet, 

49) 

50from ...summaries import GovernorDimensionRestriction 

51from .parser import Node, TreeVisitor 

52from .normalForm import NormalForm, NormalFormVisitor 

53from .categorize import categorizeElementId, categorizeConstant, ExpressionConstant 

54 

55if TYPE_CHECKING: 55 ↛ 56line 55 didn't jump to line 56, because the condition on line 55 was never true

56 import astropy.time 

57 

58 

@dataclasses.dataclass
class InspectionSummary:
    """Common base for the result objects that `CheckVisitor` and
    `InspectionVisitor` populate while walking a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Every dimension whose primary key or dependencies were referenced
    somewhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Names of the columns referenced in this branch, grouped by the
    dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self`` (in place).

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents should be added to this one.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            seen = self.columns.setdefault(element, set())
            seen.update(names)
        # Equivalent to ``self.hasIngestDate or other.hasIngestDate``: the
        # flag only ever changes when it is not already set.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

92 

93 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one branch of a parsed
    expression tree.

    Notes
    -----
    On top of the base-class bookkeeping, this class tracks whether the
    branch is a simple dimension equivalence (e.g. "tract=4").  That is what
    lets `CheckVisitor`, which delegates to `InspectionVisitor`, find out
    which governor dimension values a branch of the normal-form expression
    fixes.  When `InspectionVisitor` is used by itself (``check=False`` in
    the query code) the extra attributes are unused, but they are cheap to
    maintain.
    """

    dataIdKey: Optional[Dimension] = None
    """The `Dimension` that this branch identifies with a literal value (when
    `dataIdValue` is not `None`) or could identify once combined with a
    literal (when `dataIdValue` is `None`).
    """

    dataIdValue: Optional[str] = None
    """The literal that constrains a dimension in this branch (when
    `dataIdKey` is not `None`) or could constrain one once combined with a
    key (when `dataIdKey` is `None`).

    Always stored as `str` or `None`; it may have to be coerced to `int` to
    match what the user actually meant.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Return `True` if this branch is nothing but a data ID key
        identifier.
        """
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Return `True` if this branch is nothing but a literal that could
        serve as the value of a data ID key-value pair.
        """
        return self.dataIdKey is None and self.dataIdValue is not None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the branch being combined with this one.
        isEq : `bool`, optional
            Pass `True` when the two branches are the operands of an
            equality operator.  Defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": pick up the value from the other side.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": pick up the key from the other side.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

161 

162 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before any SQLAlchemy WHERE clause is built from the
    expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that stand for bound parameter values and therefore do
        not have to correspond to in-database entities.
    """

    def __init__(self, universe: DimensionUniverse, bindKeys: AbstractSet[str]):
        self.bindKeys = bindKeys
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal may turn out to be the value of a data ID pair.
        summary = TreeSummary(dataIdValue=value)
        return summary

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal may turn out to be the value of a data ID pair.
        summary = TreeSummary(dataIdValue=value)
        return summary

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound parameters reference nothing in the database.
        if name in self.bindKeys:
            return TreeSummary()
        # Special constants come next.
        kind = categorizeConstant(name)
        if kind is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if kind is ExpressionConstant.NULL:
            return TreeSummary()
        assert kind is None, "Enum variant conditionals should be exhaustive."
        # Anything else must name a dimension element, optionally with a
        # column of that element's table.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node
                     ) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operator itself references nothing; pass the operand through.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary,
                      node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        isEquality = operator == "="
        return lhs.merge(rhs, isEq=isEquality)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool,
                  node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq means an IN term never counts as a simple
        # key-value equivalence.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node
                          ) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()

251 

252 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of expression
    branches that are AND'd together, collecting the dimensions and tables
    they reference so they can be checked for consistency and completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Values of every governor dimension that this expression branch
    equates with a literal, keyed by governor dimension.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions that the query needs a value for, that the query
    itself does not supply, and that the default data ID does contain.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

272 

273 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole,
    collecting the referenced dimensions, tables, and governor dimension
    values.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(
        default_factory=GovernorDimensionRestriction.makeFull,
    )
    """All values that this expression mentions for the governor dimensions
    relevant to the query.

    A governor dimension with no entry here is not constrained by this
    expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions that the query needs a value for, that the query
    itself does not supply, and that the default data ID does contain.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

297 

298 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``graph`` attribute (to add dimensions that
    the expression references) and the ``dataId`` attribute (to add governor
    values taken from ``defaults``), so callers should read both back from the
    visitor after the expression has been visited.

    `visitInner` and `visitOuter` raise `RuntimeError` when the expression is
    inconsistent (conflicting governor values), incomplete (a needed governor
    value is neither given nor available as a default), or mixes explicit
    governor constraints with defaulted ones.
    """

    def __init__(self, dataId: DataCoordinate, graph: DimensionGraph, bindKeys: AbstractSet[str],
                 defaults: DataCoordinate):
        self.dataId = dataId
        self.graph = graph
        self.bindKeys = bindKeys
        self.defaults = defaults
        # Individual (non-normal-form) branches are summarized by a plain
        # InspectionVisitor that shares our universe and bind keys.
        self._branchVisitor = InspectionVisitor(dataId.universe, bindKeys)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.governors.update((k, self.dataId[k]) for k in self.dataId.graph.governors)  # type: ignore
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').
            # If so, and it's a governor dimension, remember that we've
            # constrained it on this branch, and make sure it's consistent
            # with any other constraints on any other branches it's AND'd with.
            if isinstance(branch.dataIdKey, GovernorDimension) and branch.dataIdValue is not None:
                governor = branch.dataIdKey
                value = summary.governors.setdefault(governor, branch.dataIdValue)
                if value != branch.dataIdValue:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if governor in self.dataId:
                        raise RuntimeError(
                            f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                            f"and data ID with {governor.name}={value!r}."
                        )
                    else:
                        raise RuntimeError(
                            f"Conflicting literal values for {governor.name} in expression: "
                            f"{value!r} != {branch.dataIdValue!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors)
        if not governorsNeededInBranch.issubset(summary.governors.keys()):
            missing = NamedValueSet(governorsNeededInBranch - summary.governors.keys())
            if missing <= self.defaults.keys():
                # Everything that is missing can be filled in from the
                # defaults; record that so visitOuter can apply them.
                summary.defaultsNeeded.update(missing)
            else:
                raise RuntimeError(
                    f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # Iterate over branches in first pass to gather all dimensions and
        # columns referenced.  This aggregation is for the full query, so we
        # don't care whether things are joined by AND or OR (or + or -, etc).
        summary = OuterSummary()
        for branch in branches:
            summary.update(branch)
            summary.governors.update(branch.governors)
            summary.defaultsNeeded.update(branch.defaultsNeeded)
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Second pass: now that summary.defaultsNeeded covers *all* branches,
        # check every branch's explicit governor values against it.  This has
        # to look at each branch (not just whichever one the loop above
        # happened to finish on); otherwise whether the conflict is detected
        # would depend on the order of the OR'd terms, and an empty sequence
        # of branches would fail on an unbound loop variable.
        for branch in branches:
            for governor, value in branch.governors.items():
                if governor in summary.defaultsNeeded:
                    # One branch contained an explicit value for this
                    # dimension while another needed to refer to the default
                    # data ID.  Even if these refer to the same value, that
                    # inconsistency probably indicates user error.
                    raise RuntimeError(
                        f"Governor dimension {governor.name} is explicitly "
                        f"constrained to {value} in one or more branches of "
                        "this query where expression, but is left to default "
                        f"to {self.defaults[governor]!r} in another branch.  "
                        "Defaults and explicit constraints cannot be mixed."
                    )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            assert self.dataId.hasRecords(), (
                "Should be a union of two data IDs with records, "
                "in which one only adds governor dimension values."
            )
            summary.governors.intersection_update(
                # We know the value for a governor dimension is always a str,
                # and that's all self.defaults should contain, but MyPy doesn't
                # know that.
                {dimension: self.defaults[dimension] for dimension in summary.defaultsNeeded}  # type: ignore
            )
        return summary