Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%

169 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-28 10:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "CheckVisitor", 

25 "InspectionVisitor", 

26 "InspectionSummary", 

27) 

28 

29import dataclasses 

30from collections.abc import Mapping, Sequence, Set 

31from typing import TYPE_CHECKING, Any 

32 

33from ....core import ( 

34 DataCoordinate, 

35 DataIdValue, 

36 DatasetColumnTag, 

37 Dimension, 

38 DimensionElement, 

39 DimensionGraph, 

40 DimensionKeyColumnTag, 

41 DimensionRecordColumnTag, 

42 DimensionUniverse, 

43 NamedKeyDict, 

44 NamedValueSet, 

45) 

46from ..._exceptions import UserExpressionError 

47from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

48from .normalForm import NormalForm, NormalFormVisitor 

49from .parser import Node, TreeVisitor 

50 

51if TYPE_CHECKING: 

52 import astropy.time 

53 from lsst.daf.relation import ColumnTag 

54 

55 

@dataclasses.dataclass
class InspectionSummary:
    """Shared accumulator for what a parsed expression refers to.

    This is the base class of the summary objects that `CheckVisitor` and
    `InspectionVisitor` use to gather information about a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch, mapped to the referenced column names
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``, in place.

        Parameters
        ----------
        other : `InspectionSummary`
            The other summary object.  It is not modified.
        """
        self.dimensions.update(other.dimensions)
        for table, names in other.columns.items():
            # Column-name sets are unioned per element rather than replaced.
            self.columns.setdefault(table, set()).update(names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Build the set of `ColumnTag` objects corresponding to everything
        recorded in this summary.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Name of the dataset type to assume for unqualified dataset columns,
            or `None` to reject any such identifiers.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags: dimension-key tags for
            ``self.dimensions``, dimension-record tags for ``self.columns``,
            and an ``ingest_date`` dataset tag if `hasIngestDate` is set.

        Raises
        ------
        UserExpressionError
            Raised if the expression uses the ingest date but
            ``dataset_type_name`` is `None`.
        """
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            if dataset_type_name is None:
                raise UserExpressionError(
                    "Expression requires an ingest date, which requires exactly one dataset type."
                )
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        for table, names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(table.name, names))
        return tags

116 

117 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one branch of a parsed
    expression.

    Notes
    -----
    On top of what `InspectionSummary` records, this class tracks a single
    candidate data ID key and/or literal value, so that dimension equivalence
    expressions (e.g. "tract=4") can be recognized when they appear in simple
    contexts (surrounded only by ANDs and ORs).  When `InspectionVisitor` is
    used on its own (i.e. when ``check=False`` in the query code), these
    attributes don't do anything, but they don't cost much, either.  They are
    used by `CheckVisitor` when it delegates to `InspectionVisitor` to see
    what governor dimension values are set in a branch of the normal-form
    expression.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        has_key = self.dataIdKey is not None
        return has_key and self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        lacks_key = self.dataIdKey is None
        return lacks_key and self.dataIdValue is not None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` into ``self`` so that ``self`` summarizes both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being combined
            via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).

        Notes
        -----
        A key/value pair survives the merge only for an equality that joins a
        bare key on one side with a bare value on the other; every other
        combination resets both `dataIdKey` and `dataIdValue` to `None`.
        """
        self.update(other)
        if isEq:
            # <key> = <value>
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # <value> = <key>
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        self.dataIdKey = None
        self.dataIdValue = None
        return self

184 

185 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that identifies the dimension elements that need to be
    included in a query, prior to actually constructing a SQLAlchemy WHERE
    clause from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A numeric literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A string literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals contribute nothing to the summary.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers are checked first and behave like literals.
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, (list, tuple, Set)):
                return TreeSummary(dataIdValue=bound)
            # A collection can appear on the rhs of an IN operator; it is
            # only usable as a data ID value when it holds exactly one item.
            if len(bound) != 1:
                return TreeSummary()
            return TreeSummary(dataIdValue=next(iter(bound)))
        # Next, the special constants.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        # Anything else is categorized as a dimension element, optionally
        # paired with one of its columns.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier names the dimension itself, which makes
        # it a candidate data ID key.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operand's summary is passed through unchanged.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Every rhs item is merged as a non-equality.
        for item in values:
            lhs.merge(item)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        # Range literals contribute nothing to the summary.
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The ra/dec summaries are discarded; a fresh empty summary is used.
        return TreeSummary()

279 

280 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together.

    It collects the dimensions and tables those branches reference so they
    can be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of all dimensions that are equated with literal values in this
    expression branch, keyed by dimension name
    (`dict` [ `str`, `DataIdValue` ]).
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions whose values are needed by the query, not
    provided in the query itself, and present in the default data ID
    (`set` [ `str` ]).

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

300 

301 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the entire expression: the
    dimensions and tables it references, plus the governor dimension values
    it involves.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name
    (`dict` [ `str`, `set` [ `DataIdValue` ] ]).

    Dimensions that are absent from this dict are not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions whose values are needed by the query, not
    provided in the query itself, and present in the default data ID
    (`set` [ `str` ]).

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

323 

324 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """A `NormalFormVisitor` that works out which dimensions and tables a
    query needs, while checking the expression for completeness and
    consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument).  Should be left to default to `False` in
        essentially all new code.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    former grows to include every dimension the expression references, and
    the latter gains any default governor values that turned out to be
    needed.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        # ``bind`` is not kept here; it is only handed to the per-branch
        # visitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self._allow_orphans = allow_orphans

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch handled here constrains the others and they must
        # all agree.  The outer branches are OR'd, so a governor value (such
        # as the instrument or skymap needed to interpret a visit or tract
        # number) that is absent from this group really is absent: no other
        # inner group can constrain it.
        assert form is NormalForm.DISJUNCTIVE
        summary = InnerSummary()
        # The one exception is the data ID given at construction.  It is
        # AND'd to the whole expression later and so applies to every
        # branch; seed the known values with it up front.
        if self.dataId.hasFull():
            known_names = self.dataId.graph.names
        else:
            known_names = self.dataId.graph.required.names
        for known in known_names:
            summary.dimension_values[known] = self.dataId[known]
        for branch in branches:
            # Record every dimension and column seen anywhere in the
            # expression, in any context.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or an
            # equivalent spelling, e.g. 'detector.id=4' for 'detector=4')
            # pin a value; all other branches are finished at this point.
            if branch.dataIdKey is None or branch.dataIdValue is None:
                continue
            key = branch.dataIdKey
            new_value = key.primaryKey.getPythonType()(branch.dataIdValue)
            # Remember the constraint, or retrieve the one recorded earlier
            # so that it can be compared against this one.
            value = summary.dimension_values.setdefault(key.name, new_value)
            if value == new_value:
                continue
            # The expression says something like "instrument='HSC' AND
            # instrument='DECam'", or the data ID has one value and the
            # expression has the other.
            if key in self.dataId:
                raise UserExpressionError(
                    f"Conflict between expression containing {key.name}={new_value!r} "
                    f"and data ID with {key.name}={value!r}."
                )
            raise UserExpressionError(
                f"Conflicting literal values for {key.name} in expression: "
                f"{value!r} != {branch.dataIdValue!r}."
            )
        # With the constrained governor values known, look for missing ones,
        # i.e. an expression containing something like "visit=X" that never
        # says which instrument the visit belongs to.  This rules out a lot
        # of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  Asking the
        # user to be explicit about the instruments is a reasonable
        # workaround.  Note that an expression like
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and the logic below
        # still works (that is why we convert to normal form at all).
        governors_needed: set[str] = {
            governor for dimension in summary.dimensions for governor in dimension.graph.governors.names
        }
        missing = governors_needed - summary.dimension_values.keys()
        if not missing:
            return summary
        if missing <= self.defaults.names:
            # Everything missing can be filled in from the defaults.
            summary.defaultsNeeded.update(missing)
        elif not self._allow_orphans:
            still_missing = missing - self.defaults.names
            raise UserExpressionError(
                f"No value(s) for governor dimensions {still_missing} in expression "
                "that references dependent dimensions. 'Governor' dimensions must always be specified "
                "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                "terms) or in a data ID passed to the query method."
            )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # First pass: gather every dimension and column referenced.
            # This aggregation is for the full query, so it does not matter
            # whether things are joined by AND or OR (or + or -, etc).  At
            # the same time narrow down the dimensions that have a value in
            # _all_ branches; those are the ones that can be bounded
            # overall, since a dimension skipped by even one branch could
            # be unbounded.
            bounded = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                bounded.intersection_update(branch.dimension_values)
            # Second pass: collect each bounded dimension's values.
            for name in bounded:
                summary.dimension_constraints[name] = {branch.dimension_values[name] for branch in branches}
        # If the expression references dimensions that weren't in the
        # original query graph, expand the graph to include them.  This is
        # what lets a user say "tract=X" on the command line (well,
        # "skymap=Y AND tract=X" - logic in visitInner checks for that) when
        # running a task like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        for name, values in summary.dimension_constraints.items():
            if name not in summary.defaultsNeeded:
                continue
            # One branch contained an explicit value for this dimension
            # while another needed to refer to the default data ID.  Even if
            # these refer to the same value, that inconsistency probably
            # indicates user error.
            raise UserExpressionError(
                f"Governor dimension {name} is explicitly "
                f"constrained to {values} in one or more branches of "
                "this query where expression, but is left to default "
                f"to {self.defaults[name]!r} in another branch. "
                "Defaults and explicit constraints cannot be mixed."
            )
        # If any default data ID values were needed, add them to
        # self.dataId and record them as constraints as well.
        if summary.defaultsNeeded:
            defaults_graph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaults_graph))
            summary.dimension_constraints.update(
                {name: {self.defaults[name]} for name in summary.defaultsNeeded}
            )

        return summary