Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%

171 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ( 

30 "CheckVisitor", 

31 "InspectionVisitor", 

32 "InspectionSummary", 

33) 

34 

35import dataclasses 

36from collections.abc import Mapping, Sequence, Set 

37from typing import TYPE_CHECKING, Any 

38 

39from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag 

40from ...._named import NamedKeyDict, NamedValueSet 

41from ....dimensions import ( 

42 DataCoordinate, 

43 DataIdValue, 

44 Dimension, 

45 DimensionElement, 

46 DimensionGraph, 

47 DimensionUniverse, 

48) 

49from ..._exceptions import UserExpressionError 

50from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

51from .normalForm import NormalForm, NormalFormVisitor 

52from .parser import Node, TreeVisitor 

53 

54if TYPE_CHECKING: 

55 import astropy.time 

56 from lsst.daf.relation import ColumnTag 

57 

58 

@dataclasses.dataclass
class InspectionSummary:
    """Shared accumulator for what a parsed expression refers to.

    Both `CheckVisitor` and `InspectionVisitor` build on this class to record
    the dimensions, dimension-element columns, and ingest-date usage found
    while walking an expression tree.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies are referenced somewhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Columns referenced somewhere in this branch, grouped by the dimension
    element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions and columns of ``other`` into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for table, column_names in other.columns.items():
            self.columns.setdefault(table, set()).update(column_names)
        # Once the flag is set it stays set; otherwise adopt the other's flag.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Convert everything recorded here into `ColumnTag` objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Dataset type to attach unqualified dataset columns to, or `None`
            if such identifiers should be rejected.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags.

        Raises
        ------
        UserExpressionError
            Raised if the expression uses the ingest date but
            ``dataset_type_name`` is `None`.
        """
        if self.hasIngestDate and dataset_type_name is None:
            raise UserExpressionError(
                "Expression requires an ingest date, which requires exactly one dataset type."
            )
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        for table, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(table.name, column_names))
        return tags

119 

120 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Per-branch result produced by `InspectionVisitor`.

    Notes
    -----
    On top of the base summary, this class tracks whether a branch is a
    simple dimension equivalence (e.g. "tract=4") appearing in a simple
    context (surrounded only by ANDs and ORs). When `InspectionVisitor` runs
    on its own (i.e. when ``check=False`` in the query code) the extra
    attributes are unused but cheap to maintain. `CheckVisitor` relies on
    them when it delegates to `InspectionVisitor` to learn which governor
    dimension values a branch of the normal-form expression pins down.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (when `dataIdValue` is not `None`) or might be
    (when `dataIdValue` is `None`) fully identified by a literal value in
    this branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (when `dataIdKey` is not `None`) or
    might constrain (when `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        return not (self.dataIdKey is None or self.dataIdValue is not None)

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value usable as the
        value half of a data ID key-value pair.
        """
        return not (self.dataIdKey is not None or self.dataIdValue is None)

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), the two branches are the operands
            of an equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": keep our key, adopt the other side's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": keep our value, adopt the other side's key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value constraint.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

187 

188 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before any SQLAlchemy WHERE clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.bind = bind
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, list | tuple | Set):
                return TreeSummary(dataIdValue=bound)
            # A collection can show up on the rhs of an IN operator; a
            # single-element collection is treated like its lone value.
            if len(bound) == 1:
                return TreeSummary(dataIdValue=next(iter(bound)))
            return TreeSummary()

        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."

        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )
        # No column: the identifier names the dimension's key itself.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()

282 

283 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of AND'd
    together expression branches: the dimensions and tables they reference,
    plus the state needed to check the group for consistency and
    completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of every dimension that this expression branch equates with a
    literal, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs a value for, that
    the query itself does not provide, and that the default data ID does.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

303 

304 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole: the
    dimensions and tables it references and the governor dimension values it
    constrains.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values appearing in this expression for dimensions relevant to the
    query, keyed by dimension name.

    A dimension with no entry in this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs a value for, that
    the query itself does not provide, and that the default data ID does.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

326 

327 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """A `NormalFormVisitor` that works out which dimensions and tables a
    query must include, checking the expression for completeness and
    consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument). Should be left to default to `False` in
        essentially all new code.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        self._allow_orphans = allow_orphans
        # Individual branches are inspected by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...so every branch visited below constrains all of the others, and
        # they must agree with each other. Because the outer branches are
        # OR'd together, anything missing from this group (such as the
        # instrument or skymap governor value needed to interpret a visit or
        # tract number) really is missing: no other inner group can
        # constrain it.
        #
        # The one exception is the data ID given at construction. It is
        # AND'd to the whole expression later and so applies to every
        # branch, which is why its values are seeded into the summary first.
        summary = InnerSummary()
        if self.dataId.hasFull():
            known_names = self.dataId.graph.names
        else:
            known_names = self.dataId.graph.required.names
        summary.dimension_values.update((name, self.dataId[name]) for name in known_names)

        for branch in branches:
            # Record every dimension and column seen anywhere in the
            # expression, in any context.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or an
            # equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4') pin a
            # value that must agree with the other branches it's AND'd with.
            key = branch.dataIdKey
            if key is None or branch.dataIdValue is None:
                continue
            new_value = key.primaryKey.getPythonType()(branch.dataIdValue)
            value = summary.dimension_values.setdefault(key.name, new_value)
            if value != new_value:
                # Expression says something like "instrument='HSC' AND
                # instrument='DECam'", or data ID has one and expression
                # has the other.
                if key in self.dataId:
                    message = (
                        f"Conflict between expression containing {key.name}={new_value!r} "
                        f"and data ID with {key.name}={value!r}."
                    )
                else:
                    message = (
                        f"Conflicting literal values for {key.name} in expression: "
                        f"{value!r} != {branch.dataIdValue!r}."
                    )
                raise UserExpressionError(message)

        # With the constrained governor values known, look for missing ones,
        # i.e. an expression containing something like "visit=X" that never
        # says which instrument the visit belongs to. This catches a lot of
        # accidents, at the price of rejecting possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7". Asking users to
        # name the instruments they care about is a reasonable workaround.
        # Note that an expression like
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #         OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = {
            governor for dimension in summary.dimensions for governor in dimension.graph.governors.names
        }
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # First pass: collect every dimension and column referenced.
            # This aggregate is for the full query, so it does not matter
            # whether things were joined by AND or OR (or + or -, etc).
            # At the same time, narrow down the dimensions that have a value
            # in _all_ branches. Those are the only ones that can be bounded
            # overall; a dimension missing from even one branch could be
            # unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Second pass: collect the values seen for each such dimension.
            for dim in dimensions_in_all_branches:
                summary.dimension_constraints[dim] = {branch.dimension_values[dim] for branch in branches}
        # If the expression references dimensions outside the original query
        # graph, grow the graph to include them. This is what lets a user
        # say "tract=X" on the command line (well, "skymap=Y AND tract=X" -
        # logic in visitInner checks for that) when running a task like ISR
        # that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # NOTE(review): visitInner only adds a name to defaultsNeeded when
        # that name is absent from the same branch's dimension_values, so
        # such a name should never survive the intersection above. If all
        # branches come from visitInner this check looks unreachable -
        # confirm the intended behavior before relying on it.
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, fold them into
        # self.dataId and then record them as the governor restriction.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for name in summary.defaultsNeeded:
                summary.dimension_constraints[name] = {self.defaults[name]}

        return summary