Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%

169 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ( 

30 "CheckVisitor", 

31 "InspectionVisitor", 

32 "InspectionSummary", 

33) 

34 

35import dataclasses 

36from collections.abc import Mapping, Sequence, Set 

37from typing import TYPE_CHECKING, Any 

38 

39from ....core import ( 

40 DataCoordinate, 

41 DataIdValue, 

42 DatasetColumnTag, 

43 Dimension, 

44 DimensionElement, 

45 DimensionGraph, 

46 DimensionKeyColumnTag, 

47 DimensionRecordColumnTag, 

48 DimensionUniverse, 

49 NamedKeyDict, 

50 NamedValueSet, 

51) 

52from ..._exceptions import UserExpressionError 

53from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

54from .normalForm import NormalForm, NormalFormVisitor 

55from .parser import Node, TreeVisitor 

56 

57if TYPE_CHECKING: 

58 import astropy.time 

59 from lsst.daf.relation import ColumnTag 

60 

61 

@dataclasses.dataclass
class InspectionSummary:
    """Shared accumulator for what a parsed expression refers to.

    `CheckVisitor` and `InspectionVisitor` use subclasses of this type to
    record the dimensions and dimension-element columns an expression
    mentions, and whether it mentions the dataset ingest date.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the contents of ``other`` into ``self`` in place.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose dimensions, columns, and ingest-date flag are
            added to this one.
        """
        self.dimensions.update(other.dimensions)
        for element, column_names in other.columns.items():
            known_columns = self.columns.setdefault(element, set())
            known_columns.update(column_names)
        # Equivalent to OR-ing the two flags: once set, the flag stays set.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Build the set of `ColumnTag` objects for everything recorded here.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Dataset type to attach the ingest date column to, or `None` if
            no single dataset type is available.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Tags for the ingest date (if referenced), the dimension keys,
            and the dimension record columns.

        Raises
        ------
        UserExpressionError
            Raised if the expression references the ingest date but
            ``dataset_type_name`` is `None`.
        """
        # Reject the unusable combination before building anything.
        if self.hasIngestDate and dataset_type_name is None:
            raise UserExpressionError(
                "Expression requires an ingest date, which requires exactly one dataset type."
            )
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        for element, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(element.name, column_names))
        return tags

122 

123 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one expression subtree.

    Notes
    -----
    On top of the inherited bookkeeping, this class tracks whether a subtree
    is a bare dimension identifier, a bare literal, or a simple equality of
    the two (e.g. "tract=4"). `InspectionVisitor` on its own (i.e. when
    ``check=False`` in the query code) does not use these attributes, but
    maintaining them is cheap. `CheckVisitor` reads them, after delegating to
    `InspectionVisitor`, to learn which governor dimension values are set in
    a branch of the normal-form expression.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        return self.dataIdValue is not None and self.dataIdKey is None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both subtrees.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other operand.
        isEq : `bool`, optional
            `True` when the two operands are being joined by the equality
            operator; defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": keep our key and adopt the value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": keep our value and adopt the key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equality.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

190 

191 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Notes
    -----
    Each ``visit*`` method returns a `TreeSummary` for the subtree it was
    called on. Besides the referenced dimensions and columns, the summary
    tracks whether the subtree is a bare data ID key, a bare literal, or a
    simple ``key = value`` equality; any construct that is none of those
    returns a summary with that tracking cleared.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals are never recorded as data ID values.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bind parameters take precedence over every other interpretation of
        # the identifier.
        if name in self.bind:
            value = self.bind[name]
            if isinstance(value, list | tuple | Set):
                # This can happen on rhs of IN operator, if there is only one
                # element in the list then take it.
                if len(value) == 1:
                    return TreeSummary(dataIdValue=next(iter(value)))
                else:
                    return TreeSummary()
            else:
                return TreeSummary(dataIdValue=value)
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # No column: the identifier names the dimension itself, so it may
            # be the key half of a "key = value" data ID term.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The dimensions and columns referenced by the operand are still
        # referenced by the whole expression, so the same summary object is
        # returned.  However, once a non-identity operator has been applied
        # the operand is no longer a bare key, a bare literal, or a simple
        # "key = value" equality: "NOT instrument = 'X'" must not be reported
        # as the constraint instrument='X', and "-5" is not the literal "5".
        # Unary plus leaves the value unchanged, so it is passed through.
        if operator != "+":
            operand.dataIdKey = None
            operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Each merge here is a non-equality merge, so it resets the data ID
        # key/value tracking on ``lhs``.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of ``ra`` and ``dec`` are not propagated.
        return TreeSummary()

285 

286 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together.

    It collects the dimensions and tables the group references so they can
    be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of the dimensions that this expression branch equates with a
    literal, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

306 

307 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the expression as a whole.

    It collects the referenced dimensions and tables together with the
    governor dimension values found across the entire expression.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values this expression provides for dimensions relevant to the
    query, keyed by dimension name.

    A dimension with no entry in this dict is not constrained by the
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """

329 

330 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument).  Should be left to default to `False` in
        essentially all new code.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    graph is expanded to cover any extra dimensions the expression references,
    and the data ID is extended with any default governor values that were
    needed.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self._allow_orphans = allow_orphans

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; the identifier categorization done by the
            # branch visitor is smart enough to see that e.g. 'detector.id=4'
            # is equivalent to 'detector=4').  If so, remember that we've
            # constrained it on this branch to later make sure it's
            # consistent with any other constraints on any other branches
            # it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                # Literals arrive as raw values; coerce to the Python type of
                # the dimension's primary key before comparing.
                new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if branch.dataIdKey in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                            f"and data ID with {branch.dataIdKey.name}={value!r}."
                        )
                    else:
                        # Report the coerced value so both sides of the
                        # comparison are shown in the same type.
                        raise UserExpressionError(
                            f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                            f"{value!r} != {new_value!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        if not governorsNeededInBranch.issubset(summary.dimension_values.keys()):
            missing = governorsNeededInBranch - summary.dimension_values.keys()
            if missing <= self.defaults.names:
                # Every missing governor has a default; remember to apply
                # them.
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds.
            summary.dimension_constraints.update(
                {dimension: set() for dimension in dimensions_in_all_branches}
            )
            for dim in dimensions_in_all_branches:
                for branch in branches:
                    summary.dimension_constraints[dim].add(branch.dimension_values[dim])
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary