Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 29%

173 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-19 03:44 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ( 

30 "CheckVisitor", 

31 "InspectionVisitor", 

32 "InspectionSummary", 

33) 

34 

35import dataclasses 

36from collections.abc import Mapping, Sequence, Set 

37from typing import TYPE_CHECKING, Any 

38 

39from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag 

40from ....dimensions import DataCoordinate, DataIdValue, Dimension, DimensionGroup, DimensionUniverse 

41from ..._exceptions import UserExpressionError 

42from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId 

43from .normalForm import NormalForm, NormalFormVisitor 

44from .parser import Node, TreeVisitor 

45 

46if TYPE_CHECKING: 

47 import astropy.time 

48 from lsst.daf.relation import ColumnTag 

49 

50 

@dataclasses.dataclass
class InspectionSummary:
    """Common base for the summary objects that `CheckVisitor` and
    `InspectionVisitor` use to accumulate facts about a parsed expression.
    """

    dimensions: set[str] = dataclasses.field(default_factory=set)
    """Names of dimensions whose primary keys or dependencies were referenced
    anywhere in this branch (`set` [ `str` ]).
    """

    columns: dict[str, set[str]] = dataclasses.field(default_factory=dict)
    """Names of dimension element tables whose columns were referenced
    anywhere in this branch, mapped to the referenced column names
    (`dict` [ `str`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the contents of ``other`` into ``self`` in place.

        Dimensions and per-element column names are unioned, and the
        ingest-date flag is OR'd.

        Parameters
        ----------
        other : `InspectionSummary`
            The other summary object; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for element_name, column_names in other.columns.items():
            existing = self.columns.get(element_name)
            if existing is None:
                # Copy so that the two summaries never share a set.
                self.columns[element_name] = set(column_names)
            else:
                existing.update(column_names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Transform the columns captured here into a set of `ColumnTag`
        objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Name of the dataset type to assume for unqualified dataset
            columns, or `None` to reject any such identifiers.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags.

        Raises
        ------
        UserExpressionError
            Raised if the expression used the ingest date but
            ``dataset_type_name`` is `None`.
        """
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            if dataset_type_name is None:
                raise UserExpressionError(
                    "Expression requires an ingest date, which requires exactly one dataset type."
                )
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions))
        for element_name, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(element_name, column_names))
        return tags

111 

112 

@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object used by `InspectionVisitor` to gather information about
    a parsed expression.

    Notes
    -----
    On top of the base-class information, a TreeSummary tracks whether a
    branch looks like a dimension equivalence expression (e.g. "tract=4") in
    a simple context (surrounded only by ANDs and ORs). When
    `InspectionVisitor` is used on its own (i.e. when ``check=False`` in the
    query code) these attributes are unused, but they are cheap to maintain.
    `CheckVisitor` relies on them when it delegates to `InspectionVisitor` to
    find out which governor dimension values are set in a branch of the
    normal-form expression.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        return self.dataIdValue is not None and self.dataIdKey is None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Merge ``other`` into ``self``, making ``self`` a summary of both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being
            combined via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # '<key> = <value>': adopt the value from the right-hand side.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # '<value> = <key>': adopt the key from the right-hand side.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

179 

180 

class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        if name in self.bind:
            value = self.bind[name]
            if isinstance(value, list | tuple | Set):
                # This can happen on rhs of IN operator, if there is only one
                # element in the list then take it.
                if len(value) == 1:
                    return TreeSummary(dataIdValue=next(iter(value)))
                else:
                    return TreeSummary()
            else:
                return TreeSummary(dataIdValue=value)
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # A bare identifier with no column is a candidate data ID key.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=set(element.minimal_group.names),
                dataIdKey=element,
            )
        else:
            return TreeSummary(dimensions=set(element.minimal_group.names), columns={element.name: {column}})

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        if operator != "+":
            # Only the identity operator leaves a key or literal meaning what
            # it meant before.  A negated comparison ("NOT instrument='X'")
            # is not the constraint instrument='X', and a negated literal
            # ("-5") is not the literal "5", so the operand must stop looking
            # like (part of) a simple '<dimension>=<value>' term.  The
            # referenced dimensions and columns are still needed by the query
            # and are kept.
            operand.dataIdKey = None
            operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq means an IN term never yields a key-value
        # pair; only the referenced dimensions and columns are accumulated.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()

272 

273 

@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of AND'd
    together expression branches: the dimensions and tables the group
    references, gathered so they can be checked for consistency and
    completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of all dimensions that are equated with literal values in this
    expression branch, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions whose values the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

293 

294 

@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the entire expression: the
    referenced dimensions and tables, plus the governor dimension values
    found across all branches.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    A dimension that is absent from this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions whose values the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """

316 

317 

class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some
    checks for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    dimensions : `DimensionGroup`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing default values for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument).  Should be left to default to `False`
        in essentially all new code.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        dimensions: DimensionGroup,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.dimensions = dimensions
        self.defaults = defaults
        self._allow_orphans = allow_orphans
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    @property
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        return self.dimensions.universe

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch seen here constrains the others and they must all
        # agree.  The outer branches are OR'd, so if a governor value (the
        # instrument or skymap needed to interpret a visit or tract number)
        # is absent from this group it really is absent: no other inner
        # group can constrain it.
        assert form is NormalForm.DISJUNCTIVE
        # The one exception is the data ID given at construction.  It is
        # AND'd to the whole expression later and so applies to every
        # branch; seed the summary with its values up front.
        inner = InnerSummary()
        inner.dimension_values.update(self.dataId.mapping)
        for term in branches:
            # Record every dimension and column referenced, in any context.
            inner.update(term)
            key = term.dataIdKey
            if key is None or term.dataIdValue is None:
                continue
            # This term looks like '<dimension>=<value>' (or an equivalent
            # spelling such as 'detector.id=4' that identifier
            # categorization maps to the same dimension).  Remember the
            # value so it can be checked against the other terms it's AND'd
            # with.
            coerced = key.primaryKey.getPythonType()(term.dataIdValue)
            known = inner.dimension_values.setdefault(key.name, coerced)
            if known != coerced:
                # Either the expression contradicts itself (e.g.
                # "instrument='HSC' AND instrument='DECam'"), or it
                # contradicts the data ID.
                if key.name in self.dataId:
                    raise UserExpressionError(
                        f"Conflict between expression containing {key.name}={coerced!r} "
                        f"and data ID with {key.name}={known!r}."
                    )
                raise UserExpressionError(
                    f"Conflicting literal values for {key.name} in expression: "
                    f"{known!r} != {term.dataIdValue!r}."
                )
        # With the constrained governor values known, look for missing ones,
        # e.g. "visit=X" with no indication of the instrument.  This catches
        # many accidents, at the price of rejecting possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7"; asking the user
        # to name the instruments explicitly is a reasonable workaround.
        # Note that
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and the logic here
        # still works (which is why we convert to normal form at all).
        required_governors: set[str] = {
            governor
            for name in inner.dimensions
            for governor in self.universe.dimensions[name].minimal_group.governors
        }
        unconstrained = required_governors - inner.dimension_values.keys()
        if unconstrained:
            if unconstrained <= self.defaults.dimensions.required:
                inner.defaultsNeeded.update(unconstrained)
            elif not self._allow_orphans:
                still_missing = unconstrained - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return inner

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        outer = OuterSummary()
        if branches:
            # First pass: gather everything referenced anywhere.  This is
            # for the full query, so how the pieces are joined (AND, OR, +,
            # -, ...) is irrelevant.  At the same time, narrow down the
            # dimensions that have a value (explicit or from the data ID) in
            # _all_ branches; those are the only ones we can bound overall,
            # since a dimension missing from even one branch may be
            # unbounded.
            bounded = set(self.universe.dimensions.names)
            for inner in branches:
                outer.update(inner)
                outer.defaultsNeeded.update(inner.defaultsNeeded)
                bounded.intersection_update(inner.dimension_values)
            # Second pass: collect the per-branch values of each bounded
            # dimension.
            outer.dimension_constraints.update(
                {name: {inner.dimension_values[name] for inner in branches} for name in bounded}
            )
        # If the expression references dimensions that were not in the
        # original query, expand the query dimensions to include them.  This
        # is what lets a user say "tract=X" on the command line (well,
        # "skymap=Y AND tract=X" - visitInner checks for that) when running
        # a task like ISR that has nothing to do with skymaps.
        if not outer.dimensions.issubset(self.dimensions.names):
            self.dimensions = self.universe.conform(outer.dimensions | self.dimensions.names)
        for dimension, values in outer.dimension_constraints.items():
            if dimension in outer.defaultsNeeded:
                # One branch gave an explicit value for this dimension while
                # another fell back to the default data ID.  Even if both
                # resolve to the same value, the inconsistency probably
                # indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # Pull any needed defaults into self.dataId, and reflect them in the
        # reported constraints.
        if outer.defaultsNeeded:
            defaults_group = self.universe.conform(outer.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaults_group))
            outer.dimension_constraints.update(
                {dimension: {self.defaults[dimension]} for dimension in outer.defaultsNeeded}
            )

        return outer