Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 37%

225 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-10-21 02:03 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary"] # other classes here are local to subpackage 

24 

25from collections.abc import Iterable, Iterator, Mapping, Set 

26from dataclasses import dataclass 

27from typing import Any, cast 

28 

29from lsst.sphgeom import Region 

30from lsst.utils.classes import cached_getter, immutable 

31from sqlalchemy.sql import ColumnElement 

32 

33from ...core import ( 

34 DataCoordinate, 

35 DatasetType, 

36 Dimension, 

37 DimensionElement, 

38 DimensionGraph, 

39 DimensionUniverse, 

40 NamedKeyDict, 

41 NamedKeyMapping, 

42 NamedValueAbstractSet, 

43 NamedValueSet, 

44 SkyPixDimension, 

45 TimespanDatabaseRepresentation, 

46) 

47from .._exceptions import UserExpressionSyntaxError 

48 

49# We're not trying to add typing to the lex/yacc parser code, so MyPy 

50# doesn't know about some of these imports. 

51from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

52from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName 

53 

54 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None` (or an empty string), a
        where expression that always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    UserExpressionSyntaxError
        Raised if ``expression`` is given but parsing it fails for any
        reason; the original exception is chained as the cause.
    """

    def __init__(self, expression: str | None = None, bind: Mapping[str, Any] | None = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise UserExpressionSyntaxError(f"Failed to parse user expression `{expression}'.") from exc
            # Internal sanity check only; a non-empty expression is expected
            # to always yield a tree.
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.  If `None` and
            ``dataId`` is an expanded data ID, ``dataId.region`` will be used
            to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        clause : `QueryWhereClause`
            Structure combining the parsed tree, the (possibly updated) data
            ID, the dimensions and columns referenced by the expression, the
            bind mapping, any governor-dimension constraints found while
            checking, and the region.

        Raises
        ------
        RuntimeError
            Raised when ``check`` is `True` and either a bind key collides
            with a dimension element name (or looks like one of its columns),
            or the consistency check of the expression fails.
        """
        if dataId is not None and dataId.hasRecords():
            if region is None and dataId.region is not None:
                region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # The set of element names does not depend on the bind key, so
            # look it up once instead of on every iteration.
            element_names = graph.universe.getStaticElements().names
            for identifier in self._bind:
                if identifier in element_names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, _, column = identifier.partition(".")
                if column and table in element_names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        governor_constraints: dict[str, Set[str]] = {}
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Re-raise with the user's expression in the message; the
                    # normalized form is shown only when it differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                # Only constraints on governor dimensions are retained.
                governor_names = graph.universe.getGovernorDimensions().names
                for dimension_name, values in summary.dimension_constraints.items():
                    if dimension_name in governor_names:
                        governor_constraints[dimension_name] = cast(Set[str], values)
                # Pick up the data ID held by the visitor after the check.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            # No expression at all: use a default-constructed summary.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            governor_constraints=governor_constraints,
            region=region,
        )

183 

184 

@dataclass(frozen=True)
class QueryWhereClause:
    """Bundle of everything that contributes to a query's WHERE clause.

    `QueryWhereExpression.attach` is the intended (and only supported) way
    to build one of these; it is responsible for making the attributes
    consistent with each other.
    """

    tree: Node | None
    """Parsed string expression tree, or `None` when no string expression
    was given.
    """

    dataId: DataCoordinate
    """Data ID for the dimensions known before the query is constructed
    (`DataCoordinate`).

    Expected to be fully expanded, i.e. ``dataId.hasRecords()`` returns
    `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies are referenced somewhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, Set[str]]
    """Dimension element tables whose non-key columns are referenced
    somewhere in the string expression, mapped to those column names
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Literal values to inject into the query expression, keyed by the
    identifiers they replace (`Mapping`).
    """

    region: Region | None
    """Spatial region that every result row must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions may take in this query
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    A governor dimension that is absent from this mapping is not constrained
    at all.
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        # An element counts as temporal when the timespan column name is
        # among the columns the expression referenced for it.
        with_timespan: NamedValueSet[DimensionElement] = NamedValueSet()
        for element, column_names in self.columns.items():
            if TimespanDatabaseRepresentation.NAME in column_names:
                with_timespan.add(element)
        return with_timespan.freeze()

244 

245 

@dataclass(frozen=True)
class OrderByClauseColumn:
    """Description of one column of an ORDER BY clause."""

    element: DimensionElement
    """Dimension element that the column's data belongs to
    (`DimensionElement`).
    """

    column: str | None
    """Column name, or `None` to mean the element's primary key
    (`str` or `None`).
    """

    ordering: bool
    """Sort direction: `True` means ascending, `False` means descending
    (`bool`).
    """

258 

259 

@immutable
class OrderByClause:
    """Parsed description of the columns of a query's ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Names to order by, in order of precedence.  A leading ``"-"`` on a
        name requests descending order for that name.
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty or consists only of ``"-"``.
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        # Each attribute is assigned exactly once; the list is then filled
        # through a local alias rather than through ``self``.
        self.order_by_columns = parsed = []
        for term in order_by:
            if term == "-" or not term:
                raise ValueError("Empty dimension name in ORDER BY")
            descending = term.startswith("-")
            # Strip only the single direction marker, if present.
            bare_name = term[1:] if descending else term
            element, column = categorizeOrderByName(graph, bare_name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))

        # Only entries that name an explicit column (i.e. not the primary
        # key) contribute their element here.
        self.elements = NamedValueSet(entry.element for entry in parsed if entry.column is not None)

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns of the ORDER BY clause, in the order they were given
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements for which ``order_by`` referenced a non-key column
    (`NamedValueSet` [ `DimensionElement` ]).
    """

300 

301 

@immutable
class ElementOrderByClause:
    """Parsed description of the columns of an ORDER BY clause that applies
    to a single dimension element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Names to order by, in order of precedence.  A leading ``"-"`` on a
        name requests descending order for that name.
    element : `DimensionElement`
        The dimension element that every name in ``order_by`` is resolved
        against.

    Raises
    ------
    ValueError
        Raised if a name is empty or consists only of ``"-"``.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        # The attribute is assigned exactly once; the list is then filled
        # through a local alias rather than through ``self``.
        self.order_by_columns = parsed = []
        for term in order_by:
            if term == "-" or not term:
                raise ValueError("Empty dimension name in ORDER BY")
            descending = term.startswith("-")
            # Strip only the single direction marker, if present.
            bare_name = term[1:] if descending else term
            column = categorizeElementOrderByName(element, bare_name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns of the ORDER BY clause, in the order they were given
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

333 

334 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression, or one that has already
        been parsed into a `QueryWhereExpression`.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.  If `None` and
        ``dataId`` is an expanded data ID, ``dataId.region`` will be used to
        construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  Must be
        `None` when ``expression`` is already a `QueryWhereExpression`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given together with an ``expression`` that is
        already a `QueryWhereExpression`.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: DataCoordinate | None = None,
        expression: str | QueryWhereExpression | None = None,
        whereRegion: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        if expression is None or isinstance(expression, str):
            # Not parsed yet (or absent): wrap it together with the bind
            # values so it can be attached below.
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Parsed ORDER BY information, or `None` if no ``order_by`` names were
    given at construction (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """The ``limit`` argument given at construction, stored unchanged
    (`tuple` or `None`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the element chosen for its spatial family from the
        #   dimensions whose keys must be joined (`mustHaveKeysJoined`);
        # - it isn't also given at query construction time (i.e. it is not
        #   already part of the WHERE clause's data ID).
        keys_joined = self.mustHaveKeysJoined
        given = self.where.dataId.graph
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in keys_joined.spatial:
            element = family.choose(keys_joined.elements)
            assert isinstance(element, DimensionElement)
            if element not in given.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if given.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).

        Raises `NotImplementedError` when more than one temporal family is
        involved, since that would require a temporal join.
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions, those referenced by the WHERE
        # expression, and those of every declared dataset type.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        keys_joined = self.mustHaveKeysJoined
        for dimension in keys_joined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged ``alwaysJoin`` are included whether they come from
        # the joined keys or from the data ID given up front.
        for element in keys_joined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()

505 

506 

@dataclass
class DatasetQueryColumns:
    """Struct grouping the query columns needed to rebuild `DatasetRef`
    instances from query results.

    Iterating over an instance yields the ``id`` and ``runKey`` columns, in
    that order; ``ingestDate`` is not part of the iteration.
    """

    datasetType: DatasetType
    """Dataset type that is being queried (`DatasetType`)."""

    id: ColumnElement
    """Column holding the unique integer ID of the dataset."""

    runKey: ColumnElement
    """Foreign key column pointing at the `~CollectionType.RUN` collection
    that holds the dataset.
    """

    ingestDate: ColumnElement | None
    """Column holding the ingest timestamp.  It does not belong to
    `DatasetRef`, but it is read from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield from (self.id, self.runKey)

534 

535 

@dataclass
class QueryColumns:
    """Struct that organizes the columns of a query that is being built or
    is currently executing.

    The constructor takes no arguments: every container attribute starts out
    empty and is expected to be filled in incrementally by the caller.
    """

    def __init__(self) -> None:
        # All three column containers start empty; no dataset columns yet.
        self.keys, self.timespans, self.regions = NamedKeyDict(), NamedKeyDict(), NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, list[ColumnElement]]
    """Columns holding the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each list gathers columns from several tables that all refer to the same
    dimension; the query should constrain those columns to have equal values.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and
    `QuerySummary.where.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Timespan columns for the elements that take part in a temporal join
    or filter in the query (`NamedKeyDict` mapping `DimensionElement` to
    `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Region columns for the elements that take part in a spatial join or
    filter in the query (`NamedKeyDict` mapping `DimensionElement` to
    `sqlalchemy.sql.ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: DatasetQueryColumns | None
    """Columns from which `DatasetRef` instances can be constructed out of
    query results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        return not self.keys and not self.timespans and not self.regions and self.datasets is None

    def getKeyColumn(self, dimension: Dimension | str) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for a dimension.

        Which column is picked is an implementation detail, but the choice
        is guaranteed to be deterministic and consistent across multiple
        calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Taking the last entry is purely for the benefit of humans reading
        # the query (e.g. developers debugging it): it makes it more likely
        # that the key comes from the dimension's own table or, failing
        # that, from a closely related dimension, which is less surprising
        # than e.g. a dataset subquery.  For the database the choice is
        # arbitrary, because the query guarantees all of these columns have
        # equal values.
        candidates = self.keys[dimension]
        return candidates[-1]