Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 37%

225 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-10-26 15:15 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary"] # other classes here are local to subpackage 

24 

25from collections.abc import Iterable, Iterator, Mapping, Set 

26from dataclasses import dataclass 

27from typing import Any, cast 

28 

29from lsst.sphgeom import Region 

30from lsst.utils.classes import cached_getter, immutable 

31from sqlalchemy.sql import ColumnElement 

32 

33from ...core import ( 

34 DataCoordinate, 

35 DatasetType, 

36 Dimension, 

37 DimensionElement, 

38 DimensionGraph, 

39 DimensionUniverse, 

40 NamedKeyDict, 

41 NamedKeyMapping, 

42 NamedValueAbstractSet, 

43 NamedValueSet, 

44 SkyPixDimension, 

45 TimespanDatabaseRepresentation, 

46) 

47from .._exceptions import UserExpressionSyntaxError 

48 

49# We're not trying to add typing to the lex/yacc parser code, so MyPy 

50# doesn't know about some of these imports. 

51from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

52from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName 

53 

54 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None` (or empty), a where
        expression that always evaluates to `True` is implied.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    UserExpressionSyntaxError
        Raised if ``expression`` is non-empty and cannot be parsed.
    """

    def __init__(self, expression: str | None = None, bind: Mapping[str, Any] | None = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Any parser failure is reported as a syntax error, with the
                # original exception chained for debugging.
                raise UserExpressionSyntaxError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        self._bind = {} if bind is None else bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.  If `None` and
            ``dataId`` is an expanded data ID, ``dataId.region`` will be used
            to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        clause : `QueryWhereClause`
            Struct combining the parsed tree, the (possibly updated) data ID,
            the bind mapping, the region, and the dimensions, columns and
            governor constraints extracted from the expression.

        Raises
        ------
        RuntimeError
            Raised if ``check`` is `True` and either a bind key collides with
            a dimension element (or looks like one of its columns), or the
            consistency check of the expression fails.
        """
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        elif region is None and dataId.hasRecords():
            # Fall back to the data ID's own region (which may still be None).
            region = dataId.region
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # The set of element names does not depend on the bind key, so
            # look it up once instead of on every iteration.
            element_names = graph.universe.getStaticElements().names
            for identifier in self._bind:
                if identifier in element_names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, _, column = identifier.partition(".")
                if column and table in element_names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        governor_constraints: dict[str, Set[str]] = {}
        summary: InspectionSummary
        if self._tree is None:
            # No expression at all: nothing is referenced.
            from .expressions import InspectionSummary

            summary = InspectionSummary()
        elif not check:
            # Only gather what the expression references, without validation.
            from .expressions import InspectionVisitor

            summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            # Convert the expression to disjunctive normal form (ORs of
            # ANDs).  That's potentially super expensive in the general
            # case (where there's a ton of nesting of ANDs and ORs).  That
            # won't be the case for the expressions we expect, and we
            # actually use disjunctive normal instead of conjunctive (i.e.
            # ANDs of ORs) because I think the worst-case is a long list
            # of OR'd-together data IDs, which is already in or very close
            # to disjunctive normal form.
            expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
            from .expressions import CheckVisitor

            # Check the expression for consistency and completeness.
            visitor = CheckVisitor(dataId, graph, self._bind, defaults)
            try:
                summary = expr.visit(visitor)
            except RuntimeError as err:
                exprOriginal = str(self._tree)
                exprNormal = str(expr.toTree())
                # Only show the normalized form when it differs from what the
                # user actually wrote.
                if exprNormal == exprOriginal:
                    msg = f'Error in query expression "{exprOriginal}": {err}'
                else:
                    msg = (
                        f'Error in query expression "{exprOriginal}" '
                        f'(normalized to "{exprNormal}"): {err}'
                    )
                raise RuntimeError(msg) from None
            # Keep only the constraints that apply to governor dimensions;
            # the set of governor names is loop-invariant.
            governor_names = graph.universe.getGovernorDimensions().names
            for dimension_name, values in summary.dimension_constraints.items():
                if dimension_name in governor_names:
                    governor_constraints[dimension_name] = cast(Set[str], values)
            # The visitor may have updated the data ID (e.g. with defaults).
            dataId = visitor.dataId
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            governor_constraints=governor_constraints,
            region=region,
        )

183 

184 

@dataclass(frozen=True)
class QueryWhereClause:
    """Immutable bundle of everything that contributes to a query's WHERE
    clause.

    Instances are meant to be produced by `QueryWhereExpression.attach`,
    which is responsible for keeping the attributes mutually consistent;
    nothing is validated here.
    """

    tree: Node | None
    """Parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """Data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is expected to return `True`; this class does not
    enforce that itself.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, Set[str]]
    """Dimension element tables whose non-key columns were referenced
    anywhere in the string expression, mapped to the referenced column names
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Literal values to inject into the query expression, keyed by the
    identifiers they replace (`Mapping`).
    """

    region: Region | None
    """Spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        # An element counts as temporal when the timespan column is among the
        # columns the expression referenced for it.
        timespan_name = TimespanDatabaseRepresentation.NAME
        referenced = NamedValueSet(
            element for element, column_names in self.columns.items() if timespan_name in column_names
        )
        return referenced.freeze()

244 

245 

@dataclass(frozen=True)
class OrderByClauseColumn:
    """Description of a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element that provides the data for this column
    (`DimensionElement`).
    """

    column: str | None
    """Column name, or `None` to order by the element's primary key
    (`str` or `None`).
    """

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

258 

259 

@immutable
class OrderByClause:
    """Parsed description of the columns in a query's ORDER BY clause.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Names to order by, in priority order.  A leading "-" on a name
        requests descending order.
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty, or consists only of the "-" prefix.
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        parsed: list[OrderByClauseColumn] = []
        for spec in order_by:
            if not spec or spec == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # Strip the optional descending marker before resolving the name.
            descending = spec.startswith("-")
            bare_name = spec[1:] if descending else spec
            element, column = categorizeOrderByName(graph, bare_name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = parsed

        # Only entries that name an explicit column contribute an element
        # here; entries ordering by primary key (column is None) do not.
        self.elements = NamedValueSet(entry.element for entry in parsed if entry.column is not None)

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY, in the order given
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """

299 

300 

@immutable
class ElementOrderByClause:
    """Parsed description of the ORDER BY columns for a single dimension
    element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Names to order by, in priority order.  A leading "-" on a name
        requests descending order.
    element : `DimensionElement`
        The dimension element that every name in ``order_by`` is resolved
        against.

    Raises
    ------
    ValueError
        Raised if a name is empty, or consists only of the "-" prefix.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        parsed: list[OrderByClauseColumn] = []
        for spec in order_by:
            if not spec or spec == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # Strip the optional descending marker before resolving the name.
            descending = spec.startswith("-")
            bare_name = spec[1:] if descending else spec
            column = categorizeElementOrderByName(element, bare_name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = parsed

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY, in the order given
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

331 

332 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression, or an already-parsed one.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.  If `None` and
        ``dataId`` is an expanded data ID, ``dataId.region`` will be used to
        construct one.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  Must be
        `None` when ``expression`` is already a `QueryWhereExpression`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given together with an ``expression`` that is
        neither `None` nor a `str`.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: DataCoordinate | None = None,
        expression: str | QueryWhereExpression | None = None,
        whereRegion: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        if expression is None or isinstance(expression, str):
            # Raw (or absent) expressions are parsed here, with the bind
            # values attached.
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = OrderByClause(order_by, requested) if order_by is not None else None
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the element chosen for its spatial family among the
        #   dimensions in `self.mustHaveKeysJoined`;
        # - it isn't also given at query construction time (i.e. it is not
        #   part of the WHERE clause's data ID).
        joined = self.mustHaveKeysJoined
        given = self.where.dataId.graph
        chosen: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in joined.spatial:
            candidate = family.choose(joined.elements)
            assert isinstance(candidate, DimensionElement)
            if candidate not in given.elements:
                chosen.add(candidate)

        if not chosen:
            # Nothing spatial participates at all.
            return chosen.freeze()

        if len(chosen) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            chosen.add(self.universe.commonSkyPix)
            return chosen.freeze()

        # Exactly one element: there's no spatial join, but there might be a
        # WHERE filter based on a given region.
        if not given.spatial:
            # There is no spatial join or filter in this query.  Even if
            # this element might be associated with spatial information, we
            # don't need it for this query.
            return NamedValueSet().freeze()

        # We can only perform those filters against SkyPix dimensions, so if
        # what we have isn't one, add the common SkyPix dimension to the
        # query; the element we have will be joined to that.
        (only,) = chosen
        if not isinstance(only, SkyPixDimension):
            chosen.add(self.universe.commonSkyPix)
        return chosen.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        family_count = len(self.mustHaveKeysJoined.temporal)
        if family_count > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of: requested dimensions, dimensions referenced by the WHERE
        # expression, and the dimensions of every declared dataset type.
        names = set(self.requested.names)
        names.update(self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Start from everything that needs non-key data: spatial and temporal
        # participants plus tables whose columns the WHERE clause references.
        tables = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        ordering = self.order_by
        if ordering is not None:
            tables.update(ordering.elements)
        joined = self.mustHaveKeysJoined
        # Implied dimensions among the joined keys get their tables included.
        for dimension in joined:
            if dimension.implied:
                tables.add(dimension)
        # Elements flagged ``alwaysJoin`` are included whenever they appear in
        # the joined dimensions or in the given data ID's dimensions.
        for element in joined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                tables.add(element)
        return tables.freeze()

503 

504 

@dataclass
class DatasetQueryColumns:
    """Struct grouping the columns used to reconstruct `DatasetRef`
    instances from query results.

    Iterating over an instance yields the `id` and `runKey` columns, in that
    order; `ingestDate` is not part of the iteration.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`)."""

    id: ColumnElement
    """Column containing the unique integer ID for this dataset."""

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: ColumnElement | None
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield from (self.id, self.runKey)

532 

533 

@dataclass
class QueryColumns:
    """Struct organizing the columns of a query that is under construction
    or currently executing.

    The constructor takes no arguments: every container starts out empty
    (and `datasets` starts as `None`), and callers are expected to populate
    the attributes incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, list[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and
    `QuerySummary.where.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `sqlalchemy.sql.ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: DatasetQueryColumns | None
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        # Empty means: no key, timespan or region columns, and no dataset
        # columns attached.
        return not self.keys and not self.timespans and not self.regions and self.datasets is None

    def getKeyColumn(self, dimension: Dimension | str) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]