Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

193 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage 

24 

25from dataclasses import dataclass 

26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union 

27 

28from lsst.sphgeom import Region 

29from lsst.utils.classes import cached_getter, immutable 

30from sqlalchemy.sql import ColumnElement 

31 

32from ...core import ( 

33 DataCoordinate, 

34 DatasetType, 

35 Dimension, 

36 DimensionElement, 

37 DimensionGraph, 

38 DimensionUniverse, 

39 NamedKeyDict, 

40 NamedKeyMapping, 

41 NamedValueAbstractSet, 

42 NamedValueSet, 

43 SkyPixDimension, 

44 SpatialRegionDatabaseRepresentation, 

45 TimespanDatabaseRepresentation, 

46) 

47from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager 

48from ..summaries import GovernorDimensionRestriction 

49 

50# We're not trying to add typing to the lex/yacc parser code, so MyPy 

51# doesn't know about some of these imports. 

52from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

53from .expressions.categorize import categorizeOrderByName 

54 

55 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            # Parse eagerly so syntax errors surface at construction time
            # rather than later, when the query is actually built.
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No string expression: equivalent to an always-true WHERE.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct combining this expression's parse tree with the data ID,
            region, bind parameters, and the dimensions/columns/governor
            restrictions found by inspecting the expression.
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that could shadow dimension-element names or
            # dimension-column references in the expression grammar.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                # Import deferred to avoid a circular import within the
                # subpackage.
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report the expression as the user wrote it; show the
                    # normalized form as well only when it differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # CheckVisitor may have injected governor defaults into the
                # data ID; use its updated copy.
                dataId = visitor.dataId
            else:
                # Import deferred to avoid a circular import within the
                # subpackage.
                from .expressions import InspectionVisitor

                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            # Import deferred to avoid a circular import within the
            # subpackage.
            from .expressions import InspectionSummary

            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )

181 

182 

@dataclass(frozen=True)
class QueryWhereClause:
    """Struct holding the pieces that together form a query's WHERE clause.

    Instances should be created only by `QueryWhereExpression.attach`, which
    is responsible for keeping these attributes mutually consistent.
    """

    tree: Optional[Node]
    """Parse tree for the string expression, or `None` when no string
    expression was provided.
    """

    dataId: DataCoordinate
    """Data ID whose dimensions were known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` always returns `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies appear somewhere in the
    string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Mapping from dimension element to the names of its non-key columns
    used anywhere in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Literal values to inject into the query expression, keyed by the
    identifiers they stand in for (`Mapping`).
    """

    region: Optional[Region]
    """Spatial region every result row must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Constraints on the values governor dimensions may take in this query,
    imposed by the string expression or the data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        referenced = (
            element
            for element, column_names in self.columns.items()
            if TimespanDatabaseRepresentation.NAME in column_names
        )
        return NamedValueSet(referenced).freeze()

240 

241 

@dataclass(frozen=True)
class OrderByClauseColumn:
    """Description of a single column appearing in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element providing the data for this column
    (`DimensionElement`).
    """

    column: Optional[str]
    """Column name, or `None` to order by the element's primary key
    (`str` or `None`).
    """

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

254 

255 

@immutable
class OrderByClause:
    """Class for information about columns in ORDER BY clause

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):

        columns: List[OrderByClauseColumn] = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order for this column.
            ascending = not name.startswith("-")
            if not ascending:
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            columns.append(OrderByClauseColumn(element=element, column=column, ordering=ascending))
        self.order_by_columns = columns

        # Only non-key columns force the element's own table into the query.
        self.elements = NamedValueSet(
            item.element for item in columns if item.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """

296 

297 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
        ``dataId.hasRecords()`` must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and
        ``dataId`` is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        # Normalize ``expression`` into a QueryWhereExpression, folding in
        # any bind parameters; bind may not be combined with an
        # already-constructed expression object.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = OrderByClause(order_by, requested) if order_by is not None else None
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element participates spatially when it is the most precise
        # spatial element for its family among the joined dimensions AND it
        # was not already identified by the given data ID.
        participants: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            best = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(best, DimensionElement)
            if best not in self.where.dataId.graph.elements:
                participants.add(best)
        if len(participants) > 1:
            # A spatial join between families needs the common SkyPix
            # system to connect the participants.
            participants.add(self.universe.commonSkyPix)
        elif len(participants) == 1:
            if not self.where.dataId.graph.spatial:
                # No spatial join and no given-region filter: any spatial
                # information this element carries is irrelevant here.
                return NamedValueSet().freeze()
            # A given region can only be filtered against SkyPix
            # dimensions; if our single participant isn't one, bring in the
            # common SkyPix dimension and join the participant to it.
            (only,) = participants
            if not isinstance(only, SkyPixDimension):
                participants.add(self.universe.commonSkyPix)
        return participants.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # Only one temporal family exists in the current dimension
            # configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names)
        names.update(self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        tables = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            tables.update(self.order_by.elements)
        # Implied dimensions have no foreign keys pointing at them, so their
        # own tables must provide their key columns.
        tables.update(dimension for dimension in self.mustHaveKeysJoined if dimension.implied)
        # Elements flagged alwaysJoin define relationships that must be
        # enforced whenever their dimensions are present.
        tables.update(
            element
            for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements
            if element.alwaysJoin
        )
        return tables.freeze()

469 

470 

@dataclass
class DatasetQueryColumns:
    """Struct of the columns needed to rebuild `DatasetRef` instances from
    the rows a query returns.
    """

    datasetType: DatasetType
    """Dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column holding the dataset's unique integer ID.
    """

    runKey: ColumnElement
    """Foreign key column referencing the `~CollectionType.RUN` collection
    that contains this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column holding the ingest timestamp; not part of `DatasetRef` itself
    but stored in the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Note: ingestDate is deliberately excluded from iteration.
        yield from (self.id, self.runKey)

498 

499 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        # Start fully empty; builders populate these as tables are joined.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        if self.keys or self.timespans or self.regions:
            return False
        return self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implentation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # The last column is chosen purely for human readers of the emitted
        # SQL (e.g. developers debugging things): it makes it more likely a
        # dimension key comes from the dimension's own table, or failing
        # that some closely related dimension, rather than e.g. a dataset
        # subquery.  The database doesn't care, because the query guarantees
        # all columns in the list have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]

579 

580 

@dataclass
class RegistryManagers:
    """Bundle of the manager objects that back a `Registry`, passed around
    internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and their types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimension records (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Class encapsulating how this database represents timespans
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """