Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

193 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage 

24 

25from dataclasses import dataclass 

26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union 

27 

28from sqlalchemy.sql import ColumnElement 

29 

30from lsst.utils.classes import cached_getter, immutable 

31from lsst.sphgeom import Region 

32from ...core import ( 

33 DataCoordinate, 

34 DatasetType, 

35 Dimension, 

36 DimensionElement, 

37 DimensionGraph, 

38 DimensionUniverse, 

39 NamedKeyDict, 

40 NamedKeyMapping, 

41 NamedValueAbstractSet, 

42 NamedValueSet, 

43 SkyPixDimension, 

44 SpatialRegionDatabaseRepresentation, 

45 TimespanDatabaseRepresentation, 

46) 

47from ..interfaces import ( 

48 CollectionManager, 

49 DatasetRecordStorageManager, 

50 DimensionRecordStorageManager, 

51) 

52from ..summaries import GovernorDimensionRestriction 

53# We're not trying to add typing to the lex/yacc parser code, so MyPy 

54# doesn't know about some of these imports. 

55from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

56from .expressions.categorize import categorizeOrderByName 

57 

58 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    RuntimeError
        Raised if ``expression`` cannot be parsed; the original parser
        error is chained as the cause.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Chain the parser's own exception so the caller can see
                # which part of the expression was invalid.
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression given: treated downstream as an always-true
            # WHERE clause (self._tree is None).
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct bundling the parsed tree with the (possibly augmented)
            data ID, region, referenced dimensions/columns, bind values,
            and governor-dimension restriction.

        Raises
        ------
        RuntimeError
            Raised if a bind key conflicts with a dimension element name or
            looks like a dimension column reference, or if ``check=True``
            and the expression is inconsistent or incomplete.
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that shadow dimension elements or look like
            # "table.column" references; those would silently mask real
            # identifiers in the expression.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report against the original expression text; include the
                    # normalized form only when it actually differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # The visitor may have injected default governor values into
                # the data ID; use its (possibly augmented) copy.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )

182 

183 

@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        # An element participates temporally when the expression referenced
        # its timespan column.
        timespan_column = TimespanDatabaseRepresentation.NAME
        referenced = NamedValueSet(
            element
            for element, element_columns in self.columns.items()
            if timespan_column in element_columns
        )
        return referenced.freeze()

241 

242 

@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: Optional[str]
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

256 

@immutable
class OrderByClause:
    """Class for information about columns in ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """
    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        columns = []
        for item in order_by:
            if not item or item == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order for this column.
            if item.startswith("-"):
                ascending = False
                item = item[1:]
            else:
                ascending = True
            element, column = categorizeOrderByName(graph, item)
            columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )
        self.order_by_columns = columns
        # Only non-key columns force the element's own table into the query;
        # primary keys are available from joins already present.
        self.elements = NamedValueSet(
            c.element for c in self.order_by_columns if c.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """

295 

296 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  May only be
        provided when ``expression`` is a `str` or `None`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given but ``expression`` is already a
        `QueryWhereExpression` (which has its own bind values).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 datasets: Iterable[DatasetType] = (),
                 order_by: Optional[Iterable[str]] = None,
                 limit: Optional[Tuple[int, Optional[int]]] = None,
                 check: bool = True):
        self.requested = requested
        # Normalize `expression` into a QueryWhereExpression; `bind` is only
        # legal when we're the ones constructing that object here.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
                                       check=check)
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions, those referenced by the WHERE
        # expression, and those needed by any declared dataset types.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Start with tables needed for spatial/temporal joins and any whose
        # non-key columns the WHERE expression referenced.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        # Implied dimensions only appear as foreign keys in other tables, so
        # their own table must be joined to constrain them.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements marked alwaysJoin define relationships that must be
        # enforced whenever their dimensions appear in a query.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()

463 

464 

@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`)."""

    id: ColumnElement
    """Column containing the unique integer ID for this dataset."""

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; not part of `DatasetRef`, but
    it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Iteration deliberately yields only (id, runKey), in that order;
        # datasetType and ingestDate are not part of the column sequence.
        yield from (self.id, self.runKey)

492 

493 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        # All containers start empty; the query builder fills them in as it
        # joins tables into the query.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        if self.keys or self.timespans or self.regions:
            return False
        return self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]

573 

574 

@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`)."""

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types
    (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`)."""

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """