Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

161 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage 

24 

25from dataclasses import dataclass 

26from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union 

27 

28from sqlalchemy.sql import ColumnElement 

29 

30from lsst.sphgeom import Region 

31from ...core import ( 

32 DataCoordinate, 

33 DatasetType, 

34 Dimension, 

35 DimensionElement, 

36 DimensionGraph, 

37 DimensionUniverse, 

38 NamedKeyDict, 

39 NamedKeyMapping, 

40 NamedValueAbstractSet, 

41 NamedValueSet, 

42 SkyPixDimension, 

43 SpatialRegionDatabaseRepresentation, 

44 TimespanDatabaseRepresentation, 

45) 

46from ...core.utils import cached_getter, immutable 

47from ..interfaces import ( 

48 CollectionManager, 

49 DatasetRecordStorageManager, 

50 DimensionRecordStorageManager, 

51) 

52from ..summaries import GovernorDimensionRestriction 

53# We're not trying to add typing to the lex/yacc parser code, so MyPy 

54# doesn't know about some of these imports. 

55from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

56 

57 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        self._tree = None
        if expression:
            try:
                self._tree = ParserYacc().parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            # A non-empty expression string must yield a non-empty tree.
            assert self._tree is not None
        self._bind = bind if bind is not None else {}

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).
        """
        # Pull the region off the data ID before we replace a missing data ID
        # with an empty one.
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that would shadow dimension element names or
            # look like dimension-column references; those would make the
            # expression ambiguous.
            elementNames = graph.universe.getStaticElements().names
            for identifier in self._bind:
                if identifier in elementNames:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, _, column = identifier.partition(".")
                if column and table in elementNames:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is None:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        elif not check:
            from .expressions import InspectionVisitor
            summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            # Convert the expression to disjunctive normal form (ORs of
            # ANDs).  That's potentially super expensive in the general
            # case (where there's a ton of nesting of ANDs and ORs), but
            # that won't be the case for the expressions we expect, and we
            # actually use disjunctive normal instead of conjunctive (i.e.
            # ANDs of ORs) because the expected worst-case is a long list
            # of OR'd-together data IDs, which is already in or very close
            # to disjunctive normal form.
            expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
            from .expressions import CheckVisitor
            # Check the expression for consistency and completeness.
            visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
            try:
                summary = expr.visit(visitor)
            except RuntimeError as err:
                # Report the failure against the original expression; also
                # show the normalized form when it differs.
                exprOriginal = str(self._tree)
                exprNormal = str(expr.toTree())
                if exprNormal == exprOriginal:
                    msg = f'Error in query expression "{exprOriginal}": {err}'
                else:
                    msg = (
                        f'Error in query expression "{exprOriginal}" '
                        f'(normalized to "{exprNormal}"): {err}'
                    )
                raise RuntimeError(msg) from None
            restriction = summary.governors
            dataId = visitor.dataId
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )

181 

182 

@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        referenced = NamedValueSet()
        for element, columnNames in self.columns.items():
            if TimespanDatabaseRepresentation.NAME in columnNames:
                referenced.add(element)
        return referenced.freeze()

240 

241 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
        ``dataId.hasRecords()`` must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and
        ``dataId`` is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 check: bool = True):
        self.requested = requested
        # Normalize ``expression`` to a QueryWhereExpression; ``bind`` may
        # only accompany a string (or absent) expression.
        if expression is None or isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion,
                                       defaults=defaults, check=check)

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        participants: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                participants.add(element)
        if len(participants) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            participants.add(self.universe.commonSkyPix)
        elif len(participants) == 1:
            if not self.where.dataId.graph.spatial:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
            # There's no spatial join, but there might be a WHERE filter
            # based on a given region.  We can only perform those filters
            # against SkyPix dimensions, so if what we have isn't one, add
            # the common SkyPix dimension to the query; the element we have
            # will be joined to that.
            element, = participants
            if not isinstance(element, SkyPixDimension):
                participants.add(self.universe.commonSkyPix)
        return participants.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our
            # current dimension configuration, so this limitation should be
            # harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        tables = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        # Any implied dimension among the joined keys also needs its own
        # table in the query.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                tables.add(dimension)
        # Elements flagged alwaysJoin must appear whenever their dimensions
        # are present.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                tables.add(element)
        return tables.freeze()

383 

384 

@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`)."""

    id: ColumnElement
    """Column containing the unique integer ID for this dataset."""

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Yield the dataset ID column, then the RUN key column, in that
        # order; ``ingestDate`` is deliberately excluded.
        yield self.id
        yield self.runKey

412 

413 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        # Start out empty; callers populate these containers incrementally
        # while the query is being built.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not self.keys and not self.timespans and not self.regions and self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # The last element is chosen entirely for human readers of the query
        # (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the
        # query guarantees they all have equal values.
        return self.keys[dimension][-1]

493 

494 

@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`)."""

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types
    (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`)."""

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """