# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

from lsst.sphgeom import Region
from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    SpatialRegionDatabaseRepresentation,
    TimespanDatabaseRepresentation,
)
from ...core.utils import cached_getter, immutable
from ..interfaces import (
    CollectionManager,
    DatasetRecordStorageManager,
    DimensionRecordStorageManager,
)
from ..wildcards import GovernorDimensionRestriction
# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        check : `bool`
            If `True` (default) check the query for consistency.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(graph.universe)
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                try:
                    summary = expr.visit(CheckVisitor(dataId, graph, self._bind.keys()))
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = GovernorDimensionRestriction(
                    graph.universe,
                    **summary.governors.byName(),
                )
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
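# A minimal usage sketch (hypothetical expression and bind values; assumes
# ``universe`` is the `DimensionUniverse` of the repository being queried):
#
#     expr = QueryWhereExpression(
#         "instrument = instr AND visit > v_min",
#         bind={"instr": "HSC", "v_min": 903332},
#     )
#     where = expr.attach(universe.extract(["visit", "detector"]), check=True)
#
# ``attach`` does no further parsing; it validates the already-parsed tree
# against the requested dimensions and returns a `QueryWhereClause`.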

@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 check: bool = True):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, check=check)

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix
                # dimension to the query; the element we have will be joined
                # to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()
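    # Worked example of the categorization above (illustrative only; the
    # element and skypix choices assume the default dimension configuration):
    #
    #  - requested = {visit, detector, tract, patch}, empty data ID:
    #    two spatial families survive, so ``spatial`` holds the most precise
    #    element of each plus the common skypix dimension that connects them
    #    in a spatial join.
    #  - requested = {visit}, data ID already identifying the visit:
    #    the visit's spatial element is dropped because it is "given", leaving
    #    no spatial join or filter, so ``spatial`` is empty.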

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
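# A minimal construction sketch (hypothetical dimension and bind names;
# assumes ``universe`` is the `DimensionUniverse` backing the registry):
#
#     where_expr = QueryWhereExpression("instrument = instr", bind={"instr": "HSC"})
#     summary = QuerySummary(universe.extract(["visit", "detector"]), expression=where_expr)
#     summary.mustHaveTableJoined   # elements whose tables the builder must join
#
# Passing ``bind`` together with an already-constructed `QueryWhereExpression`
# raises `TypeError`, per ``__init__`` above; the SQL itself is assembled
# later by the query builder in this subpackage.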

@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp.  This is not a part of
    `DatasetRef`, but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or
    currently-executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        The column selected is an implementation detail, but it is guaranteed
        to be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
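# Illustrative population pattern (hypothetical table and column names; the
# real query builder code lives elsewhere in this subpackage):
#
#     columns = QueryColumns()
#     columns.keys.setdefault(visit_dim, []).append(visit_table.columns["id"])
#     columns.keys.setdefault(visit_dim, []).append(exposure_table.columns["visit"])
#
# The builder is expected to constrain all columns in each list to be equal
# (e.g. in JOIN ... ON clauses), so ``getKeyColumn(visit_dim)`` can safely
# return just the last-appended, most readable one for the SELECT list.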

@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """