
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Iterator, List, Optional, Union

from sqlalchemy.sql import ColumnElement

from lsst.sphgeom import Region
from ...core import (
    TimespanDatabaseRepresentation,
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
)
from ...core.utils import cached_getter, immutable
from ..interfaces import (
    CollectionManager,
    DatasetRecordStorageManager,
    DimensionRecordStorageManager,
)
from ..wildcards import GovernorDimensionRestriction
# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .exprParser import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    """
    def __init__(self, expression: Optional[str] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap. If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        check : `bool`
            If `True` (default) check the query for consistency. This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        restriction = GovernorDimensionRestriction(graph.universe)
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
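                # For example (illustrative values only), a user expression
                # like "(instrument='A' AND visit=1) OR (instrument='A' AND
                # visit=2)" is already in disjunctive normal form, so the
                # conversion is cheap for the data-ID-list queries we expect.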

                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                try:
                    summary = expr.visit(CheckVisitor(dataId, graph))
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = GovernorDimensionRestriction(
                    graph.universe,
                    **summary.governors.byName(),
                )
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            restriction=restriction,
            region=region,
        )
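# Minimal usage sketch (illustrative only; the expression string and the
# ``graph``/``dataId`` objects below are assumed to come from the caller):
#
#     expression = QueryWhereExpression("instrument='HSC' AND visit > 100")
#     where = expression.attach(graph, dataId=dataId, check=True)
#     # ``where`` is a QueryWhereClause recording the dimensions, columns,
#     # and governor-dimension restriction implied by the expression.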


@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID. ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap. If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    check : `bool`
        If `True` (default) check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 check: bool = True):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression)
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, check=check)

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix
                # dimension to the query; the element we have will be joined
                # to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result
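    # Illustrative note (assuming the default dimension universe): if the
    # requested dimensions span two spatial families, e.g. observation
    # regions (visit) and skymap regions (tract), then ``spatial`` holds one
    # element from each family plus the common skypix dimension that connects
    # them; with a single spatial family and a region constraint, the common
    # skypix dimension is added alongside that family's element.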

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        # An element may participate temporally in the query if:
        # - it's the most precise temporal element for its system in the
        #   requested dimensions (i.e. in `self.requested.temporal`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.temporal:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1 and not self.where.dataId.graph.temporal:
            # No temporal join or filter. Even if this element might be
            # associated with temporal information, we don't need it for this
            # query.
            return NamedValueSet()
        return result

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result
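# Minimal usage sketch (illustrative only; the dimension names, data ID, and
# expression below are assumptions, not taken from this module):
#
#     summary = QuerySummary(
#         universe.extract(["instrument", "visit", "detector"]),
#         dataId=expandedDataId,  # a fully-expanded DataCoordinate
#         expression="instrument='HSC' AND visit > 100",
#     )
#     # ``summary`` would then be handed to a QueryBuilder to construct the
#     # actual SQL query.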


@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not part of
    `DatasetRef`, but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
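# Minimal usage sketch (illustrative only; ``visit_dimension`` and the
# SQLAlchemy table objects below are assumptions, not taken from this module):
#
#     columns = QueryColumns()
#     columns.keys[visit_dimension] = [visit_table.columns["id"]]
#     columns.keys[visit_dimension].append(other_table.columns["visit"])
#     # The query must constrain the two key columns to be equal;
#     # getKeyColumn() deterministically returns the last one added.
#     key = columns.getKeyColumn(visit_dimension)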


@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """