Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 35%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

165 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage 

24 

25from dataclasses import dataclass 

26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Type, Union 

27 

28from sqlalchemy.sql import ColumnElement 

29 

30from lsst.utils.classes import cached_getter, immutable 

31from lsst.sphgeom import Region 

32from ...core import ( 

33 DataCoordinate, 

34 DatasetType, 

35 Dimension, 

36 DimensionElement, 

37 DimensionGraph, 

38 DimensionUniverse, 

39 NamedKeyDict, 

40 NamedKeyMapping, 

41 NamedValueAbstractSet, 

42 NamedValueSet, 

43 SkyPixDimension, 

44 SpatialRegionDatabaseRepresentation, 

45 TimespanDatabaseRepresentation, 

46) 

47from ..interfaces import ( 

48 CollectionManager, 

49 DatasetRecordStorageManager, 

50 DimensionRecordStorageManager, 

51) 

52from ..summaries import GovernorDimensionRestriction 

53# We're not trying to add typing to the lex/yacc parser code, so MyPy 

54# doesn't know about some of these imports. 

55from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore 

56 

57 

@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    RuntimeError
        Raised if the expression string cannot be parsed.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                # Parse eagerly so syntax errors surface at construction time
                # rather than later, when the expression is attached to a
                # query summary.
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # The lex/yacc layer is untyped and may raise a variety of
                # exception types; normalize them all to RuntimeError with
                # the offending expression included.
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No (or empty) expression: equivalent to a WHERE clause that is
            # always true.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct bundling the expression tree, data ID, referenced
            dimensions/columns, bind values, governor restriction, and
            region for use by a `QuerySummary`.
        """
        # Fill in defaults for the optional arguments; region falls back to
        # the data ID's region before the data ID itself is defaulted.
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that shadow dimension-element names or that
            # look like "element.column" references, since those would be
            # ambiguous inside the expression.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        # Start with an unconstrained governor restriction; it is replaced
        # below only when the checked expression yields one.
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                # Local import to avoid a circular dependency with the
                # expressions subpackage.
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report the error against the original expression, and
                    # show the normalized form too when it differs (since
                    # that is what was actually checked).
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # The visitor may have injected default governor values into
                # the data ID.
                dataId = visitor.dataId
            else:
                # Unchecked path: just inspect which dimensions/columns the
                # expression references.
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            # No expression at all: an empty summary references nothing.
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )

181 

182 

@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        # An element participates temporally when the expression referenced
        # its timespan column; collect exactly those elements.
        referenced: NamedValueSet[DimensionElement] = NamedValueSet()
        for element, column_names in self.columns.items():
            if TimespanDatabaseRepresentation.NAME in column_names:
                referenced.add(element)
        return referenced.freeze()

240 

241 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  May only be
        given when ``expression`` is `None` or a `str`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given while ``expression`` is already a
        `QueryWhereExpression` (which carries its own bind values).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 datasets: Iterable[DatasetType] = (),
                 check: bool = True):
        self.requested = requested
        # Normalize `expression` into a QueryWhereExpression, folding in
        # `bind` when the caller passed a string (or nothing).
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            # A pre-built QueryWhereExpression already owns its bind values;
            # accepting another mapping here would be ambiguous.
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
                                       check=check)
        self.datasets = NamedValueSet(datasets).freeze()

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            # Elements already pinned down by the given data ID don't need a
            # spatial join.
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix
                # dimension to the query; the element we have will be joined
                # to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        # Only timespans explicitly referenced by the WHERE expression are
        # needed.
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in table of a dependent dimension
        element or dataset.
        """
        # Union of the requested dimensions, anything referenced by the WHERE
        # expression, and the dimensions of every declared dataset type.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Tables needed for spatial/temporal joins and for any non-key
        # columns referenced by the WHERE expression.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        # Implied dimensions have no foreign keys pointing at them from
        # dependents, so their own tables must be joined in.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged alwaysJoin must appear whenever their dimensions
        # do.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()

398 

399 

@dataclass
class DatasetQueryColumns:
    """A struct bundling the query-result columns needed to reconstruct
    `DatasetRef` instances.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp, this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Iterate over just the columns that identify a dataset; ingestDate
        # is deliberately excluded.
        yield from (self.id, self.runKey)

427 

428 

@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        # Every container starts empty; callers fill them in as tables are
        # joined into the query.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Report whether this query has no columns at all.
        """
        # Dataset columns count as content even though `datasets` is a single
        # struct rather than a container.
        if self.datasets is not None:
            return False
        return not (self.keys or self.timespans or self.regions)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]

508 

509 

@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    # Stored as a class attribute-style field (hence the CamelCase name):
    # this is a type, not an instance.
    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """