python/lsst/daf/butler/registry/queries/_structs.py

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    Timespan,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        timespan: Timespan | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        bind : `Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.region`` will be used
            to construct one.
        timespan : `Timespan`, optional
            A temporal constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.timespan`` will be
            used to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is not None and data_id.hasRecords():
            if region is None and data_id.region is not None:
                region = data_id.region
            if timespan is None and data_id.timespan is not None:
                timespan = data_id.timespan
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            timespan=timespan,
            governor_constraints=governor_constraints,
        )
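
    # Example (illustrative; ``universe`` here stands for a
    # `DimensionUniverse` and is not defined in this module):
    #
    #     where = QueryWhereClause.combine(
    #         universe.extract(["visit"]),
    #         expression="instrument = 'HSC' AND visit > 100",
    #     )
    #     where.governor_constraints  # e.g. {"instrument": {"HSC"}}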

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`expressions.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    timespan: Timespan | None
    """A temporal constraint that all result rows must overlap
    (`Timespan` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at all.
    """


@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
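
    # For example (illustrative; assumes ``graph`` includes the visit
    # dimension):
    #
    #     OrderByClause.parse_general(["visit", "-visit.exposure_time"], graph)
    #
    # sorts ascending by visit ID, then descending by the visit
    # exposure_time record column.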

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)
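
    # Summary of the branches above: ``column=None`` sorts on the dimension
    # key; "timespan.begin"/"timespan.end" sort on the lower/upper bound of
    # the element's timespan column; any other name sorts on that
    # DimensionRecord column directly.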

    terms: Iterable[SortTerm]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY clause
        (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags


@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.region`` will be used
        to construct one.
    timespan : `Timespan`, optional
        A temporal constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.timespan`` will be used
        to construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        timespan: Timespan | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            bind=bind,
            data_id=data_id,
            region=region,
            timespan=timespan,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions = self._compute_columns_required()
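
    # A minimal construction sketch (illustrative; ``universe`` is assumed to
    # be a `DimensionUniverse`, e.g. ``registry.dimensions``):
    #
    #     summary = QuerySummary(
    #         universe.extract(["detector", "visit"]),
    #         expression="instrument = 'HSC' AND visit > 100",
    #         order_by=["visit", "-detector"],
    #         limit=(1000, None),
    #     )
    #     summary.dimensions  # all dimensions the query must join in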

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned and optional integer offset,
    respectively (prior to postprocessing filters).
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.dimensions.spatial:
            element = family.choose(self.dimensions.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.data_id.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.data_id.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        return result.freeze()
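
    # Illustrative reading of the logic above: a query over both visit and
    # tract involves two spatial families, so both chosen elements land in
    # ``spatial`` and a spatial join is needed; a query over visit alone
    # constrained by a spatial data ID adds ``commonSkyPix`` so the region
    # filter has a skypix column to act on.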

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        if len(self.dimensions.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        result = NamedValueSet[DimensionElement]()
        if self.where.expression_predicate is not None:
            for tag in DimensionRecordColumnTag.filter_from(self.where.expression_predicate.columns_required):
                if tag.column == "timespan":
                    result.add(self.requested.universe[tag.element])
        return result.freeze()
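
    # For example (illustrative), a WHERE expression that references
    # ``visit.timespan`` makes the predicate require the visit "timespan"
    # record column, so ``temporal`` would contain the visit element.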

    def _compute_columns_required(self) -> tuple[set[ColumnTag], DimensionGraph]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        tags.update(DimensionKeyColumnTag.generate(self.where.data_id.graph.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions)
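
    # Illustrative example (dimension names assumed): with requested
    # dimensions including visit and an ORDER BY on "-visit.exposure_time",
    # the returned tags would include (among others)
    # DimensionKeyColumnTag("instrument"), DimensionKeyColumnTag("visit"),
    # and DimensionRecordColumnTag("visit", "exposure_time").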