Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 35%

179 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-05 02:04 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["QuerySummary"] # other classes here are local to subpackage 

24 

25import dataclasses 

26from collections.abc import Iterable, Mapping, Set 

27from typing import Any 

28 

29import astropy.time 

30from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm 

31from lsst.sphgeom import Region 

32from lsst.utils.classes import cached_getter, immutable 

33 

34from ...core import ( 

35 DataCoordinate, 

36 DatasetType, 

37 DimensionElement, 

38 DimensionGraph, 

39 DimensionKeyColumnTag, 

40 DimensionRecordColumnTag, 

41 DimensionUniverse, 

42 NamedValueAbstractSet, 

43 NamedValueSet, 

44 SkyPixDimension, 

45 Timespan, 

46) 

47 

48# We're not trying to add typing to the lex/yacc parser code, so MyPy 

49# doesn't know about some of these imports. 

50from .expressions import make_string_expression_predicate 

51from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName 

52 

53 

@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        timespan: Timespan | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        bind : `Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance.  If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.  If `None` and
            ``data_id`` is an expanded data ID, ``data_id.region`` will be
            used to construct one.
        timespan : `Timespan`, optional
            A temporal constraint that all rows must overlap.  If `None` and
            ``data_id`` is an expanded data ID, ``data_id.timespan`` will be
            used to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring
            to a visit without an instrument).  Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        # An expanded data ID can supply spatial/temporal constraints that
        # the caller did not pass explicitly; explicit arguments win.
        if data_id is not None and data_id.hasRecords():
            region = region if region is not None else data_id.region
            timespan = timespan if timespan is not None else data_id.timespan
        universe = dimensions.universe
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(universe)
        # Parse the string expression up front; this also extracts any
        # constraints it (or the data ID) places on governor dimensions.
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            timespan=timespan,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """Predicate form of the user-provided string expression
    (`expressions.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """Data ID for dimensions whose values are known before the query is
    constructed (`DataCoordinate`).
    """

    region: Region | None
    """Spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    timespan: Timespan | None
    """Temporal constraint that all result rows must overlap
    (`Timespan` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """

169 

170 

@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` to order by the element's primary key
    (`str` or `None`).
    """

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

183 

184 

@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in ORDER BY clause."""

    @staticmethod
    def _parse_direction(name: str) -> tuple[str, bool]:
        """Split an optional leading "-" (descending) marker off a name.

        Parameters
        ----------
        name : `str`
            User-provided ORDER BY term, possibly prefixed with "-".

        Returns
        -------
        name : `str`
            The name with any "-" prefix removed.
        ascending : `bool`
            `True` for ascending order, `False` if a "-" prefix was present.

        Raises
        ------
        ValueError
            Raised if the name is empty (or consists only of "-").
        """
        if not name or name == "-":
            raise ValueError("Empty dimension name in ORDER BY")
        if name.startswith("-"):
            return name[1:], False
        return name, True

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for raw in order_by:
            name, ascending = cls._parse_direction(raw)
            element, column = categorizeOrderByName(graph, name)
            terms.append(cls._make_term(element, column, ascending))
        return cls(terms)

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for raw in order_by:
            name, ascending = cls._parse_direction(raw)
            column = categorizeElementOrderByName(element, name)
            terms.append(cls._make_term(element, column, ascending))
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            # Sort on the dimension's own key value.
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            # Sort on one endpoint of the record's timespan, extracted via
            # the "lower"/"upper" methods on the timespan expression.
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            # Sort on a regular dimension-record column.
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)

    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY clause
    (`Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags

299 

300 

@immutable
class ElementOrderByClause:
    """Class for information about columns in ORDER BY clause for one element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Dimensions used by a query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        columns: list[OrderByClauseColumn] = []
        for raw in order_by:
            if not raw or raw == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order.
            descending = raw.startswith("-")
            field = categorizeElementOrderByName(element, raw[1:] if descending else raw)
            columns.append(OrderByClauseColumn(element=element, column=field, ordering=not descending))
        self.order_by_columns = columns

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

331 

332 

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        If `None` and ``data_id`` is an expanded data ID, ``data_id.region``
        will be used to construct one.
    timespan : `Timespan`, optional
        A temporal constraint that all rows must overlap.  If `None` and
        ``data_id`` is an expanded data ID, ``data_id.timespan`` will be used
        to construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument).  Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        timespan: Timespan | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        # When exactly one dataset type is involved, unqualified dataset
        # columns in the string expression can be assumed to refer to it.
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            bind=bind,
            data_id=data_id,
            region=region,
            timespan=timespan,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        # Derive the full set of columns (and hence dimensions) the query
        # must provide, now that all contributing clauses are known.
        self.columns_required, self.dimensions = self._compute_columns_required()

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned (prior to postprocessing filters) and
    optional integer offset, respectively
    (`tuple` [ `int`, `int` or `None` ] or `None`).
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.dimensions.spatial:
            element = family.choose(self.dimensions.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.data_id.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.data_id.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix
                # dimension to the query; the element we have will be joined
                # to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        return result.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.dimensions.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        # Only elements whose "timespan" column is referenced by the string
        # expression need their timespans in the query.
        result = NamedValueSet[DimensionElement]()
        if self.where.expression_predicate is not None:
            for tag in DimensionRecordColumnTag.filter_from(self.where.expression_predicate.columns_required):
                if tag.column == "timespan":
                    result.add(self.requested.universe[tag.element])
        return result.freeze()

    def _compute_columns_required(self) -> tuple[set[ColumnTag], DimensionGraph]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.

        Returns
        -------
        tags : `set` [ `ColumnTag` ]
            All columns the query must provide directly.
        dimensions : `DimensionGraph`
            Expanded graph of all dimensions referenced by those columns.
        """
        # Start from the requested dimensions, then fold in dimensions
        # referenced by the data ID, the dataset types, the string-expression
        # predicate, and the ORDER BY clause.
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        tags.update(DimensionKeyColumnTag.generate(self.where.data_id.graph.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions)