Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 40%

172 statements  

coverage.py v7.3.2, created at 2023-12-05 11:07 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._column_type_info import ColumnTypeInfo
from ..._dataset_type import DatasetType
from ..._named import NamedValueAbstractSet, NamedValueSet
from ...dimensions import DataCoordinate, DimensionElement, DimensionGroup, DimensionUniverse, SkyPixDimension

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGroup,
        expression: str = "",
        *,
        column_types: ColumnTypeInfo,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        column_types : `ColumnTypeInfo`
            Information about column types.
        bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is None:
            data_id = DataCoordinate.make_empty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.make_empty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`~collections.abc.Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at all.
    """


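# A minimal usage sketch (added commentary, not part of the original module):
# how a caller might assemble a WHERE clause. It assumes an already-configured
# ``DimensionUniverse`` and ``ColumnTypeInfo`` obtained from a registry; the
# dimension names, expression, and ``threshold`` bind key are invented for
# illustration.
def _example_where_clause(
    universe: DimensionUniverse, column_types: ColumnTypeInfo
) -> QueryWhereClause:
    dimensions = DimensionGroup(universe, names={"instrument", "visit", "detector"})
    return QueryWhereClause.combine(
        dimensions,
        "instrument = 'HSC' AND visit > threshold",
        column_types=column_types,
        # Bind values are substituted for matching identifiers in the string
        # expression, so literal values never need to be quoted inline.
        bind={"threshold": 100},
    )
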

@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], dimensions: DimensionGroup) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering, each with an optional "-"
            prefix for descending order.
        dimensions : `DimensionGroup`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(dimensions, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering, each with an optional "-"
            prefix for descending order.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)

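    # Illustrative note (added commentary, not in the original source): for a
    # parsed column of "timespan.begin" on the "visit" element, the branch
    # above builds roughly
    #
    #     tag = DimensionRecordColumnTag("visit", "timespan")
    #     expression = ColumnExpression.reference(tag).method(
    #         "lower", dtype=astropy.time.Time
    #     )
    #
    # i.e. the sort key is the lower bound of the visit's timespan, evaluated
    # as an `astropy.time.Time`.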

    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY clause
    (`~collections.abc.Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags


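# A usage sketch (added commentary, not part of the original module): parse
# user-facing ORDER BY strings. A leading "-" requests descending order, and
# dotted names refer to dimension-record columns; "detector.full_name" assumes
# the default universe's detector records and is purely for illustration.
def _example_order_by(universe: DimensionUniverse) -> OrderByClause:
    dimensions = DimensionGroup(universe, names={"visit", "detector"})
    return OrderByClause.parse_general(["-visit", "detector.full_name"], dimensions)
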

@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `OrderByClauseColumn` ]).
    """


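# A usage sketch (added commentary, not part of the original module): order
# the records of a single-element query. The "day_obs" column assumes an
# element whose records carry that field (e.g. exposure in the default
# universe); it is invented for illustration.
def _example_element_order_by(element: DimensionElement) -> ElementOrderByClause:
    return ElementOrderByClause(["-day_obs"], element)
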

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGroup`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    column_types : `ColumnTypeInfo`
        Information about column types.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ], optional
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGroup,
        *,
        column_types: ColumnTypeInfo,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()

    requested: DimensionGroup
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGroup`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows to return and optional integer offset,
    respectively (prior to postprocessing filters), or `None` for no limit
    (`tuple` [ `int`, `int` or `None` ] or `None`).
    """

    dimensions: DimensionGroup
    """All dimensions in the query in any form (`DimensionGroup`).
    """

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this represents
    the region actually used directly as a constraint on the query results,
    which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query
    (`~collections.abc.Set` [ `ColumnTag` ]).

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGroup, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension_name in self.where.data_id.dimensions.names:
            dimension_tag = DimensionKeyColumnTag(dimension_name)
            if dimension_tag in tags:
                continue
            if skypix_dimension := self.universe.skypix_dimensions.get(dimension_name):
                if skypix_dimension == self.universe.commonSkyPix:
                    # The common skypix dimension should be available from
                    # spatial-join tables.
                    tags.add(dimension_tag)
                else:
                    # This is a SkyPixDimension other than the common one. If
                    # it's not already present in the query (e.g. from a
                    # dataset join), this is a pure spatial constraint, which
                    # we can only apply by modifying the 'region' for the
                    # query. That will also require that we join in the common
                    # skypix dimension.
                    pixel = skypix_dimension.pixelization.pixel(self.where.data_id[dimension_name])
                    if region is None:
                        region = pixel
                    else:
                        region = IntersectionRegion(region, pixel)
            else:
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should
                # be joined. This makes these data ID constraints work
                # just like simple 'where' constraints, which is good.
                tags.add(dimension_tag)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGroup.
        dimensions = DimensionGroup(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the common
        # skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements.names, self.universe)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if (
                    not isinstance(element, SkyPixDimension)
                    and self.universe.commonSkyPix.name not in dimensions
                ):
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.minimal_group)
            tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)
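

# An end-to-end sketch (added commentary, not part of the original module) of
# building the summary a `QueryBuilder` would consume. The dataset type,
# expression, and dimension names are invented for illustration; ``universe``
# and ``column_types`` are assumed to come from a configured registry.
def _example_summary(
    universe: DimensionUniverse, column_types: ColumnTypeInfo, raw: DatasetType
) -> QuerySummary:
    summary = QuerySummary(
        DimensionGroup(universe, names={"visit", "detector"}),
        column_types=column_types,
        expression="instrument = 'HSC'",
        datasets=[raw],
        order_by=["-visit"],
    )
    # The requested dimension keys are always part of columns_required, along
    # with any columns needed by the expression and ORDER BY terms.
    assert DimensionKeyColumnTag("visit") in summary.columns_required
    return summary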