Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 40%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._column_type_info import ColumnTypeInfo
from ..._dataset_type import DatasetType
from ..._named import NamedValueAbstractSet, NamedValueSet
from ...dimensions import DataCoordinate, DimensionElement, DimensionGroup, DimensionUniverse, SkyPixDimension

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGroup,
        expression: str = "",
        *,
        column_types: ColumnTypeInfo,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        column_types : `ColumnTypeInfo`
            Information about column types.
        bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is None:
            data_id = DataCoordinate.make_empty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.make_empty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`~collections.abc.Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """
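
# Example (an illustrative sketch only, not exercised by this module): the
# names ``universe``, ``column_types``, and ``data_id`` below are assumed to
# come from an enclosing `Registry`; they are not defined here.
#
#     where = QueryWhereClause.combine(
#         universe.conform(["instrument", "visit"]),
#         expression="visit > 100",
#         column_types=column_types,
#         data_id=data_id,  # e.g. a data ID holding {"instrument": "HSC"}
#     )
#     where.governor_constraints  # e.g. {"instrument": {"HSC"}}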


@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about the columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], dimensions: DimensionGroup) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        dimensions : `DimensionGroup`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(dimensions, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            found_element, column = categorizeElementOrderByName(element, name)
            term = cls._make_term(found_element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)

    terms: Iterable[SortTerm]
    """Terms that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags
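
# Example (an illustrative sketch only): a leading "-" selects descending
# order, and "timespan.begin"/"timespan.end" are mapped by ``_make_term`` to
# the lower/upper bounds of the element's timespan column.  ``dimensions`` is
# assumed to be a `DimensionGroup` containing ``visit``, and the qualified
# "visit.timespan.begin" form is assumed to be accepted by the categorizer.
#
#     clause = OrderByClause.parse_general(["visit", "-visit.timespan.begin"], dimensions)
#     clause.columns_required  # tags for the visit key and the visit timespan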


@immutable
class ElementOrderByClause:
    """Class for information about the ORDER BY columns for a single element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            found_element, column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=found_element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `OrderByClauseColumn` ]).
    """
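
# Example (an illustrative sketch only): ``visit_element`` is assumed to be
# the `DimensionElement` for ``visit``, and the field names are illustrative
# record fields for that element.
#
#     clause = ElementOrderByClause(["day_obs", "-timespan.begin"], visit_element)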


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGroup`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    column_types : `ColumnTypeInfo`
        Information about column types.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGroup,
        *,
        column_types: ColumnTypeInfo,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()

    requested: DimensionGroup
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGroup`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Integer offset and maximum number of rows returned (prior to
    postprocessing filters), respectively.
    """

    dimensions: DimensionGroup
    """All dimensions in the query in any form (`DimensionGroup`).
    """

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this represents
    the region actually used directly as a constraint on the query results,
    which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGroup, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension_name in self.where.data_id.dimensions.names:
            dimension_tag = DimensionKeyColumnTag(dimension_name)
            if dimension_tag in tags:
                continue
            if skypix_dimension := self.universe.skypix_dimensions.get(dimension_name):
                if skypix_dimension == self.universe.commonSkyPix:
                    # The common skypix dimension should be available from
                    # spatial join tables.
                    tags.add(dimension_tag)
                else:
                    # This is a SkyPixDimension other than the common one. If
                    # it's not already present in the query (e.g. from a
                    # dataset join), this is a pure spatial constraint, which
                    # we can only apply by modifying the 'region' for the
                    # query. That will also require that we join in the common
                    # skypix dimension.
                    pixel = skypix_dimension.pixelization.pixel(self.where.data_id[dimension_name])
                    if region is None:
                        region = pixel
                    else:
                        region = IntersectionRegion(region, pixel)
            else:
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should
                # be joined. This makes these data ID constraints work
                # just like simple 'where' constraints, which is good.
                tags.add(dimension_tag)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGroup.
        dimensions = DimensionGroup(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the common
        # skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements.names, self.universe)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if (
                    not isinstance(element, SkyPixDimension)
                    and self.universe.commonSkyPix.name not in dimensions
                ):
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.minimal_group)
            tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)
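
# Example (an illustrative sketch only): a summary for a data-ID query.  The
# names ``universe``, ``column_types``, and ``defaults`` are assumed to come
# from an enclosing `Registry`; they are not defined in this module.
#
#     summary = QuerySummary(
#         universe.conform(["detector", "visit"]),
#         column_types=column_types,
#         expression="instrument = 'HSC' AND visit > 100",
#         defaults=defaults,
#         order_by=["visit", "-detector"],
#         limit=(0, 1000),  # offset 0, at most 1000 rows (see ``limit`` above)
#     )
#     summary.dimensions        # all dimensions the query must join in
#     summary.columns_required  # columns the joined relations must provide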