Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 40%

166 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    ColumnTypeInfo,
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        column_types: ColumnTypeInfo,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        column_types : `ColumnTypeInfo`
            Information about column types.
        bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`expressions.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`~collections.abc.Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """
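
# Example (illustrative sketch, not part of the original module): combining a
# string expression with bind values into a WHERE-clause struct.  Here
# ``universe`` is assumed to be a `DimensionUniverse` and ``column_types`` a
# `ColumnTypeInfo` obtained from an existing registry; the expression and
# constraint values are hypothetical.
#
#     where = QueryWhereClause.combine(
#         universe.extract(["visit"]),
#         expression="instrument = 'HSC' AND visit > my_cutoff",
#         bind={"my_cutoff": 100},
#         column_types=column_types,
#     )
#     where.governor_constraints  # e.g. {"instrument": {"HSC"}}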

@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
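
    # Example (illustrative, not part of the original module): a "-" prefix
    # requests descending order, and each name is resolved to a dimension or
    # dimension-record column via ``categorizeOrderByName``.  ``graph`` is
    # assumed to be a `DimensionGraph` containing the visit and band
    # dimensions.
    #
    #     clause = OrderByClause.parse_general(["visit", "-band"], graph)
    #     # clause.terms[0] sorts ascending on the visit key;
    #     # clause.terms[1] sorts descending on the band key.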

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
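
    # Example (illustrative): in the single-element form, names are resolved
    # against that element's own record fields.  ``visit_element`` is assumed
    # to be ``universe["visit"]``.
    #
    #     clause = OrderByClause.parse_element(["-exposure_time"], visit_element)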

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            `DimensionRecord` field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)
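
    # Example (illustrative): "timespan.begin" is not a real column, so a
    # sort term on it is built as a method call on the timespan column,
    # equivalent to:
    #
    #     ColumnExpression.reference(
    #         DimensionRecordColumnTag("exposure", "timespan")
    #     ).method("lower", dtype=astropy.time.Time)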

    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY clause
    (`~collections.abc.Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags
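
# Example (illustrative): for a clause parsed from ["-visit.exposure_time"],
# ``columns_required`` would hold a single
# ``DimensionRecordColumnTag("visit", "exposure_time")``, telling the query
# builder which columns it must join in before it can sort.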

@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY clause
    (`~collections.abc.Iterable` [ `OrderByClauseColumn` ]).
    """
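
# Example (illustrative): this class mirrors `OrderByClause.parse_element`
# but records `OrderByClauseColumn` structs instead of relation sort terms,
# for callers that still consume the column-level description.  Here
# ``detector_element`` is assumed to be ``universe["detector"]``.
#
#     clause = ElementOrderByClause(["-purpose"], detector_element)
#     # clause.order_by_columns[0].ordering is False (descending)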

@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    column_types : `ColumnTypeInfo`
        Information about column types.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        column_types: ColumnTypeInfo,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of
    the query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned (prior to postprocessing filters) and
    optional integer offset, respectively, or `None` for no limit.
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this
    represents the region actually used directly as a constraint on the query
    results, which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGraph, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension in self.where.data_id.graph:
            dimension_tag = DimensionKeyColumnTag(dimension.name)
            if dimension_tag in tags:
                continue
            if dimension == self.universe.commonSkyPix or not isinstance(dimension, SkyPixDimension):
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should be
                # joined.  This makes these data ID constraints work just
                # like simple 'where' constraints, which is good.
                tags.add(dimension_tag)
            else:
                # This is a SkyPixDimension other than the common one.  If
                # it's not already present in the query (e.g. from a dataset
                # join), this is a pure spatial constraint, which we can only
                # apply by modifying the 'region' for the query.  That will
                # also require that we join in the common skypix dimension.
                pixel = dimension.pixelization.pixel(self.where.data_id[dimension])
                if region is None:
                    region = pixel
                else:
                    region = IntersectionRegion(region, pixel)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the
        # common skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if not isinstance(element, SkyPixDimension) and self.universe.commonSkyPix not in dimensions:
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.graph)
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)

505 if region is not None: 

506 for family in dimensions.spatial: 

507 element = family.choose(dimensions.elements) 

508 tags.add(DimensionRecordColumnTag(element.name, "region")) 

509 if not isinstance(element, SkyPixDimension) and self.universe.commonSkyPix not in dimensions: 

510 missing_common_skypix = True 

511 if missing_common_skypix: 

512 dimensions = dimensions.union(self.universe.commonSkyPix.graph) 

513 tags.update(DimensionKeyColumnTag.generate(dimensions.names)) 

514 return (tags, dimensions, region)