Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%

166 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        bind : `Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance.  If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring
            to a visit without an instrument).  Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """
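
# A minimal usage sketch, kept as a comment so it does not execute at import
# time.  The names are hypothetical and not part of this module: ``universe``
# is assumed to be the repository's `DimensionUniverse`.
#
#     where = QueryWhereClause.combine(
#         universe.extract(["instrument", "visit"]),
#         expression="instrument = 'HSC' AND visit > 100",
#     )
#     # where.expression_predicate holds the parsed expression, and
#     # where.governor_constraints should constrain "instrument" to {"HSC"}.
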

@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)

    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY clause
    (`Iterable` [ `SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags
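
# A minimal parsing sketch, kept as a comment so it does not execute at
# import time.  The name ``graph`` is hypothetical: a `DimensionGraph`
# assumed to span the visit and detector dimensions.
#
#     clause = OrderByClause.parse_general(["visit", "-detector"], graph)
#
# The optional leading "-" requests descending order for that term, so this
# should yield two `SortTerm` objects: visit key ascending, then detector
# key descending.
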

@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY clause
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument).  Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned (prior to postprocessing filters) and
    optional integer offset, respectively (`tuple` [ `int`, `int` or `None` ]
    or `None`).
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`)."""

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this represents
    the region actually used directly as a constraint on the query results,
    which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query
    (`~collections.abc.Set` [ `ColumnTag` ]).

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGraph, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension in self.where.data_id.graph:
            dimension_tag = DimensionKeyColumnTag(dimension.name)
            if dimension_tag in tags:
                continue
            if dimension == self.universe.commonSkyPix or not isinstance(dimension, SkyPixDimension):
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should be
                # joined.  This makes these data ID constraints work just like
                # simple 'where' constraints, which is good.
                tags.add(dimension_tag)
            else:
                # This is a SkyPixDimension other than the common one.  If
                # it's not already present in the query (e.g. from a dataset
                # join), this is a pure spatial constraint, which we can only
                # apply by modifying the 'region' for the query.  That will
                # also require that we join in the common skypix dimension.
                pixel = dimension.pixelization.pixel(self.where.data_id[dimension])
                if region is None:
                    region = pixel
                else:
                    region = IntersectionRegion(region, pixel)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the common
        # skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if not isinstance(element, SkyPixDimension) and self.universe.commonSkyPix not in dimensions:
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.graph)
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)
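
# A minimal construction sketch, kept as a comment so it does not execute at
# import time.  The name ``registry`` is hypothetical; its ``dimensions``
# attribute is assumed to be the repository's `DimensionUniverse`.
#
#     summary = QuerySummary(
#         registry.dimensions.extract(["detector"]),
#         expression="detector.purpose = 'SCIENCE'",
#         order_by=["-detector"],
#         limit=(10, None),
#     )
#
# ``summary.columns_required`` should then include the detector key column
# plus any record columns referenced by the expression and ORDER BY terms,
# as computed by ``_compute_columns_required``.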