Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 40%

166 statements  

coverage.py v7.2.7, created at 2023-08-12 09:20 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    ColumnTypeInfo,
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """


    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        column_types: ColumnTypeInfo,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        column_types : `ColumnTypeInfo`
            Information about column types.
        bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """

        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )


    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`~collections.abc.Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at all.
    """



@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""



@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
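
    # A sketch of the accepted syntax (``graph`` and the dimension names here
    # are hypothetical): each entry is a dimension key or an
    # ``element.column`` name, and a leading "-" requests descending order.
    #
    #     clause = OrderByClause.parse_general(["visit", "-exposure.day_obs"], graph)
    #
    # This sorts ascending on the ``visit`` key and descending on the
    # ``day_obs`` record column of ``exposure``, assuming both dimensions are
    # in ``graph``.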


    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)


    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)
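
    # A sketch of what the "timespan.begin"/"timespan.end" branch produces
    # (``element`` here is a hypothetical element named "exposure"):
    #
    #     term = OrderByClause._make_term(element, "timespan.begin", True)
    #
    # is equivalent to wrapping
    #
    #     ColumnExpression.reference(
    #         DimensionRecordColumnTag("exposure", "timespan")
    #     ).method("lower", dtype=astropy.time.Time)
    #
    # in an ascending `SortTerm`: the subfield selects the ``lower``/``upper``
    # method on the timespan column rather than a column of its own.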


    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """


    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags



@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `OrderByClauseColumn` ]).
    """



@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    column_types : `ColumnTypeInfo`
        Information about column types.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
    """


    def __init__(
        self,
        requested: DimensionGraph,
        *,
        column_types: ColumnTypeInfo,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()
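
    # A sketch of typical construction (``universe``, ``column_types``, and
    # ``raw_dataset_type`` are hypothetical stand-ins):
    #
    #     summary = QuerySummary(
    #         DimensionGraph(universe, names={"visit"}),
    #         column_types=column_types,
    #         expression="instrument = 'HSC'",
    #         datasets=[raw_dataset_type],
    #         order_by=["-visit"],
    #         limit=(10, None),
    #     )
    #
    # ``summary.dimensions`` would then include the dataset type's dimensions
    # and anything referenced by the expression, not just "visit".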


    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned (prior to postprocessing filters) and
    optional integer offset, respectively (`tuple` [ `int`, `int` or `None` ]
    or `None`).
    """


    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this represents
    the region actually used directly as a constraint on the query results,
    which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query
    (`~collections.abc.Set` [ `ColumnTag` ]).

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """


    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe


    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGraph, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension in self.where.data_id.graph:
            dimension_tag = DimensionKeyColumnTag(dimension.name)
            if dimension_tag in tags:
                continue
            if dimension == self.universe.commonSkyPix or not isinstance(dimension, SkyPixDimension):
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should be
                # joined. This makes these data ID constraints work just like
                # simple 'where' constraints, which is good.
                tags.add(dimension_tag)
            else:
                # This is a SkyPixDimension other than the common one. If it's
                # not already present in the query (e.g. from a dataset join),
                # this is a pure spatial constraint, which we can only apply
                # by modifying the 'region' for the query. That will also
                # require that we join in the common skypix dimension.
                pixel = dimension.pixelization.pixel(self.where.data_id[dimension])
                if region is None:
                    region = pixel
                else:
                    region = IntersectionRegion(region, pixel)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the common
        # skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if not isinstance(element, SkyPixDimension) and self.universe.commonSkyPix not in dimensions:
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.graph)
            tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)
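
# A sketch of the spatial-constraint branch above (``universe`` and the
# skypix name are hypothetical): a non-common skypix value in a data ID is
# turned into a region intersection rather than a table join.
#
#     dimension = universe["htm11"]               # a SkyPixDimension
#     pixel = dimension.pixelization.pixel(2048)  # data ID value -> region
#     region = IntersectionRegion(existing_region, pixel)
#
# The resulting region bounds the query, and the common skypix dimension is
# joined in so the constraint can be evaluated against other spatial tables.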