Coverage for python/lsst/daf/butler/registry/queries/_query_context.py: 55% of 74 statements


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryContext",)

from abc import abstractmethod
from collections.abc import Iterable, Set
from contextlib import AbstractContextManager
from typing import Any

import lsst.sphgeom
from lsst.daf.relation import (
    ColumnExpression,
    ColumnTag,
    Engine,
    EngineError,
    Predicate,
    Processor,
    Relation,
    UnaryOperation,
    iteration,
)

from ..._column_tags import DimensionKeyColumnTag
from ..._timespan import Timespan
from ...dimensions import DataCoordinate, SkyPixDimension


class QueryContext(Processor, AbstractContextManager["QueryContext"]):

55 """A context manager interface for query operations that require some 

56 connection-like state. 

57 

58 Notes 

59 ----- 

60 `QueryContext` implementations are usually paired with a `QueryBackend` 

61 implementation, with the division of responsibilities as follows: 

62 

63 - `QueryContext` implements the `lsst.daf.relation.Processor` interface, 

64 and is hence responsible for executing multi-engine relation trees. 

65 

66 - `QueryContext` manages all state whose lifetime is a single query or set 

67 of related queries (e.g. temporary tables) via its context manager 

68 interface. Methods that do not involve this state should not require the 

69 context manager to have been entered. 

70 

71 - `QueryContext` objects should be easily to construct by registry helper 

72 code that doesn't have access to the full `Registry` data structure 

73 itself, while `QueryBackend` instances can generally only be constructed 

74 by code that does see essentially the full registry (for example, 

75 `SqlQueryBackend` holds a `RegistryManagerInstances` struct, while 

76 `SqlQueryContext` can be constructed with just a `Database` and 

77 `ColumnTypeInfo`). 

78 

79 - `QueryBackend.context` is a factory for the associated `QueryContext` 

80 type. 

81 

82 - `QueryBackend` methods that return relations accept the `QueryContext` 

83 returned by its `~QueryBackend.context` method in case those methods 

84 require state that should be cleaned up after the query is complete. 

85 """ 

    def __init__(self) -> None:
        self.iteration_engine = iteration.Engine()
        self.iteration_engine.functions[regions_overlap.__name__] = regions_overlap

    iteration_engine: iteration.Engine
    """The relation engine that all relations must ultimately be transferred
    to in order to be executed by this context.
    """

    @property
    def preferred_engine(self) -> Engine:
        """Return the relation engine that this context prefers to execute
        operations in (`lsst.daf.relation.Engine`).
        """
        return self.iteration_engine

    @property
    @abstractmethod
    def is_open(self) -> bool:
        """Whether the context manager has been entered (`bool`)."""
        raise NotImplementedError()

    def make_initial_relation(self, relation: Relation | None = None) -> Relation:
        """Construct an initial relation suitable for this context.

        Parameters
        ----------
        relation : `Relation`, optional
            A user-provided initial relation. Must be included by
            implementations when provided, but may be modified (e.g. by adding
            a transfer to a new engine) when needed to satisfy the context's
            invariants.
        """
        if relation is None:
            return self.preferred_engine.make_join_identity_relation()
        return relation

    def fetch_iterable(self, relation: Relation) -> iteration.RowIterable:
        """Execute the given relation and return its rows as an iterable of
        mappings.

        Parameters
        ----------
        relation : `Relation`
            Relation representing the query to execute.

        Returns
        -------
        rows : `~lsst.daf.relation.iteration.RowIterable`
            An iterable over rows, with each row a mapping from `ColumnTag`
            to column value.

        Notes
        -----
        A transfer to `iteration_engine` will be added to the root (end) of
        the relation tree if the root is not already in the iteration engine.

        Any transfers from other engines or persistent materializations will
        be handled by delegating to `process` before execution in the
        iteration engine.

        To ensure the result is a multi-pass Python collection in memory,
        ensure the given tree ends with a materialization operation in the
        iteration engine.
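
        Examples
        --------
        A minimal sketch of typical use; ``ctx`` (a concrete `QueryContext`
        implementation) and ``relation`` are hypothetical stand-ins, not
        names defined in this module::

            with ctx:  # enter the connection-like state
                for row in ctx.fetch_iterable(relation):
                    # Each row is a mapping from ColumnTag to column value.
                    print(row)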

151 """ 

152 # This transfer does nothing if the relation is already in the 

153 # iteration engine. 

154 relation = relation.transferred_to(self.iteration_engine) 

155 relation = self.process(relation) 

156 return self.iteration_engine.execute(relation) 

157 

    @abstractmethod
    def count(self, relation: Relation, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in the given relation.

        Parameters
        ----------
        relation : `Relation`
            Relation whose rows are to be counted.
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them. If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch
            its rows into a new query (or passing ``exact=False``). Ignored
            if ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the relation, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
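
        Examples
        --------
        Illustrative only; ``ctx`` and ``relation`` are hypothetical
        stand-ins for a concrete implementation and a relation tree::

            # Cheap upper bound; may overcount if client-side filtering
            # would drop rows.
            upper_bound = ctx.count(relation, exact=False)
            # Exact count, even if rows must be fetched and discarded.
            n_rows = ctx.count(relation, exact=True, discard=True)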

190 """ 

191 raise NotImplementedError() 

192 

    @abstractmethod
    def any(self, relation: Relation, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this relation has any result rows at all.

        Parameters
        ----------
        relation : `Relation`
            Relation to be checked.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the relation has any rows, or (if ``exact=False``)
            whether it may have any rows.

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
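
        Examples
        --------
        Illustrative only; ``ctx`` and ``relation`` are hypothetical::

            # Fast existence check; may return True for rows that post-query
            # filtering would remove.
            maybe_rows = ctx.any(relation, execute=True, exact=False)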

222 """ 

223 raise NotImplementedError() 

224 

    def transfer(self, source: Relation, destination: Engine, materialize_as: str | None) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        raise NotImplementedError("No transfers expected by base QueryContext implementation.")

    def materialize(self, base: Relation, name: str) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        if base.engine == self.iteration_engine:
            return self.iteration_engine.execute(base).materialized()
        raise EngineError(f"Unexpected engine {base.engine} for base QueryContext implementation.")

    @abstractmethod
    def restore_columns(
        self,
        relation: Relation,
        columns_required: Set[ColumnTag],
    ) -> tuple[Relation, set[ColumnTag]]:
        """Return a modified relation tree that attempts to restore columns
        that were dropped by a projection operation.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        columns_required : `~collections.abc.Set` [ `ColumnTag` ]
            Columns to attempt to restore. May include columns already
            present in the relation.

        Returns
        -------
        modified : `Relation`
            Possibly-modified tree with any projections that had dropped
            requested columns replaced by projections that do not drop these
            columns. Care is taken to ensure that join common columns and
            deduplication behavior are preserved, even if that means some
            columns are not restored.
        columns_found : `set` [ `ColumnTag` ]
            Columns from those requested that are present in ``modified``.
        """
        raise NotImplementedError()

    @abstractmethod
    def strip_postprocessing(self, relation: Relation) -> tuple[Relation, list[UnaryOperation]]:
        """Return a modified relation tree with any iteration-engine
        operations and any trailing transfer to the iteration engine
        removed.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.

        Returns
        -------
        modified : `Relation`
            Stripped relation tree, with engine != `iteration_engine`.
        stripped : `list` [ `UnaryOperation` ]
            Operations that were stripped, in the same order they should be
            reapplied (with ``transfer=True,
            preferred_engine=iteration_engine``) to recover the original
            tree.
        """
        raise NotImplementedError()

    @abstractmethod
    def drop_invalidated_postprocessing(self, relation: Relation, new_columns: Set[ColumnTag]) -> Relation:
        """Return a modified relation tree without iteration-engine
        operations that require columns that are not in the given set.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        new_columns : `~collections.abc.Set` [ `ColumnTag` ]
            The only columns that postprocessing operations may require if
            they are to be retained in the returned relation tree.

        Returns
        -------
        modified : `Relation`
            Modified relation tree with postprocessing operations
            incompatible with ``new_columns`` removed.
        """
        raise NotImplementedError()

    def make_data_coordinate_predicate(
        self,
        data_coordinate: DataCoordinate,
        full: bool | None = None,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        represents a data ID constraint.

        Parameters
        ----------
        data_coordinate : `DataCoordinate`
            Data ID whose keys and values should be transformed to predicate
            equality constraints.
        full : `bool`, optional
            Whether to include constraints on implied dimensions (default is
            to include implied dimensions if ``data_coordinate`` has them).

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
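
        Examples
        --------
        Illustrative sketch; ``ctx`` and ``universe`` (a `DimensionUniverse`)
        are hypothetical stand-ins, and the dimension values are made up::

            data_id = DataCoordinate.standardize(
                instrument="HSC", visit=903334, universe=universe
            )
            # One equality term per dimension key, ANDed together.
            predicate = ctx.make_data_coordinate_predicate(data_id)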

328 """ 

329 if full is None: 

330 full = data_coordinate.hasFull() 

331 dimension_names = ( 

332 data_coordinate.required if not full else data_coordinate.dimensions.data_coordinate_keys 

333 ) 

334 terms: list[Predicate] = [] 

335 for dimension_name in dimension_names: 

336 dimension = data_coordinate.universe.dimensions[dimension_name] 

337 dtype = dimension.primaryKey.getPythonType() 

338 terms.append( 

339 ColumnExpression.reference(DimensionKeyColumnTag(dimension_name), dtype=dtype).eq( 

340 ColumnExpression.literal(data_coordinate[dimension_name], dtype=dtype) 

341 ) 

342 ) 

343 return Predicate.logical_and(*terms) 

344 

    def make_spatial_region_skypix_predicate(
        self,
        dimension: SkyPixDimension,
        region: lsst.sphgeom.Region,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a skypix dimension key column overlaps a spatial
        region.

        The returned predicate is built only from integer comparisons on the
        skypix key column (using the envelope of ``region`` in the
        dimension's pixelization), so it is not restricted to `iteration
        engines <lsst.daf.relation.iteration.Engine>`; it is usually
        evaluated in SQL as a coarse constraint that
        `make_spatial_region_overlap_predicate` later refines.

        Parameters
        ----------
        dimension : `SkyPixDimension`
            Dimension whose key column is being constrained.
        region : `lsst.sphgeom.Region`
            Spatial region constraint to test against.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with the `DimensionKeyColumnTag` associated with
            ``dimension`` as its only required column.
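
        Examples
        --------
        Illustrative sketch; ``ctx`` and ``universe`` are hypothetical
        stand-ins, and the skypix system and region are arbitrary::

            import lsst.sphgeom

            region = lsst.sphgeom.Circle(
                lsst.sphgeom.UnitVector3d(1.0, 0.0, 0.0),
                lsst.sphgeom.Angle.fromDegrees(1.0),
            )
            predicate = ctx.make_spatial_region_skypix_predicate(
                universe["htm7"], region
            )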

369 """ 

370 ref = ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=int) 

371 terms: list[Predicate] = [] 

372 for begin, end in dimension.pixelization.envelope(region): 

373 if begin + 1 == end: 

374 terms.append(ref.eq(ColumnExpression.literal(begin, dtype=int))) 

375 else: 

376 terms.append( 

377 ref.ge(ColumnExpression.literal(begin, dtype=int)).logical_and( 

378 ref.lt(ColumnExpression.literal(end, dtype=int)) 

379 ) 

380 ) 

381 return Predicate.logical_or(*terms) 

382 

    def make_spatial_region_overlap_predicate(
        self,
        lhs: ColumnExpression,
        rhs: ColumnExpression,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether two regions overlap.

        This operation only works with
        `iteration engines <lsst.daf.relation.iteration.Engine>`; it is
        usually used to refine the result of a join or constraint on
        `SkyPixDimension` columns in SQL.

        Parameters
        ----------
        lhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for one spatial region.
        rhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for the other spatial region.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with ``lhs`` and ``rhs`` as its required columns.
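
        Examples
        --------
        Illustrative sketch; ``ctx`` and the region column tags are
        hypothetical stand-ins::

            lhs = ColumnExpression.reference(
                visit_region_tag, dtype=lsst.sphgeom.Region
            )
            rhs = ColumnExpression.reference(
                patch_region_tag, dtype=lsst.sphgeom.Region
            )
            predicate = ctx.make_spatial_region_overlap_predicate(lhs, rhs)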

407 """ 

408 return lhs.predicate_method(regions_overlap.__name__, rhs, supporting_engine_types={iteration.Engine}) 

409 

    def make_timespan_overlap_predicate(
        self,
        tag: ColumnTag,
        timespan: Timespan,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a timespan column overlaps a timespan literal.

        Parameters
        ----------
        tag : `ColumnTag`
            Identifier for a timespan column.
        timespan : `Timespan`
            Timespan literal selected rows must overlap.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
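
        Examples
        --------
        Illustrative sketch; ``ctx``, ``tag``, and the endpoint times
        (`astropy.time.Time` instances) are hypothetical::

            predicate = ctx.make_timespan_overlap_predicate(
                tag, Timespan(begin=t_start, end=t_end)
            )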

429 """ 

430 return ColumnExpression.reference(tag, dtype=Timespan).predicate_method( 

431 "overlaps", ColumnExpression.literal(timespan) 

432 ) 

433 

    def make_data_id_relation(
        self, data_ids: Set[DataCoordinate], dimension_names: Iterable[str]
    ) -> Relation:
        """Transform a set of data IDs into a relation.

        Parameters
        ----------
        data_ids : `~collections.abc.Set` [ `DataCoordinate` ]
            Data IDs to upload. All must have at least the dimensions given,
            but may have more.
        dimension_names : `~collections.abc.Iterable` [ `str` ]
            Names of dimensions that will be the columns of the relation.

        Returns
        -------
        relation : `Relation`
            Relation in the iteration engine.
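
        Examples
        --------
        Illustrative sketch; ``ctx`` and the data IDs are hypothetical::

            relation = ctx.make_data_id_relation(
                {data_id_1, data_id_2}, ["instrument", "detector"]
            )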

451 """ 

452 tags = DimensionKeyColumnTag.generate(dimension_names) 

453 payload = iteration.RowSequence( 

454 [{tag: data_id[tag.dimension] for tag in tags} for data_id in data_ids] 

455 ).to_mapping(tags) 

456 return self.iteration_engine.make_leaf(frozenset(tags), payload, name_prefix="upload") 

457 

458 

def regions_overlap(a: lsst.sphgeom.Region, b: lsst.sphgeom.Region) -> bool:
    """Test whether a pair of regions overlap.

    Parameters
    ----------
    a : `lsst.sphgeom.Region`
        One region.
    b : `lsst.sphgeom.Region`
        The other region.

    Returns
    -------
    overlap : `bool`
        Whether the regions overlap.
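
    Examples
    --------
    Illustrative only; any two `lsst.sphgeom.Region` instances work, and a
    region trivially overlaps itself::

        c = lsst.sphgeom.Circle(
            lsst.sphgeom.UnitVector3d(0.0, 0.0, 1.0),
            lsst.sphgeom.Angle.fromDegrees(1.0),
        )
        assert regions_overlap(c, c)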

473 """ 

474 return not (a.relate(b) & lsst.sphgeom.DISJOINT)