Coverage for python/lsst/daf/butler/registry/queries/_query_context.py: 55%

73 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ("QueryContext",) 

30 

31from abc import abstractmethod 

32from collections.abc import Iterable, Set 

33from contextlib import AbstractContextManager 

34from typing import Any 

35 

36import lsst.sphgeom 

37from lsst.daf.relation import ( 

38 ColumnExpression, 

39 ColumnTag, 

40 Engine, 

41 EngineError, 

42 Predicate, 

43 Processor, 

44 Relation, 

45 UnaryOperation, 

46 iteration, 

47) 

48 

49from ..._column_tags import DimensionKeyColumnTag 

50from ..._timespan import Timespan 

51from ...dimensions import DataCoordinate, SkyPixDimension 

52 

53 

class QueryContext(Processor, AbstractContextManager["QueryContext"]):
    """A context manager interface for query operations that require some
    connection-like state.

    Notes
    -----
    `QueryContext` implementations are usually paired with a `QueryBackend`
    implementation, with the division of responsibilities as follows:

    - `QueryContext` implements the `lsst.daf.relation.Processor` interface,
      and is hence responsible for executing multi-engine relation trees.

    - `QueryContext` manages all state whose lifetime is a single query or set
      of related queries (e.g. temporary tables) via its context manager
      interface.  Methods that do not involve this state should not require
      the context manager to have been entered.

    - `QueryContext` objects should be easy to construct by registry helper
      code that doesn't have access to the full `Registry` data structure
      itself, while `QueryBackend` instances can generally only be constructed
      by code that does see essentially the full registry (for example,
      `SqlQueryBackend` holds a `RegistryManagerInstances` struct, while
      `SqlQueryContext` can be constructed with just a `Database` and
      `ColumnTypeInfo`).

    - `QueryBackend.context` is a factory for the associated `QueryContext`
      type.

    - `QueryBackend` methods that return relations accept the `QueryContext`
      returned by its `~QueryBackend.context` method in case those methods
      require state that should be cleaned up after the query is complete.
    """

    def __init__(self) -> None:
        self.iteration_engine = iteration.Engine()
        # Register regions_overlap under its own function name so predicates
        # built by make_spatial_region_overlap_predicate (which reference it
        # by name) can be resolved when executed in this engine.
        self.iteration_engine.functions[regions_overlap.__name__] = regions_overlap

    iteration_engine: iteration.Engine
    """The relation engine that all relations must ultimately be transferred
    to in order to be executed by this context.
    """

    @property
    def preferred_engine(self) -> Engine:
        """Return the relation engine that this context prefers to execute
        operations in (`lsst.daf.relation.Engine`).
        """
        return self.iteration_engine

    @property
    @abstractmethod
    def is_open(self) -> bool:
        """Whether the context manager has been entered (`bool`)."""
        raise NotImplementedError()

    def make_initial_relation(self, relation: Relation | None = None) -> Relation:
        """Construct an initial relation suitable for this context.

        Parameters
        ----------
        relation : `Relation`, optional
            A user-provided initial relation.  Must be included by
            implementations when provided, but may be modified (e.g. by adding
            a transfer to a new engine) when needed to satisfy the context's
            invariants.

        Returns
        -------
        relation : `Relation`
            The given relation unchanged, or a join-identity relation in the
            preferred engine when none was provided.
        """
        if relation is None:
            return self.preferred_engine.make_join_identity_relation()
        return relation

    def fetch_iterable(self, relation: Relation) -> iteration.RowIterable:
        """Execute the given relation and return its rows as an iterable of
        mappings.

        Parameters
        ----------
        relation : `Relation`
            Relation representing the query to execute.

        Returns
        -------
        rows : `~lsst.daf.relation.iteration.RowIterable`
            An iterable over rows, with each row a mapping from `ColumnTag`
            to column value.

        Notes
        -----
        A transfer to `iteration_engine` will be added to the root (end) of
        the relation tree if the root is not already in the iteration engine.

        Any transfers from other engines or persistent materializations will
        be handled by delegating to `process` before execution in the
        iteration engine.

        To ensure the result is a multi-pass Python collection in memory,
        ensure the given tree ends with a materialization operation in the
        iteration engine.
        """
        # This transfer does nothing if the relation is already in the
        # iteration engine.
        relation = relation.transferred_to(self.iteration_engine)
        # Resolve any remaining transfers/materializations before execution.
        relation = self.process(relation)
        return self.iteration_engine.execute(relation)

    @abstractmethod
    def count(self, relation: Relation, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in the given relation.

        Parameters
        ----------
        relation : `Relation`
            Relation whose rows are to be counted.
        exact : `bool`, optional
            If `True` (default), return the exact number of rows.  If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them.  If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch its
            rows into a new query (or passing ``exact=False``).  Ignored if
            ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the relation, or an upper bound.  This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, relation: Relation, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this relation has any result rows at all.

        Parameters
        ----------
        relation : `Relation`
            Relation to be checked.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the relation has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        raise NotImplementedError()

    def transfer(self, source: Relation, destination: Engine, materialize_as: str | None) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        # The base context only knows about the iteration engine, so any
        # cross-engine transfer must be implemented by a subclass.
        raise NotImplementedError("No transfers expected by base QueryContext implementation.")

    def materialize(self, base: Relation, name: str) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        if base.engine == self.iteration_engine:
            # Execute now and cache the rows as an in-memory collection.
            return self.iteration_engine.execute(base).materialized()
        raise EngineError(f"Unexpected engine {base.engine} for base QueryContext implementation.")

    @abstractmethod
    def restore_columns(
        self,
        relation: Relation,
        columns_required: Set[ColumnTag],
    ) -> tuple[Relation, set[ColumnTag]]:
        """Return a modified relation tree that attempts to restore columns
        that were dropped by a projection operation.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        columns_required : `~collections.abc.Set` [ `ColumnTag` ]
            Columns to attempt to restore.  May include columns already
            present in the relation.

        Returns
        -------
        modified : `Relation`
            Possibly-modified tree with any projections that had dropped
            requested columns replaced by projections that do not drop these
            columns.  Care is taken to ensure that join common columns and
            deduplication behavior is preserved, even if that means some
            columns are not restored.
        columns_found : `set` [ `ColumnTag` ]
            Columns from those requested that are present in ``modified``.
        """
        raise NotImplementedError()

    @abstractmethod
    def strip_postprocessing(self, relation: Relation) -> tuple[Relation, list[UnaryOperation]]:
        """Return a modified relation tree without any iteration-engine
        operations and any transfer to the iteration engine at the end.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.

        Returns
        -------
        modified : `Relation`
            Stripped relation tree, with engine != `iteration_engine`.
        stripped : `list` [ `UnaryOperation` ]
            Operations that were stripped, in the same order they should be
            reapplied (with ``transfer=True,
            preferred_engine=iteration_engine``) to recover the original tree.
        """
        raise NotImplementedError()

    @abstractmethod
    def drop_invalidated_postprocessing(self, relation: Relation, new_columns: Set[ColumnTag]) -> Relation:
        """Return a modified relation tree without iteration-engine operations
        that require columns that are not in the given set.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        new_columns : `~collections.abc.Set` [ `ColumnTag` ]
            The only columns that postprocessing operations may require if
            they are to be retained in the returned relation tree.

        Returns
        -------
        modified : `Relation`
            Modified relation tree with postprocessing operations incompatible
            with ``new_columns`` removed.
        """
        raise NotImplementedError()

    def make_data_coordinate_predicate(
        self,
        data_coordinate: DataCoordinate,
        full: bool | None = None,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        represents a data ID constraint.

        Parameters
        ----------
        data_coordinate : `DataCoordinate`
            Data ID whose keys and values should be transformed to predicate
            equality constraints.
        full : `bool`, optional
            Whether to include constraints on implied dimensions (default is
            to include implied dimensions if ``data_coordinate`` has them).

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
        """
        if full is None:
            full = data_coordinate.hasFull()
        dimensions = data_coordinate.graph.required if not full else data_coordinate.graph.dimensions
        terms: list[Predicate] = []
        for dimension in dimensions:
            # Use the dimension's primary-key Python type so the reference
            # and the literal it is compared to agree on dtype.
            dtype = dimension.primaryKey.getPythonType()
            terms.append(
                ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=dtype).eq(
                    ColumnExpression.literal(data_coordinate[dimension.name], dtype=dtype)
                )
            )
        return Predicate.logical_and(*terms)

    def make_spatial_region_skypix_predicate(
        self,
        dimension: SkyPixDimension,
        region: lsst.sphgeom.Region,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether two region columns overlap.

        This operation only works with `iteration engines
        <lsst.daf.relation.iteration.Engine>`; it is usually used to refine
        the result of a join on `SkyPixDimension` columns in SQL.

        Parameters
        ----------
        dimension : `SkyPixDimension`
            Dimension whose key column is being constrained.
        region : `lsst.sphgeom.Region`
            Spatial region constraint to test against.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with the `DimensionKeyColumn` associated with
            ``dimension`` as its only required column.
        """
        ref = ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=int)
        terms: list[Predicate] = []
        # The envelope is a set of half-open [begin, end) pixel-index ranges;
        # single-pixel ranges become equality tests, wider ones range tests.
        for begin, end in dimension.pixelization.envelope(region):
            if begin + 1 == end:
                terms.append(ref.eq(ColumnExpression.literal(begin, dtype=int)))
            else:
                terms.append(
                    ref.ge(ColumnExpression.literal(begin, dtype=int)).logical_and(
                        ref.lt(ColumnExpression.literal(end, dtype=int))
                    )
                )
        return Predicate.logical_or(*terms)

    def make_spatial_region_overlap_predicate(
        self,
        lhs: ColumnExpression,
        rhs: ColumnExpression,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether two regions overlap.

        This operation only works with
        `iteration engines <lsst.daf.relation.iteration.Engine>`; it is
        usually used to refine the result of a join or constraint on
        `SkyPixDimension` columns in SQL.

        Parameters
        ----------
        lhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for one spatial region.
        rhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for the other spatial region.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with ``lhs`` and ``rhs`` as its required columns.
        """
        # The function is looked up by name at execution time; __init__
        # registers regions_overlap in the iteration engine under this name.
        return lhs.predicate_method(regions_overlap.__name__, rhs, supporting_engine_types={iteration.Engine})

    def make_timespan_overlap_predicate(
        self,
        tag: ColumnTag,
        timespan: Timespan,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a timespan column overlaps a timespan literal.

        Parameters
        ----------
        tag : `ColumnTag`
            Identifier for a timespan column.
        timespan : `Timespan`
            Timespan literal selected rows must overlap.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
        """
        return ColumnExpression.reference(tag, dtype=Timespan).predicate_method(
            "overlaps", ColumnExpression.literal(timespan)
        )

    def make_data_id_relation(
        self, data_ids: Set[DataCoordinate], dimension_names: Iterable[str]
    ) -> Relation:
        """Transform a set of data IDs into a relation.

        Parameters
        ----------
        data_ids : `~collections.abc.Set` [ `DataCoordinate` ]
            Data IDs to upload.  All must have at least the dimensions given,
            but may have more.
        dimension_names : `~collections.abc.Iterable` [ `str` ]
            Names of dimensions that will be the columns of the relation.

        Returns
        -------
        relation : `Relation`
            Relation in the iteration engine.
        """
        tags = DimensionKeyColumnTag.generate(dimension_names)
        # One row per data ID, restricted to the requested dimension columns.
        payload = iteration.RowSequence(
            [{tag: data_id[tag.dimension] for tag in tags} for data_id in data_ids]
        ).to_mapping(tags)
        return self.iteration_engine.make_leaf(frozenset(tags), payload, name_prefix="upload")

455 

def regions_overlap(a: lsst.sphgeom.Region, b: lsst.sphgeom.Region) -> bool:
    """Test whether a pair of regions overlap.

    Parameters
    ----------
    a : `lsst.sphgeom.Region`
        One region.
    b : `lsst.sphgeom.Region`
        The other region.

    Returns
    -------
    overlap : `bool`
        Whether the regions overlap.
    """
    # relate() returns a bitmask; the regions overlap exactly when the
    # DISJOINT bit is not set.
    relationship = a.relate(b)
    return (relationship & lsst.sphgeom.DISJOINT) == 0