Coverage for python/lsst/daf/butler/registry/queries/_query_context.py: 54%

71 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ("QueryContext",) 

30 

31from abc import abstractmethod 

32from collections.abc import Iterable, Set 

33from contextlib import AbstractContextManager 

34from typing import Any 

35 

36import lsst.sphgeom 

37from lsst.daf.relation import ( 

38 ColumnExpression, 

39 ColumnTag, 

40 Engine, 

41 EngineError, 

42 Predicate, 

43 Processor, 

44 Relation, 

45 UnaryOperation, 

46 iteration, 

47) 

48 

49from ...core import DataCoordinate, DimensionKeyColumnTag, SkyPixDimension, Timespan 

50 

51 

class QueryContext(Processor, AbstractContextManager["QueryContext"]):
    """A context manager interface for query operations that require some
    connection-like state.

    Notes
    -----
    `QueryContext` implementations are usually paired with a `QueryBackend`
    implementation, with the division of responsibilities as follows:

    - `QueryContext` implements the `lsst.daf.relation.Processor` interface,
      and is hence responsible for executing multi-engine relation trees.

    - `QueryContext` manages all state whose lifetime is a single query or set
      of related queries (e.g. temporary tables) via its context manager
      interface.  Methods that do not involve this state should not require
      the context manager to have been entered.

    - `QueryContext` objects should be easy to construct by registry helper
      code that doesn't have access to the full `Registry` data structure
      itself, while `QueryBackend` instances can generally only be constructed
      by code that does see essentially the full registry (for example,
      `SqlQueryBackend` holds a `RegistryManagerInstances` struct, while
      `SqlQueryContext` can be constructed with just a `Database` and
      `ColumnTypeInfo`).

    - `QueryBackend.context` is a factory for the associated `QueryContext`
      type.

    - `QueryBackend` methods that return relations accept the `QueryContext`
      returned by its `~QueryBackend.context` method in case those methods
      require state that should be cleaned up after the query is complete.
    """

    def __init__(self) -> None:
        self.iteration_engine = iteration.Engine()
        # Register the module-level `regions_overlap` callable under its own
        # name so predicates built by `make_spatial_region_overlap_predicate`
        # (which reference it via ``regions_overlap.__name__``) can be
        # resolved when executed by the iteration engine.
        self.iteration_engine.functions[regions_overlap.__name__] = regions_overlap

    iteration_engine: iteration.Engine
    """The relation engine that all relations must ultimately be transferred
    to in order to be executed by this context.
    """

    @property
    def preferred_engine(self) -> Engine:
        """Return the relation engine that this context prefers to execute
        operations in (`lsst.daf.relation.Engine`).

        The base class prefers the iteration engine; subclasses may override.
        """
        return self.iteration_engine

    @property
    @abstractmethod
    def is_open(self) -> bool:
        """Whether the context manager has been entered (`bool`)."""
        raise NotImplementedError()

    def make_initial_relation(self, relation: Relation | None = None) -> Relation:
        """Construct an initial relation suitable for this context.

        Parameters
        ----------
        relation : `Relation`, optional
            A user-provided initial relation.  Must be included by
            implementations when provided, but may be modified (e.g. by
            adding a transfer to a new engine) when needed to satisfy the
            context's invariants.

        Returns
        -------
        initial : `Relation`
            The given relation, or a join-identity relation in the preferred
            engine if none was provided.
        """
        if relation is None:
            return self.preferred_engine.make_join_identity_relation()
        return relation

    def fetch_iterable(self, relation: Relation) -> iteration.RowIterable:
        """Execute the given relation and return its rows as an iterable of
        mappings.

        Parameters
        ----------
        relation : `Relation`
            Relation representing the query to execute.

        Returns
        -------
        rows : `~lsst.daf.relation.iteration.RowIterable`
            An iterable over rows, with each row a mapping from `ColumnTag`
            to column value.

        Notes
        -----
        A transfer to `iteration_engine` will be added to the root (end) of
        the relation tree if the root is not already in the iteration engine.

        Any transfers from other engines or persistent materializations will
        be handled by delegating to `process` before execution in the
        iteration engine.

        To ensure the result is a multi-pass Python collection in memory,
        ensure the given tree ends with a materialization operation in the
        iteration engine.
        """
        # This transfer does nothing if the relation is already in the
        # iteration engine.
        relation = relation.transferred_to(self.iteration_engine)
        relation = self.process(relation)
        return self.iteration_engine.execute(relation)

    @abstractmethod
    def count(self, relation: Relation, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in the given relation.

        Parameters
        ----------
        relation : `Relation`
            Relation whose rows are to be counted.
        exact : `bool`, optional
            If `True` (default), return the exact number of rows.  If
            `False`, returning an upper bound is permitted if it can be done
            much more efficiently, e.g. by running a SQL ``SELECT COUNT(*)``
            query but ignoring client-side filtering that would otherwise
            take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them.  If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch
            its rows into a new query (or passing ``exact=False``).  Ignored
            if ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the relation, or an upper bound.  This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, relation: Relation, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this relation has any result rows at all.

        Parameters
        ----------
        relation : `Relation`
            Relation to be checked.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the relation has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        raise NotImplementedError()

    def transfer(self, source: Relation, destination: Engine, materialize_as: str | None) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        raise NotImplementedError("No transfers expected by base QueryContext implementation.")

    def materialize(self, base: Relation, name: str) -> Any:
        # Docstring inherited from lsst.daf.relation.Processor.
        # The base class can only materialize trees that are already entirely
        # in the iteration engine; anything else needs a subclass.
        if base.engine == self.iteration_engine:
            return self.iteration_engine.execute(base).materialized()
        raise EngineError(f"Unexpected engine {base.engine} for base QueryContext implementation.")

    @abstractmethod
    def restore_columns(
        self,
        relation: Relation,
        columns_required: Set[ColumnTag],
    ) -> tuple[Relation, set[ColumnTag]]:
        """Return a modified relation tree that attempts to restore columns
        that were dropped by a projection operation.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        columns_required : `~collections.abc.Set` [ `ColumnTag` ]
            Columns to attempt to restore.  May include columns already
            present in the relation.

        Returns
        -------
        modified : `Relation`
            Possibly-modified tree with any projections that had dropped
            requested columns replaced by projections that do not drop these
            columns.  Care is taken to ensure that join common columns and
            deduplication behavior is preserved, even if that means some
            columns are not restored.
        columns_found : `set` [ `ColumnTag` ]
            Columns from those requested that are present in ``modified``.
        """
        raise NotImplementedError()

    @abstractmethod
    def strip_postprocessing(self, relation: Relation) -> tuple[Relation, list[UnaryOperation]]:
        """Return a modified relation tree without any iteration-engine
        operations and any transfer to the iteration engine at the end.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.

        Returns
        -------
        modified : `Relation`
            Stripped relation tree, with engine != `iteration_engine`.
        stripped : `list` [ `UnaryOperation` ]
            Operations that were stripped, in the same order they should be
            reapplied (with ``transfer=True,
            preferred_engine=iteration_engine``) to recover the original
            tree.
        """
        raise NotImplementedError()

    @abstractmethod
    def drop_invalidated_postprocessing(self, relation: Relation, new_columns: Set[ColumnTag]) -> Relation:
        """Return a modified relation tree without iteration-engine
        operations that require columns that are not in the given set.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        new_columns : `~collections.abc.Set` [ `ColumnTag` ]
            The only columns that postprocessing operations may require if
            they are to be retained in the returned relation tree.

        Returns
        -------
        modified : `Relation`
            Modified relation tree with postprocessing operations
            incompatible with ``new_columns`` removed.
        """
        raise NotImplementedError()

    def make_data_coordinate_predicate(
        self,
        data_coordinate: DataCoordinate,
        full: bool | None = None,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        represents a data ID constraint.

        Parameters
        ----------
        data_coordinate : `DataCoordinate`
            Data ID whose keys and values should be transformed to predicate
            equality constraints.
        full : `bool`, optional
            Whether to include constraints on implied dimensions (default is
            to include implied dimensions if ``data_coordinate`` has them).

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate: the logical AND of one equality term per
            dimension.
        """
        if full is None:
            full = data_coordinate.hasFull()
        dimensions = data_coordinate.graph.required if not full else data_coordinate.graph.dimensions
        terms: list[Predicate] = []
        for dimension in dimensions:
            dtype = dimension.primaryKey.getPythonType()
            terms.append(
                ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=dtype).eq(
                    ColumnExpression.literal(data_coordinate[dimension.name], dtype=dtype)
                )
            )
        return Predicate.logical_and(*terms)

    def make_spatial_region_skypix_predicate(
        self,
        dimension: SkyPixDimension,
        region: lsst.sphgeom.Region,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        constrains a sky pixel key column to the pixelization envelope of a
        region.

        Parameters
        ----------
        dimension : `SkyPixDimension`
            Dimension whose key column is being constrained.
        region : `lsst.sphgeom.Region`
            Spatial region constraint to test against.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with the `DimensionKeyColumn` associated with
            ``dimension`` as its only required column.  The predicate is a
            logical OR of integer equality/range comparisons, one per index
            range in the pixelization's envelope of ``region``.

        Notes
        -----
        The envelope is presumably a superset of the pixels that truly
        overlap the region (see ``lsst.sphgeom`` pixelization docs), so this
        predicate may admit extra rows; pair it with
        `make_spatial_region_overlap_predicate` when exactness is required.
        """
        ref = ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=int)
        terms: list[Predicate] = []
        for begin, end in dimension.pixelization.envelope(region):
            if begin + 1 == end:
                # Single-pixel range: a simple equality test is cheaper.
                terms.append(ref.eq(ColumnExpression.literal(begin, dtype=int)))
            else:
                # Half-open range [begin, end).
                terms.append(
                    ref.ge(ColumnExpression.literal(begin, dtype=int)).logical_and(
                        ref.lt(ColumnExpression.literal(end, dtype=int))
                    )
                )
        return Predicate.logical_or(*terms)

    def make_spatial_region_overlap_predicate(
        self,
        lhs: ColumnExpression,
        rhs: ColumnExpression,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether two regions overlap.

        This operation only works with
        `iteration engines <lsst.daf.relation.iteration.Engine>`; it is
        usually used to refine the result of a join or constraint on
        `SkyPixDimension` columns in SQL.

        Parameters
        ----------
        lhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for one spatial region.
        rhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for the other spatial region.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with ``lhs`` and ``rhs`` as its required columns.
        """
        # The predicate is evaluated by name; __init__ registers the
        # matching callable with the iteration engine.
        return lhs.predicate_method(regions_overlap.__name__, rhs, supporting_engine_types={iteration.Engine})

    def make_timespan_overlap_predicate(
        self,
        tag: ColumnTag,
        timespan: Timespan,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a timespan column overlaps a timespan literal.

        Parameters
        ----------
        tag : `ColumnTag`
            Identifier for a timespan column.
        timespan : `Timespan`
            Timespan literal selected rows must overlap.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
        """
        return ColumnExpression.reference(tag, dtype=Timespan).predicate_method(
            "overlaps", ColumnExpression.literal(timespan)
        )

    def make_data_id_relation(
        self, data_ids: Set[DataCoordinate], dimension_names: Iterable[str]
    ) -> Relation:
        """Transform a set of data IDs into a relation.

        Parameters
        ----------
        data_ids : `~collections.abc.Set` [ `DataCoordinate` ]
            Data IDs to upload.  All must have at least the dimensions given,
            but may have more.
        dimension_names : `~collections.abc.Iterable` [ `str` ]
            Names of dimensions that will be the columns of the relation.

        Returns
        -------
        relation : `Relation`
            Relation in the iteration engine, with one row per data ID and
            one `DimensionKeyColumnTag` column per requested dimension.
        """
        tags = DimensionKeyColumnTag.generate(dimension_names)
        payload = iteration.RowSequence(
            [{tag: data_id[tag.dimension] for tag in tags} for data_id in data_ids]
        ).to_mapping(tags)
        return self.iteration_engine.make_leaf(frozenset(tags), payload, name_prefix="upload")

452 

453 

def regions_overlap(a: lsst.sphgeom.Region, b: lsst.sphgeom.Region) -> bool:
    """Test whether a pair of regions overlap.

    Parameters
    ----------
    a : `lsst.sphgeom.Region`
        One region.
    b : `lsst.sphgeom.Region`
        The other region.

    Returns
    -------
    overlap : `bool`
        Whether the regions overlap.
    """
    relationship = a.relate(b)
    # `relate` reports a bitmask of relationship flags; the regions overlap
    # exactly when the DISJOINT bit is not set.
    if relationship & lsst.sphgeom.DISJOINT:
        return False
    return True