Coverage for python/lsst/daf/butler/registry/queries/_query_context.py: 45%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryContext",)

from abc import abstractmethod
from collections.abc import Iterable, Set
from contextlib import AbstractContextManager
from typing import Any

import lsst.sphgeom
from lsst.daf.relation import (
    ColumnExpression,
    ColumnTag,
    Engine,
    EngineError,
    Predicate,
    Processor,
    Relation,
    UnaryOperation,
    iteration,
)

from ...core import DataCoordinate, DimensionKeyColumnTag, SkyPixDimension, Timespan


class QueryContext(Processor, AbstractContextManager["QueryContext"]):
    """A context manager interface for query operations that require some
    connection-like state.

    Notes
    -----
    `QueryContext` implementations are usually paired with a `QueryBackend`
    implementation, with the division of responsibilities as follows:

    - `QueryContext` implements the `lsst.daf.relation.Processor` interface,
      and is hence responsible for executing multi-engine relation trees.

    - `QueryContext` manages all state whose lifetime is a single query or
      set of related queries (e.g. temporary tables) via its context manager
      interface. Methods that do not involve this state should not require
      the context manager to have been entered.

    - `QueryContext` objects should be easy to construct by registry helper
      code that doesn't have access to the full `Registry` data structure
      itself, while `QueryBackend` instances can generally only be
      constructed by code that does see essentially the full registry (for
      example, `SqlQueryBackend` holds a `RegistryManagerInstances` struct,
      while `SqlQueryContext` can be constructed with just a `Database` and
      `ColumnTypeInfo`).

    - `QueryBackend.context` is a factory for the associated `QueryContext`
      type.

    - `QueryBackend` methods that return relations accept the `QueryContext`
      returned by its `~QueryBackend.context` method in case those methods
      require state that should be cleaned up after the query is complete
      (see the usage sketch below).
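
    Examples
    --------
    An illustrative usage sketch (not runnable as-is: ``backend`` stands in
    for a concrete `QueryBackend`, and ``some_relation_method`` is a
    hypothetical stand-in for any backend method that returns a relation):

    .. code-block:: python

        with backend.context() as context:
            # Hypothetical backend call that builds a relation tree.
            relation = backend.some_relation_method(context)
            for row in context.fetch_iterable(relation):
                ...  # each row is a mapping from ColumnTag to value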

77 """ 

78 

79 def __init__(self) -> None: 

80 self.iteration_engine = iteration.Engine() 

81 self.iteration_engine.functions[regions_overlap.__name__] = regions_overlap 

82 

83 iteration_engine: iteration.Engine 

84 """The relation engine that all relations must ultimately be transferred 

85 to in order to be executed by this context. 

86 """ 

87 

88 @property 

89 def preferred_engine(self) -> Engine: 

90 """Return the relation engine that this context prefers to execute 

91 operations in (`lsst.daf.relation.Engine`). 

92 """ 

93 return self.iteration_engine 

94 

95 @property 

96 @abstractmethod 

97 def is_open(self) -> bool: 

98 """Whether the context manager has been entered (`bool`).""" 

99 raise NotImplementedError() 

100 

    def make_initial_relation(self, relation: Relation | None = None) -> Relation:
        """Construct an initial relation suitable for this context.

        Parameters
        ----------
        relation : `Relation`, optional
            A user-provided initial relation. Must be included by
            implementations when provided, but may be modified (e.g. by
            adding a transfer to a new engine) when needed to satisfy the
            context's invariants.
        """
        if relation is None:
            return self.preferred_engine.make_join_identity_relation()
        return relation

    def fetch_iterable(self, relation: Relation) -> iteration.RowIterable:
        """Execute the given relation and return its rows as an iterable of
        mappings.

        Parameters
        ----------
        relation : `Relation`
            Relation representing the query to execute.

        Returns
        -------
        rows : `~lsst.daf.relation.iteration.RowIterable`
            An iterable over rows, with each row a mapping from `ColumnTag`
            to column value.

        Notes
        -----
        A transfer to `iteration_engine` will be added to the root (end) of
        the relation tree if the root is not already in the iteration engine.

        Any transfers from other engines or persistent materializations will
        be handled by delegating to `process` before execution in the
        iteration engine.

        To ensure the result is a multi-pass Python collection in memory,
        ensure the given tree ends with a materialization operation in the
        iteration engine.
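
        Examples
        --------
        An illustrative sketch (assumes an entered context and a relation
        ``relation`` that has a ``visit`` dimension key column):

        .. code-block:: python

            tag = DimensionKeyColumnTag("visit")
            for row in context.fetch_iterable(relation):
                print(row[tag])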

143 """ 

144 # This transfer does nothing if the relation is already in the 

145 # iteration engine. 

146 relation = relation.transferred_to(self.iteration_engine) 

147 relation = self.process(relation) 

148 return self.iteration_engine.execute(relation) 

149 

    @abstractmethod
    def count(self, relation: Relation, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in the given relation.

        Parameters
        ----------
        relation : `Relation`
            Relation whose rows are to be counted.
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If
            `False`, returning an upper bound is permitted if it can be done
            much more efficiently, e.g. by running a SQL ``SELECT COUNT(*)``
            query but ignoring client-side filtering that would otherwise
            take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them. If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch
            its rows into a new query (or passing ``exact=False``). Ignored
            if ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the relation, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
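
        Examples
        --------
        An illustrative sketch of how the flags interact (``context`` and
        ``relation`` are assumed):

        .. code-block:: python

            # Cheap upper bound; may overcount if client-side filtering
            # would drop rows.
            upper_bound = context.count(relation, exact=False)
            # Exact, even if rows must be fetched and discarded to count.
            n_rows = context.count(relation, exact=True, discard=True)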

182 """ 

183 raise NotImplementedError() 

184 

    @abstractmethod
    def any(self, relation: Relation, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this relation has any result rows at all.

        Parameters
        ----------
        relation : `Relation`
            Relation to be checked.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any_rows : `bool`
            Whether the relation has any rows (or may have rows, if
            ``exact=False``).

        Raises
        ------
        RuntimeError
            Raised if an exact check was requested and could not be
            performed without executing the query.
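
        Examples
        --------
        An illustrative sketch (``context`` and ``relation`` are assumed):

        .. code-block:: python

            if context.any(relation, execute=False, exact=False):
                # The relation *may* have rows; now run the real check,
                # executing at most a LIMIT 1 query.
                has_rows = context.any(relation)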

214 """ 

215 raise NotImplementedError() 

216 

217 def transfer(self, source: Relation, destination: Engine, materialize_as: str | None) -> Any: 

218 # Docstring inherited from lsst.daf.relation.Processor. 

219 raise NotImplementedError("No transfers expected by base QueryContext implementation.") 

220 

221 def materialize(self, base: Relation, name: str) -> Any: 

222 # Docstring inherited from lsst.daf.relation.Processor. 

223 if base.engine == self.iteration_engine: 

224 return self.iteration_engine.execute(base).materialized() 

225 raise EngineError(f"Unexpected engine {base.engine} for base QueryContext implementation.") 

226 

    @abstractmethod
    def restore_columns(
        self,
        relation: Relation,
        columns_required: Set[ColumnTag],
    ) -> tuple[Relation, set[ColumnTag]]:
        """Return a modified relation tree that attempts to restore columns
        that were dropped by a projection operation.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        columns_required : `~collections.abc.Set` [ `ColumnTag` ]
            Columns to attempt to restore. May include columns already
            present in the relation.

        Returns
        -------
        modified : `Relation`
            Possibly-modified tree with any projections that had dropped
            requested columns replaced by projections that do not drop these
            columns. Care is taken to ensure that join common columns and
            deduplication behavior are preserved, even if that means some
            columns are not restored.
        columns_found : `set` [ `ColumnTag` ]
            Columns from those requested that are present in ``modified``.
        """
        raise NotImplementedError()

    @abstractmethod
    def strip_postprocessing(self, relation: Relation) -> tuple[Relation, list[UnaryOperation]]:
        """Return a modified relation tree with all iteration-engine
        operations removed, including any final transfer to the iteration
        engine.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.

        Returns
        -------
        modified : `Relation`
            Stripped relation tree, with engine != `iteration_engine`.
        stripped : `list` [ `UnaryOperation` ]
            Operations that were stripped, in the same order they should be
            reapplied (with ``transfer=True,
            preferred_engine=iteration_engine``) to recover the original
            tree.
        """
        raise NotImplementedError()

    @abstractmethod
    def drop_invalidated_postprocessing(self, relation: Relation, new_columns: Set[ColumnTag]) -> Relation:
        """Return a modified relation tree without iteration-engine
        operations that require columns that are not in the given set.

        Parameters
        ----------
        relation : `Relation`
            Original relation tree.
        new_columns : `~collections.abc.Set` [ `ColumnTag` ]
            The only columns that postprocessing operations may require if
            they are to be retained in the returned relation tree.

        Returns
        -------
        modified : `Relation`
            Modified relation tree with postprocessing operations
            incompatible with ``new_columns`` removed.
        """
        raise NotImplementedError()

    def make_data_coordinate_predicate(
        self,
        data_coordinate: DataCoordinate,
        full: bool | None = None,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        represents a data ID constraint.

        Parameters
        ----------
        data_coordinate : `DataCoordinate`
            Data ID whose keys and values should be transformed to predicate
            equality constraints.
        full : `bool`, optional
            Whether to include constraints on implied dimensions (default is
            to include implied dimensions if ``data_coordinate`` has them).

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
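
        Examples
        --------
        An illustrative sketch (``context`` is assumed, and ``data_id``
        stands in for a `DataCoordinate` with ``instrument`` and ``visit``
        keys):

        .. code-block:: python

            predicate = context.make_data_coordinate_predicate(data_id)
            # Equivalent to ANDing per-dimension equality constraints,
            # e.g. instrument == data_id["instrument"] AND
            # visit == data_id["visit"].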

320 """ 

321 if full is None: 

322 full = data_coordinate.hasFull() 

323 dimensions = data_coordinate.graph.required if not full else data_coordinate.graph.dimensions 

324 terms: list[Predicate] = [] 

325 for dimension in dimensions: 

326 dtype = dimension.primaryKey.getPythonType() 

327 terms.append( 

328 ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=dtype).eq( 

329 ColumnExpression.literal(data_coordinate[dimension.name], dtype=dtype) 

330 ) 

331 ) 

332 return Predicate.logical_and(*terms) 

333 

    def make_spatial_region_skypix_predicate(
        self,
        dimension: SkyPixDimension,
        region: lsst.sphgeom.Region,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a skypix dimension key column is within the
        pixelization envelope of a spatial region.

        Unlike `make_spatial_region_overlap_predicate`, the returned
        predicate uses only integer comparisons, so it can be evaluated in
        any engine; it is usually used to constrain a join on
        `SkyPixDimension` columns in SQL, with the conservative envelope
        match refined afterwards by an exact overlap test.

        Parameters
        ----------
        dimension : `SkyPixDimension`
            Dimension whose key column is being constrained.
        region : `lsst.sphgeom.Region`
            Spatial region constraint to test against.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with the `DimensionKeyColumnTag` associated with
            ``dimension`` as its only required column.
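
        Examples
        --------
        An illustrative sketch (``htm7`` stands in for a `SkyPixDimension`
        and ``region`` for an `lsst.sphgeom.Region`):

        .. code-block:: python

            predicate = context.make_spatial_region_skypix_predicate(
                htm7, region
            )
            # ORs together one constraint per (begin, end) range in
            # htm7.pixelization.envelope(region).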

358 """ 

359 ref = ColumnExpression.reference(DimensionKeyColumnTag(dimension.name), dtype=int) 

360 terms: list[Predicate] = [] 

361 for begin, end in dimension.pixelization.envelope(region): 

362 if begin + 1 == end: 

363 terms.append(ref.eq(ColumnExpression.literal(begin, dtype=int))) 

364 else: 

365 terms.append( 

366 ref.ge(ColumnExpression.literal(begin, dtype=int)).logical_and( 

367 ref.lt(ColumnExpression.literal(end, dtype=int)) 

368 ) 

369 ) 

370 return Predicate.logical_or(*terms) 

371 

    def make_spatial_region_overlap_predicate(
        self,
        lhs: ColumnExpression,
        rhs: ColumnExpression,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether two regions overlap.

        This operation only works with
        `iteration engines <lsst.daf.relation.iteration.Engine>`; it is
        usually used to refine the result of a join or constraint on
        `SkyPixDimension` columns in SQL.

        Parameters
        ----------
        lhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for one spatial region.
        rhs : `lsst.daf.relation.column_expressions.ColumnExpression`
            Expression for the other spatial region.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate with ``lhs`` and ``rhs`` as its required columns.
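
        Examples
        --------
        An illustrative sketch (``lhs_tag`` and ``rhs_tag`` stand in for
        `ColumnTag` objects that identify region columns; the ``dtype``
        shown is an assumption about how region columns are typed):

        .. code-block:: python

            predicate = context.make_spatial_region_overlap_predicate(
                ColumnExpression.reference(lhs_tag, dtype=lsst.sphgeom.Region),
                ColumnExpression.reference(rhs_tag, dtype=lsst.sphgeom.Region),
            )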

396 """ 

397 return lhs.predicate_method(regions_overlap.__name__, rhs, supporting_engine_types={iteration.Engine}) 

398 

    def make_timespan_overlap_predicate(
        self,
        tag: ColumnTag,
        timespan: Timespan,
    ) -> Predicate:
        """Return a `~lsst.daf.relation.column_expressions.Predicate` that
        tests whether a timespan column overlaps a timespan literal.

        Parameters
        ----------
        tag : `ColumnTag`
            Identifier for a timespan column.
        timespan : `Timespan`
            Timespan literal that selected rows must overlap.

        Returns
        -------
        predicate : `lsst.daf.relation.column_expressions.Predicate`
            New predicate.
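
        Examples
        --------
        An illustrative sketch (``context`` and ``tag`` are assumed;
        `Timespan` is assumed to accept `astropy.time.Time` bounds):

        .. code-block:: python

            import astropy.time

            night = Timespan(
                astropy.time.Time("2020-01-01T00:00:00", scale="tai"),
                astropy.time.Time("2020-01-02T00:00:00", scale="tai"),
            )
            predicate = context.make_timespan_overlap_predicate(tag, night)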

418 """ 

419 return ColumnExpression.reference(tag, dtype=Timespan).predicate_method( 

420 "overlaps", ColumnExpression.literal(timespan) 

421 ) 

422 

    def make_data_id_relation(
        self, data_ids: Set[DataCoordinate], dimension_names: Iterable[str]
    ) -> Relation:
        """Transform a set of data IDs into a relation.

        Parameters
        ----------
        data_ids : `~collections.abc.Set` [ `DataCoordinate` ]
            Data IDs to upload. All must have at least the dimensions given,
            but may have more.
        dimension_names : `~collections.abc.Iterable` [ `str` ]
            Names of dimensions that will be the columns of the relation.

        Returns
        -------
        relation : `Relation`
            Relation in the iteration engine.
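
        Examples
        --------
        An illustrative sketch (``data_ids`` stands in for a set of
        `DataCoordinate` objects that all have ``instrument`` and ``visit``
        keys):

        .. code-block:: python

            relation = context.make_data_id_relation(
                data_ids, ["instrument", "visit"]
            )
            # One row per data ID, with the DimensionKeyColumnTags for
            # "instrument" and "visit" as the columns.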

440 """ 

441 tags = DimensionKeyColumnTag.generate(dimension_names) 

442 payload = iteration.RowSequence( 

443 [{tag: data_id[tag.dimension] for tag in tags} for data_id in data_ids] 

444 ).to_mapping(tags) 

445 return self.iteration_engine.make_leaf(frozenset(tags), payload, name_prefix="upload") 

446 

447 

def regions_overlap(a: lsst.sphgeom.Region, b: lsst.sphgeom.Region) -> bool:
    """Test whether a pair of regions overlap.

    Parameters
    ----------
    a : `lsst.sphgeom.Region`
        One region.
    b : `lsst.sphgeom.Region`
        The other region.

    Returns
    -------
    overlap : `bool`
        Whether the regions overlap.
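
    Examples
    --------
    A small self-contained sketch with `lsst.sphgeom.Circle` regions (the
    first pair shares a center and must overlap; the second pair is 90
    degrees apart and must not):

    .. code-block:: python

        import lsst.sphgeom

        x = lsst.sphgeom.UnitVector3d(1.0, 0.0, 0.0)
        y = lsst.sphgeom.UnitVector3d(0.0, 1.0, 0.0)
        radius = lsst.sphgeom.Angle.fromDegrees(1.0)
        assert regions_overlap(
            lsst.sphgeom.Circle(x, radius), lsst.sphgeom.Circle(x, radius)
        )
        assert not regions_overlap(
            lsst.sphgeom.Circle(x, radius), lsst.sphgeom.Circle(y, radius)
        )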

462 """ 

463 return not (a.relate(b) & lsst.sphgeom.DISJOINT)