Coverage for python/lsst/daf/butler/queries/driver.py: 87%

81 statements  

coverage.py v7.5.0, created at 2024-04-30 02:53 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QueryDriver",
    "PageKey",
    "ResultPage",
    "DataCoordinateResultPage",
    "DimensionRecordResultPage",
    "DatasetRefResultPage",
    "GeneralResultPage",
)

import dataclasses
import uuid
from abc import abstractmethod
from collections.abc import Iterable
from contextlib import AbstractContextManager
from typing import Any, TypeAlias, Union, overload

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from ..dimensions import (
    DataCoordinate,
    DataIdValue,
    DimensionGroup,
    DimensionRecord,
    DimensionRecordSet,
    DimensionRecordTable,
    DimensionUniverse,
)
from .result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
    ResultSpec,
)
from .tree import DataCoordinateUploadKey, MaterializationKey, QueryTree

PageKey: TypeAlias = uuid.UUID


# The Page types below could become Pydantic models instead of dataclasses if
# that makes them more directly usable by RemoteButler (at least once we have
# Pydantic-friendly containers for all of them). We may want to add a
# discriminator annotation to the ResultPage union if we do that.


@dataclasses.dataclass
class DataCoordinateResultPage:
    """A single page of results from a data coordinate query."""

    spec: DataCoordinateResultSpec
    next_key: PageKey | None

    # TODO: On DM-41114 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DataCoordinate]


@dataclasses.dataclass
class DimensionRecordResultPage:
    """A single page of results from a dimension record query."""

    spec: DimensionRecordResultSpec
    next_key: PageKey | None
    rows: Iterable[DimensionRecord]

    def as_table(self) -> DimensionRecordTable:
        if isinstance(self.rows, DimensionRecordTable):
            return self.rows
        else:
            return DimensionRecordTable(self.spec.element, self.rows)

    def as_set(self) -> DimensionRecordSet:
        if isinstance(self.rows, DimensionRecordSet):
            return self.rows
        else:
            return DimensionRecordSet(self.spec.element, self.rows)


@dataclasses.dataclass
class DatasetRefResultPage:
    """A single page of results from a dataset query."""

    spec: DatasetRefResultSpec
    next_key: PageKey | None

    # TODO: On DM-41115 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DatasetRef]


@dataclasses.dataclass
class GeneralResultPage:
    """A single page of results from a general query."""

    spec: GeneralResultSpec
    next_key: PageKey | None

    # Raw tabular data, with columns in the same order as spec.columns.
    rows: list[tuple[Any, ...]]


ResultPage: TypeAlias = Union[
    DataCoordinateResultPage, DimensionRecordResultPage, DatasetRefResultPage, GeneralResultPage
]
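

# A minimal sketch of how the paging protocol below is meant to be consumed.
# This helper is illustrative only -- it is not part of the driver API, and the
# ``driver``, ``result_spec``, and ``tree`` arguments are assumed to come from
# the caller (e.g. a higher-level query object). It simply follows ``next_key``
# until the driver stops handing out a key.
def _iter_result_pages(
    driver: QueryDriver, result_spec: ResultSpec, tree: QueryTree
) -> Iterable[ResultPage]:
    # Run the query and yield the first page of results.
    page = driver.execute(result_spec, tree)
    yield page
    # Keep asking for pages until the driver reports there are no more.
    while page.next_key is not None:
        page = driver.fetch_next_page(result_spec, page.next_key)
        yield page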


class QueryDriver(AbstractContextManager[None]):
    """Base class for the implementation object inside `Query2` objects
    that is specialized for DirectButler vs. RemoteButler.

    Notes
    -----
    Implementations should be context managers. This allows them to manage the
    lifetime of server-side state, such as:

    - a SQL transaction, when necessary (DirectButler);
    - SQL cursors for queries that were not fully iterated over (DirectButler);
    - temporary database tables (DirectButler);
    - result-page Parquet files that were never fetched (RemoteButler);
    - uploaded Parquet files used to fill temporary database tables
      (RemoteButler);
    - cached content needed to construct query trees, like collection summaries
      (potentially all Butlers).

    When possible, these sorts of things should be cleaned up as soon as they
    are no longer needed, and the Butler server will still have to guard
    against the context manager's ``__exit__`` signal never reaching it; even
    so, a context manager will take care of this cleanup far more reliably
    than relying on garbage collection and ``__del__`` would.
    """
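
    # A minimal usage sketch (illustrative only; ``make_driver`` and the
    # construction of ``result_spec`` and ``tree`` are hypothetical
    # placeholders, not part of this module):
    #
    #     driver = make_driver()
    #     with driver:  # ``__enter__`` returns `None`, so no ``as`` target.
    #         page = driver.execute(result_spec, tree)
    #         ...
    #
    # Exiting the ``with`` block gives the driver a chance to release any
    # cursors, temporary tables, or cached files created while iterating.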

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        raise NotImplementedError()

    @overload
    def execute(self, result_spec: DataCoordinateResultSpec, tree: QueryTree) -> DataCoordinateResultPage: ...

    @overload
    def execute(
        self, result_spec: DimensionRecordResultSpec, tree: QueryTree
    ) -> DimensionRecordResultPage: ...

    @overload
    def execute(self, result_spec: DatasetRefResultSpec, tree: QueryTree) -> DatasetRefResultPage: ...

    @overload
    def execute(self, result_spec: GeneralResultSpec, tree: QueryTree) -> GeneralResultPage: ...

    @abstractmethod
    def execute(self, result_spec: ResultSpec, tree: QueryTree) -> ResultPage:
        """Execute a query and return the first result page.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This can affect
            the actual query (i.e. SQL and Python postprocessing) that is run,
            e.g. by changing what is in the SQL SELECT clause and even what
            tables are joined in, but it never changes the number or order of
            result rows.
        tree : `QueryTree`
            Query tree to evaluate.

        Returns
        -------
        first_page : `ResultPage`
            A page whose type corresponds to the type of ``result_spec``, with
            at least the initial rows from the query. This should have an
            empty ``rows`` attribute if the query returned no results, and a
            ``next_key`` attribute that is not `None` if there were more
            results than could be returned in a single page.
        """
        raise NotImplementedError()

    @overload
    def fetch_next_page(
        self, result_spec: DataCoordinateResultSpec, key: PageKey
    ) -> DataCoordinateResultPage: ...

    @overload
    def fetch_next_page(
        self, result_spec: DimensionRecordResultSpec, key: PageKey
    ) -> DimensionRecordResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: DatasetRefResultSpec, key: PageKey) -> DatasetRefResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: GeneralResultSpec, key: PageKey) -> GeneralResultPage: ...

    @abstractmethod
    def fetch_next_page(self, result_spec: ResultSpec, key: PageKey) -> ResultPage:
        """Fetch the next page of results from an already-executed query.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This must be
            identical to the ``result_spec`` passed to `execute`, but
            implementations are not *required* to check this.
        key : `PageKey`
            Key included in the previous page from this query. This key may
            become unusable or even be reused after this call.

        Returns
        -------
        next_page : `ResultPage`
            The next page of query results.
        """
        # We can put off dealing with pagination initially by just making an
        # implementation of this method raise.
        #
        # In RemoteButler I expect this to work by having the call to execute
        # continue to write Parquet files (or whatever) to some location until
        # its cursor is exhausted, and then delete those files as they are
        # fetched (or, failing that, when receiving a signal from
        # ``__exit__``).
        #
        # In DirectButler I expect to have a dict[PageKey, Cursor], fetch a
        # block of rows from it, and just reuse the page key for the next page
        # until the cursor is exhausted.
        raise NotImplementedError()
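
    # A sketch of the DirectButler idea described above (illustrative only;
    # ``_cursors`` and ``_PAGE_SIZE`` are hypothetical attributes of a
    # concrete driver, not part of this interface):
    #
    #     rows = self._cursors[key].fetchmany(self._PAGE_SIZE)
    #     if len(rows) < self._PAGE_SIZE:
    #         # Cursor exhausted: clean up and stop handing out the key.
    #         self._cursors.pop(key).close()
    #         next_key = None
    #     else:
    #         next_key = key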

    @abstractmethod
    def materialize(
        self,
        tree: QueryTree,
        dimensions: DimensionGroup,
        datasets: frozenset[str],
    ) -> MaterializationKey:
        """Execute a query tree, saving results to temporary storage for use
        in later queries.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        dimensions : `DimensionGroup`
            Dimensions whose key columns should be preserved.
        datasets : `frozenset` [ `str` ]
            Names of dataset types whose ID columns may be materialized. It
            is implementation-defined whether they actually are.

        Returns
        -------
        key : `MaterializationKey`
            Unique identifier for the result rows that allows them to be
            referenced in a `QueryTree`.
        """
        raise NotImplementedError()
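
    # Illustrative call (the ``tree`` and ``dimensions`` objects and the
    # dataset type name "raw" are hypothetical): the returned key can then be
    # referenced from a new `QueryTree`, so later queries join against the
    # saved rows instead of re-running the original query.
    #
    #     key = driver.materialize(tree, dimensions, frozenset({"raw"}))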

    @abstractmethod
    def upload_data_coordinates(
        self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
    ) -> DataCoordinateUploadKey:
        """Upload a table of data coordinates for use in later queries.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions of the data coordinates.
        rows : `Iterable` [ `tuple` ]
            Tuples of data coordinate values, covering just the "required"
            subset of ``dimensions``.

        Returns
        -------
        key : `DataCoordinateUploadKey`
            Unique identifier for the upload that allows it to be referenced
            in a `QueryTree`.
        """
        raise NotImplementedError()
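
    # For example (hypothetical dimension group and values): if ``dimensions``
    # is the group for ``{"instrument", "exposure"}``, the "required" subset
    # is ``(instrument, exposure)``, so each row would look like
    # ``("HSC", 903342)``.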

    @abstractmethod
    def count(
        self,
        tree: QueryTree,
        result_spec: ResultSpec,
        *,
        exact: bool,
        discard: bool,
    ) -> int:
        """Return the number of rows a query would return.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        result_spec : `ResultSpec`
            The kind of results the user wants to count.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them. If `False`, raise an exception when an exact
            count would require discarding rows in this way, since the user
            would usually be better off executing the query first to fetch its
            rows into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound on
            that number if ``exact=False``.
        """
        raise NotImplementedError()
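
    # Illustrative calls (``tree`` and ``spec`` are hypothetical): an inexact
    # count can be cheap but may overcount, while ``exact=True, discard=True``
    # accepts running the full query just to count its rows.
    #
    #     upper_bound = driver.count(tree, spec, exact=False, discard=False)
    #     true_count = driver.count(tree, spec, exact=True, discard=True)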

    @abstractmethod
    def any(self, tree: QueryTree, *, execute: bool, exact: bool) -> bool:
        """Test whether the query would return any rows.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()
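
    # Illustrative calls (``tree`` is hypothetical): the cheapest check avoids
    # executing the query at all, while the most thorough one keeps going
    # until a row survives post-query filtering.
    #
    #     maybe_has_rows = driver.any(tree, execute=False, exact=False)
    #     definitely_has_rows = driver.any(tree, execute=True, exact=True)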

    @abstractmethod
    def explain_no_results(self, tree: QueryTree, execute: bool) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True` (default), execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.
        """
        raise NotImplementedError()
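
    # Illustrative use (``tree`` is hypothetical): when a query comes back
    # empty, the driver can report which joins or constraints removed all of
    # the rows.
    #
    #     if not driver.any(tree, execute=True, exact=False):
    #         for message in driver.explain_no_results(tree, execute=True):
    #             print(message)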

    @abstractmethod
    def get_default_collections(self) -> tuple[str, ...]:
        """Return the default collection search path.

        Returns
        -------
        collections : `tuple` [ `str`, ... ]
            The default collection search path as a tuple of `str`.

        Raises
        ------
        NoDefaultCollectionError
            Raised if there are no default collections.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Return the definition of a registered dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The dataset type definition, including its dimensions.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the dataset type is not registered.
        """
        raise NotImplementedError()

421 raise NotImplementedError()