Coverage for python/lsst/daf/butler/queries/driver.py: 88% (85 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QueryDriver",
    "PageKey",
    "ResultPage",
    "DataCoordinateResultPage",
    "DimensionRecordResultPage",
    "DatasetRefResultPage",
    "GeneralResultPage",
)

import dataclasses
import uuid
from abc import abstractmethod
from collections.abc import Iterable, Sequence
from contextlib import AbstractContextManager
from typing import Any, TypeAlias, Union, overload

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from ..dimensions import (
    DataCoordinate,
    DataIdValue,
    DimensionGroup,
    DimensionRecord,
    DimensionRecordSet,
    DimensionRecordTable,
    DimensionUniverse,
)
from ..registry import CollectionSummary
from ..registry.interfaces import CollectionRecord
from .result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
    ResultSpec,
)
from .tree import DataCoordinateUploadKey, MaterializationKey, QueryTree

PageKey: TypeAlias = uuid.UUID


# The Page types below could become Pydantic models instead of dataclasses if
# that makes them more directly usable by RemoteButler (at least once we have
# Pydantic-friendly containers for all of them). We may want to add a
# discriminator annotation to the ResultPage union if we do that.


@dataclasses.dataclass
class DataCoordinateResultPage:
    """A single page of results from a data coordinate query."""

    spec: DataCoordinateResultSpec
    next_key: PageKey | None

    # TODO: On DM-41114 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DataCoordinate]


@dataclasses.dataclass
class DimensionRecordResultPage:
    """A single page of results from a dimension record query."""

    spec: DimensionRecordResultSpec
    next_key: PageKey | None
    rows: Iterable[DimensionRecord]

    def as_table(self) -> DimensionRecordTable:
        if isinstance(self.rows, DimensionRecordTable):
            return self.rows
        else:
            return DimensionRecordTable(self.spec.element, self.rows)

    def as_set(self) -> DimensionRecordSet:
        if isinstance(self.rows, DimensionRecordSet):
            return self.rows
        else:
            return DimensionRecordSet(self.spec.element, self.rows)
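
# A minimal sketch (not part of the interface) of how a fetched page's rows
# might be consumed; ``page`` is assumed to be a DimensionRecordResultPage
# obtained from a QueryDriver:
#
#     records = page.as_set()    # DimensionRecordSet, for lookups
#     table = page.as_table()    # DimensionRecordTable, for columnar access
#     for record in page.rows:   # or just iterate the raw rows
#         ...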


@dataclasses.dataclass
class DatasetRefResultPage:
    """A single page of results from a dataset query."""

    spec: DatasetRefResultSpec
    next_key: PageKey | None

    # TODO: On DM-41115 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DatasetRef]


@dataclasses.dataclass
class GeneralResultPage:
    """A single page of results from a general query."""

    spec: GeneralResultSpec
    next_key: PageKey | None

    # Raw tabular data, with columns in the same order as spec.columns.
    rows: list[tuple[Any, ...]]


ResultPage: TypeAlias = Union[
    DataCoordinateResultPage, DimensionRecordResultPage, DatasetRefResultPage, GeneralResultPage
]


class QueryDriver(AbstractContextManager[None]):
    """Base class for the implementation object inside `Query2` objects
    that is specialized for DirectButler vs. RemoteButler.

    Notes
    -----
    Implementations should be context managers. This allows them to manage the
    lifetime of server-side state, such as:

    - a SQL transaction, when necessary (DirectButler);
    - SQL cursors for queries that were not fully iterated over (DirectButler);
    - temporary database tables (DirectButler);
    - result-page Parquet files that were never fetched (RemoteButler);
    - uploaded Parquet files used to fill temporary database tables
      (RemoteButler);
    - cached content needed to construct query trees, like collection summaries
      (potentially all Butlers).

    When possible, these sorts of things should be cleaned up earlier when they
    are no longer needed, and the Butler server will still have to guard
    against the context manager's ``__exit__`` signal never reaching it, but a
    context manager will take care of these much more often than relying on
    garbage collection and ``__del__`` would.
    """
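
    # A minimal lifetime sketch (``driver``, ``spec``, and ``tree`` are assumed
    # to exist): entering the context opens server-side state as needed, and
    # exiting it releases anything that was not already cleaned up.
    #
    #     with driver:  # __enter__ returns None, so no ``as`` target
    #         first_page = driver.execute(spec, tree)
    #         ...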

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        raise NotImplementedError()

    @overload
    def execute(self, result_spec: DataCoordinateResultSpec, tree: QueryTree) -> DataCoordinateResultPage: ...

    @overload
    def execute(
        self, result_spec: DimensionRecordResultSpec, tree: QueryTree
    ) -> DimensionRecordResultPage: ...

    @overload
    def execute(self, result_spec: DatasetRefResultSpec, tree: QueryTree) -> DatasetRefResultPage: ...

    @overload
    def execute(self, result_spec: GeneralResultSpec, tree: QueryTree) -> GeneralResultPage: ...

    @abstractmethod
    def execute(self, result_spec: ResultSpec, tree: QueryTree) -> ResultPage:
        """Execute a query and return the first result page.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This can affect
            the actual query (i.e. SQL and Python postprocessing) that is run,
            e.g. by changing what is in the SQL SELECT clause and even what
            tables are joined in, but it never changes the number or order of
            result rows.
        tree : `QueryTree`
            Query tree to evaluate.

        Returns
        -------
        first_page : `ResultPage`
            A page whose type corresponds to the type of ``result_spec``, with
            at least the initial rows from the query. This should have an
            empty ``rows`` attribute if the query returned no results, and a
            ``next_key`` attribute that is not `None` if there were more
            results than could be returned in a single page.
        """
        raise NotImplementedError()
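
    # For illustration only: the overloads above mean the page type follows the
    # spec type, so (assuming ``driver``, ``tree``, and an already-built
    # ``record_spec`` exist) a DimensionRecordResultSpec yields a
    # DimensionRecordResultPage:
    #
    #     page = driver.execute(record_spec, tree)  # DimensionRecordResultPage
    #     records = page.as_set()                   # DimensionRecordSet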

    @overload
    def fetch_next_page(
        self, result_spec: DataCoordinateResultSpec, key: PageKey
    ) -> DataCoordinateResultPage: ...

    @overload
    def fetch_next_page(
        self, result_spec: DimensionRecordResultSpec, key: PageKey
    ) -> DimensionRecordResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: DatasetRefResultSpec, key: PageKey) -> DatasetRefResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: GeneralResultSpec, key: PageKey) -> GeneralResultPage: ...

    @abstractmethod
    def fetch_next_page(self, result_spec: ResultSpec, key: PageKey) -> ResultPage:
        """Fetch the next page of results from an already-executed query.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This must be
            identical to the ``result_spec`` passed to `execute`, but
            implementations are not *required* to check this.
        key : `PageKey`
            Key included in the previous page from this query. This key may
            become unusable or even be reused after this call.

        Returns
        -------
        next_page : `ResultPage`
            The next page of query results.
        """
        # We can put off dealing with pagination initially by just making an
        # implementation of this method raise.
        #
        # In RemoteButler I expect this to work by having the call to execute
        # continue to write Parquet files (or whatever) to some location until
        # its cursor is exhausted, and then delete those files as they are
        # fetched (or, failing that, when receiving a signal from
        # ``__exit__``).
        #
        # In DirectButler I expect to have a dict[PageKey, Cursor], fetch a
        # block of rows from it, and just reuse the page key for the next page
        # until the cursor is exhausted.
        raise NotImplementedError()
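
    # A caller-side pagination sketch (``driver``, ``spec``, and ``tree`` are
    # assumed to exist); it relies only on the documented ``next_key`` contract:
    #
    #     page = driver.execute(spec, tree)
    #     while True:
    #         for row in page.rows:
    #             ...  # process a row
    #         if page.next_key is None:
    #             break
    #         page = driver.fetch_next_page(spec, page.next_key)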

    @abstractmethod
    def materialize(
        self,
        tree: QueryTree,
        dimensions: DimensionGroup,
        datasets: frozenset[str],
    ) -> MaterializationKey:
        """Execute a query tree, saving results to temporary storage for use
        in later queries.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        dimensions : `DimensionGroup`
            Dimensions whose key columns should be preserved.
        datasets : `frozenset` [ `str` ]
            Names of dataset types whose ID columns may be materialized. It
            is implementation-defined whether they actually are.

        Returns
        -------
        key : `MaterializationKey`
            Unique identifier for the result rows that allows them to be
            referenced in a `QueryTree`.
        """
        raise NotImplementedError()
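
    # A hypothetical sketch (``driver``, ``tree``, and ``dimensions`` are
    # assumed to exist, and "raw" stands in for a dataset type name): the
    # returned key is only useful for building a new QueryTree that refers
    # back to the materialized rows.
    #
    #     key = driver.materialize(tree, dimensions, frozenset({"raw"}))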

    @abstractmethod
    def upload_data_coordinates(
        self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
    ) -> DataCoordinateUploadKey:
        """Upload a table of data coordinates for use in later queries.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions of the data coordinates.
        rows : `Iterable` [ `tuple` ]
            Tuples of data coordinate values, covering just the "required"
            subset of ``dimensions``.

        Returns
        -------
        key
            Unique identifier for the upload that allows it to be referenced in
            a `QueryTree`.
        """
        raise NotImplementedError()
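
    # A hypothetical sketch: for a DimensionGroup whose required dimensions are
    # ("instrument", "exposure"), each row is a tuple of just those values
    # (``driver`` and ``dimensions`` are assumed to exist):
    #
    #     key = driver.upload_data_coordinates(
    #         dimensions, [("HSC", 903342), ("HSC", 903344)]
    #     )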

    @abstractmethod
    def count(
        self,
        tree: QueryTree,
        result_spec: ResultSpec,
        *,
        exact: bool,
        discard: bool,
    ) -> int:
        """Return the number of rows a query would return.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        result_spec : `ResultSpec`
            The kind of results the user wants to count.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound on
            that number if ``exact=False``.
        """
        raise NotImplementedError()
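
    # For illustration (``driver``, ``tree``, and ``spec`` are assumed to
    # exist): an exact count that is willing to run the full query if needed.
    #
    #     n_rows = driver.count(tree, spec, exact=True, discard=True)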

    @abstractmethod
    def any(self, tree: QueryTree, *, execute: bool, exact: bool) -> bool:
        """Test whether the query would return any rows.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()
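
    # For illustration (``driver`` and ``tree`` are assumed to exist): a cheap
    # check that may overestimate, followed by a fully exact one.
    #
    #     maybe_has_rows = driver.any(tree, execute=False, exact=False)
    #     definitely_has_rows = driver.any(tree, execute=True, exact=True)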

    @abstractmethod
    def explain_no_results(self, tree: QueryTree, execute: bool) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()
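
    # For illustration (``driver`` and ``tree`` are assumed to exist): the
    # messages are typically surfaced to the user when a query comes back empty.
    #
    #     for message in driver.explain_no_results(tree, execute=True):
    #         print(message)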

    @abstractmethod
    def get_default_collections(self) -> tuple[str, ...]:
        """Return the default collection search path.

        Returns
        -------
        collections : `tuple` [ `str`, ... ]
            The default collection search path as a tuple of `str`.

        Raises
        ------
        NoDefaultCollectionError
            Raised if there are no default collections.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_collection_path(
        self, collections: Sequence[str]
    ) -> list[tuple[CollectionRecord, CollectionSummary]]:
        """Process a collection search path argument into a `list` of
        collection records and summaries.

        Parameters
        ----------
        collections : `~collections.abc.Sequence` [ `str` ]
            The collection or collections to search.

        Returns
        -------
        collection_info : `list` [ `tuple` [ `CollectionRecord`, \
                `CollectionSummary` ] ]
            A `list` of pairs of `CollectionRecord` and `CollectionSummary`
            that flattens out all `~CollectionType.CHAINED` collections into
            their children while maintaining the same order and avoiding
            duplicates.

        Raises
        ------
        MissingCollectionError
            Raised if any collection in ``collections`` does not exist.

        Notes
        -----
        Implementations are generally expected to cache the collection records
        and summaries they obtain (including the records for
        `~CollectionType.CHAINED` collections that are not returned) in order
        to optimize multiple calls with collections in common.
        """
        raise NotImplementedError()
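
    # A hypothetical sketch: "HSC/defaults" stands in for a CHAINED collection
    # name, which is flattened into its child collections here (``driver`` is
    # assumed to exist).
    #
    #     for record, summary in driver.resolve_collection_path(["HSC/defaults"]):
    #         print(record.name, record.type)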

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Return the dataset type with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The dataset type, including its dimensions.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the dataset type is not registered.
        """
        raise NotImplementedError()
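
    # For illustration (``driver`` is assumed to exist, and "raw" stands in for
    # a registered dataset type name):
    #
    #     dataset_type = driver.get_dataset_type("raw")
    #     dims = dataset_type.dimensions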