
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "DataCoordinateResultPage",
    "DatasetRefResultPage",
    "DimensionRecordResultPage",
    "GeneralResultPage",
    "QueryDriver",
    "ResultPage",
)

import dataclasses
from abc import abstractmethod
from collections.abc import Iterable, Iterator
from contextlib import AbstractContextManager
from typing import Any, TypeAlias, Union, overload

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from ..dimensions import (
    DataCoordinate,
    DataIdValue,
    DimensionElement,
    DimensionGroup,
    DimensionRecord,
    DimensionRecordSet,
    DimensionRecordTable,
    DimensionUniverse,
)
from .result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
    ResultSpec,
)
from .tree import DataCoordinateUploadKey, MaterializationKey, QueryTree

# The Page types below could become Pydantic models instead of dataclasses if
# that makes them more directly usable by RemoteButler (at least once we have
# Pydantic-friendly containers for all of them). We may want to add a
# discriminator annotation to the ResultPage union if we do that.
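#
# For reference, a discriminated union along those lines might look roughly
# like the following (a sketch only, assuming Pydantic v2; none of these
# names exist in this module today):
#
#     class DataCoordinateResultPage(pydantic.BaseModel):
#         result_type: Literal["data_coordinate"] = "data_coordinate"
#         ...
#
#     ResultPage = Annotated[
#         Union[DataCoordinateResultPage, ...],
#         pydantic.Field(discriminator="result_type"),
#     ]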


@dataclasses.dataclass
class DataCoordinateResultPage:
    """A single page of results from a data coordinate query."""

    spec: DataCoordinateResultSpec

    # TODO: On DM-41114 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DataCoordinate]


@dataclasses.dataclass
class DimensionRecordResultPage:
    """A single page of results from a dimension record query."""

    spec: DimensionRecordResultSpec
    rows: Iterable[DimensionRecord]

    def as_table(self) -> DimensionRecordTable:
        """Return the rows as a `DimensionRecordTable`, reusing the existing
        container if the rows are already held in one.
        """
        if isinstance(self.rows, DimensionRecordTable):
            return self.rows
        else:
            return DimensionRecordTable(self.spec.element, self.rows)

    def as_set(self) -> DimensionRecordSet:
        """Return the rows as a `DimensionRecordSet`, reusing the existing
        container if the rows are already held in one.
        """
        if isinstance(self.rows, DimensionRecordSet):
            return self.rows
        else:
            return DimensionRecordSet(self.spec.element, self.rows)


@dataclasses.dataclass
class DatasetRefResultPage:
    """A single page of results from a dataset query."""

    spec: DatasetRefResultSpec

    # TODO: On DM-41115 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DatasetRef]


@dataclasses.dataclass
class GeneralResultPage:
    """A single page of results from a general query."""

    spec: GeneralResultSpec

    # Raw tabular data, with columns in the same order as
    # spec.get_result_columns().
    rows: list[tuple[Any, ...]]

    # This map contains dimension records for cached and skypix elements,
    # and only when spec.include_dimension_records is True.
    dimension_records: dict[DimensionElement, DimensionRecordSet] | None


ResultPage: TypeAlias = Union[
    DataCoordinateResultPage, DimensionRecordResultPage, DatasetRefResultPage, GeneralResultPage
]
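

# A minimal sketch (a hypothetical helper, not part of the driver API) showing
# how a caller can narrow the ResultPage union to a concrete page type:
def _example_page_size(page: ResultPage) -> int:
    """Return the number of rows in ``page``, whatever its concrete type."""
    if isinstance(page, DimensionRecordResultPage):
        # rows may be a lazy iterable here; as_set() materializes it once.
        return len(page.as_set())
    # The other three page types all store their rows in a plain list.
    return len(page.rows)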


class QueryDriver(AbstractContextManager[None]):
    """Base class for the implementation object inside `Query` objects,
    specialized for DirectButler vs. RemoteButler.

    Notes
    -----
    Implementations should be context managers. This allows them to manage
    the lifetime of server-side state, such as:

    - a SQL transaction, when necessary (DirectButler);
    - SQL cursors for queries that were not fully iterated over
      (DirectButler);
    - temporary database tables (DirectButler);
    - result-page Parquet files that were never fetched (RemoteButler);
    - uploaded Parquet files used to fill temporary database tables
      (RemoteButler);
    - cached content needed to construct query trees, like collection
      summaries (potentially all Butlers).

    When possible, this state should be cleaned up as soon as it is no longer
    needed, and the Butler server will still have to guard against the
    context manager's ``__exit__`` signal never reaching it, but a context
    manager takes care of cleanup far more reliably than relying on garbage
    collection and ``__del__`` would.
    """

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        raise NotImplementedError()

    @overload
    def execute(
        self, result_spec: DataCoordinateResultSpec, tree: QueryTree
    ) -> Iterator[DataCoordinateResultPage]: ...

    @overload
    def execute(
        self, result_spec: DimensionRecordResultSpec, tree: QueryTree
    ) -> Iterator[DimensionRecordResultPage]: ...

    @overload
    def execute(
        self, result_spec: DatasetRefResultSpec, tree: QueryTree
    ) -> Iterator[DatasetRefResultPage]: ...

    @overload
    def execute(self, result_spec: GeneralResultSpec, tree: QueryTree) -> Iterator[GeneralResultPage]: ...

    @abstractmethod
    def execute(self, result_spec: ResultSpec, tree: QueryTree) -> Iterator[ResultPage]:
        """Execute a query and yield result pages.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This can affect
            the actual query (i.e. SQL and Python postprocessing) that is run,
            e.g. by changing what is in the SQL SELECT clause and even what
            tables are joined in, but it never changes the number or order of
            result rows.
        tree : `QueryTree`
            Query tree to evaluate.

        Yields
        ------
        page : `ResultPage`
            A page whose type corresponds to the type of ``result_spec``, with
            rows from the query.
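
        Examples
        --------
        A minimal sketch of paging through results (``driver``, ``spec``, and
        ``tree`` are assumed to have been built elsewhere by a concrete
        implementation)::

            with driver:
                for page in driver.execute(spec, tree):
                    for row in page.rows:
                        ...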

        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(
        self,
        tree: QueryTree,
        dimensions: DimensionGroup,
        datasets: frozenset[str],
        allow_duplicate_overlaps: bool = False,
    ) -> MaterializationKey:
        """Execute a query tree, saving results to temporary storage for use
        in later queries.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        dimensions : `DimensionGroup`
            Dimensions whose key columns should be preserved.
        datasets : `frozenset` [ `str` ]
            Names of dataset types whose ID columns may be materialized. It
            is implementation-defined whether they actually are.
        allow_duplicate_overlaps : `bool`, optional
            If set to `True`, the query will be allowed to generate
            non-distinct rows for spatial overlaps.

        Returns
        -------
        key : `MaterializationKey`
            Unique identifier for the result rows that allows them to be
            referenced in a `QueryTree`.
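
        Examples
        --------
        A sketch of materializing intermediate results (``driver``, ``tree``,
        and ``dimensions`` are assumed; how the key is folded back into a new
        `QueryTree` is up to the caller)::

            key = driver.materialize(tree, dimensions, frozenset())
            # ``key`` can now be referenced by later query trees.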

        """
        raise NotImplementedError()

    @abstractmethod
    def upload_data_coordinates(
        self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
    ) -> DataCoordinateUploadKey:
        """Upload a table of data coordinates for use in later queries.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions of the data coordinates.
        rows : `~collections.abc.Iterable` [ `tuple` ]
            Tuples of data coordinate values, covering just the "required"
            subset of ``dimensions``.

        Returns
        -------
        key : `DataCoordinateUploadKey`
            Unique identifier for the upload that allows it to be referenced in
            a `QueryTree`.
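
        Examples
        --------
        A sketch of uploading ``(instrument, exposure)`` keys (``driver`` is
        assumed; the instrument and exposure values are placeholders)::

            dimensions = driver.universe.conform(["exposure"])
            key = driver.upload_data_coordinates(
                dimensions, [("HSC", 903342), ("HSC", 903344)]
            )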

        """
        raise NotImplementedError()

    @abstractmethod
    def count(
        self,
        tree: QueryTree,
        result_spec: ResultSpec,
        *,
        exact: bool,
        discard: bool,
    ) -> int:
        """Return the number of rows a query would return.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        result_spec : `ResultSpec`
            The kind of results the user wants to count.
        exact : `bool`
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.
        discard : `bool`
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them. If `False`, raise an error in that case
            instead, as the user would usually be better off executing the
            query first to fetch its rows into a new query (or passing
            ``exact=False``). Ignored if ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return (or an upper bound on
            it, if ``exact=False``).
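
        Examples
        --------
        A sketch of obtaining a cheap upper bound on the number of results
        (``driver``, ``tree``, and ``spec`` are assumed)::

            upper_bound = driver.count(tree, spec, exact=False, discard=False)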

        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, tree: QueryTree, *, execute: bool, exact: bool) -> bool:
        """Test whether the query would return any rows.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
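
        Examples
        --------
        A sketch of a quick, conservative emptiness check (``driver`` and
        ``tree`` are assumed)::

            if not driver.any(tree, execute=True, exact=False):
                print("query cannot return any rows")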

        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, tree: QueryTree, execute: bool) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`
            If `True`, execute simplified versions (e.g. ``LIMIT 1``) of
            aspects of the tree to determine more precisely where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.
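
        Examples
        --------
        A sketch of surfacing diagnostics after an unexpectedly empty result
        (``driver`` and ``tree`` are assumed)::

            for message in driver.explain_no_results(tree, execute=True):
                print(message)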

        """
        raise NotImplementedError()

    @abstractmethod
    def get_default_collections(self) -> tuple[str, ...]:
        """Return the default collection search path.

        Returns
        -------
        collections : `tuple` [ `str`, ... ]
            The default collection search path as a tuple of `str`.

        Raises
        ------
        NoDefaultCollectionError
            Raised if there are no default collections.

        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Return the definition of a dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The full dataset type definition, including its dimensions.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the dataset type is not registered.
        """
        raise NotImplementedError()
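

# A minimal end-to-end sketch of the intended driver lifecycle (a hypothetical
# helper, not part of the API; ``driver``, ``spec``, and ``tree`` would come
# from a concrete butler implementation):
def _example_run_query(
    driver: QueryDriver, spec: DataCoordinateResultSpec, tree: QueryTree
) -> int:
    """Open the driver, iterate over all result pages, and count the rows."""
    n = 0
    # Entering the context manager lets the driver set up (and later tear
    # down) server-side state such as cursors or temporary tables.
    with driver:
        for page in driver.execute(spec, tree):
            n += len(page.rows)
    return n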