Coverage for python / lsst / daf / butler / registry / queries / _results.py: 77%

77 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:37 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

# Public API of this module; kept alphabetized.
__all__ = (
    "ChainedDatasetQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
    "QueryResultsBase",
)

37 

import itertools
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Sequence
from contextlib import AbstractContextManager
from typing import Any, Self

from ..._dataset_ref import DatasetRef
from ..._dataset_type import DatasetType
from ...dimensions import (
    DataCoordinate,
    DataCoordinateIterable,
    DimensionElement,
    DimensionGroup,
    DimensionRecord,
)

53 

54 

class LimitedQueryResultsBase(ABC):
    """Abstract base class defining functions that are shared by all of the
    other QueryResults classes.

    Notes
    -----
    This class inherits `abc.ABC` so that the `~abc.abstractmethod`
    decorators below are actually enforced; with a plain metaclass an
    incomplete subclass could be instantiated and would only fail when an
    unimplemented method was first called.
    """

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require
            running the full query and then throwing away the result rows
            after counting them.  If `False`, this is an error, as the user
            would usually be better off executing the query first to fetch
            its rows into a new query (or passing ``exact=False``).  Ignored
            if ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found.  If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g.
            ``LIMIT 1``) of aspects of the tree to more precisely determine
            where rows were filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.
        """
        raise NotImplementedError()

135 

136 

class QueryResultsBase(LimitedQueryResultsBase):
    """Abstract base class defining functions shared by several of the other
    QueryResults classes.
    """

    @abstractmethod
    def order_by(self, *args: str) -> Self:
        """Sort the rows yielded by this iterator.

        Parameters
        ----------
        *args : `str`
            Column/dimension names to order on.  Prefix a name with a minus
            sign (``-``) to sort that column in descending order.

        Returns
        -------
        result : `typing.Self`
            ``self``, now configured to yield ordered results.

        Notes
        -----
        The iterator is modified in place; the same instance is returned so
        calls can be chained.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int) -> Self:
        """Cap the number of records this iterator yields.

        Parameters
        ----------
        limit : `int`
            Maximum number of records to return.

        Returns
        -------
        result : `typing.Self`
            ``self``, now configured to yield at most ``limit`` records.

        Notes
        -----
        The iterator is modified in place; the same instance is returned so
        calls can be chained.  This method is normally combined with
        `order_by`.
        """
        raise NotImplementedError()

187 

188 

class DataCoordinateQueryResults(QueryResultsBase, DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.
    """

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self,
        dimensions: DimensionGroup | Iterable[str] | None = None,
        *,
        unique: bool = False,
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str`], optional
            Dimensions to include in the new results object.  If `None`,
            ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``dimensions`` is not a subset of the dimensions in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.projected` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGroup(...)
            dimensions2 = DimensionGroup(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(dimensions2, unique=True):
                    ...
        """
        raise NotImplementedError()

    @abstractmethod
    def findDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `typing.Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def findRelatedDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
        dimensions: DimensionGroup | Iterable[str] | None = None,
    ) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
        """Find datasets using the data IDs identified by this query, and
        return them along with the original data IDs.

        This is a variant of `findDatasets` that is often more useful when
        the target dataset type does not have all of the dimensions of the
        original data ID query, as is generally the case with calibration
        lookups.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `typing.Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each data ID in ``self``, only yield
            one `DatasetRef`, from the first collection in which a dataset
            of that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.  Note that
            this is not the same as yielding one `DatasetRef` for each
            yielded data ID if ``dimensions`` is not `None`.
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str` ], optional
            The dimensions of the data IDs returned.  Must be a subset of
            ``self.dimensions``.

        Returns
        -------
        pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`, \
                `DatasetRef` ] ]
            An iterable of (data ID, dataset reference) pairs.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

401 

402 

class DatasetQueryResults(LimitedQueryResultsBase, Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Partition these results by parent dataset type.

        Returns
        -------
        iter : `~collections.abc.Iterator` [ `ParentDatasetQueryResults` ]
            Iterator whose items are results objects, each restricted to a
            single parent dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> Self:
        """Return an equivalent results object whose `DatasetRef` data IDs
        all satisfy `DataCoordinate.hasRecords`.

        Returns
        -------
        expanded : `DatasetQueryResults`
            A new `DatasetQueryResults` instance, or ``self`` when its data
            IDs already have records attached.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, calling `materialize`
        first may be much cheaper for very large result sets.
        """
        raise NotImplementedError()

    def _iter_by_dataset_type(self) -> Iterator[tuple[DatasetType, Iterable[DatasetRef]]]:
        """Yield ``(dataset type, refs)`` pairs, one per parent dataset type.

        This is the private hook backing the interface defined by
        `DatasetRef.iter_by_type`, which allows heterogeneous `DatasetRef`
        iterables coming directly from queries to be processed much more
        efficiently.
        """
        # Delegate grouping to byParentDatasetType(); each group knows its
        # own parent dataset type and is itself iterable over DatasetRef.
        yield from ((group.parentDatasetType, group) for group in self.byParentDatasetType())

450 

451 

class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.
    """

    @property
    @abstractmethod
    def parentDatasetType(self) -> DatasetType:
        """The single parent dataset type shared by every dataset in this
        iterable (`DatasetType`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation query for just the data IDs of the datasets
        this query would return (`DataCoordinateQueryResults`).

        The returned object cannot, in general, be ``zip``-iterated with
        ``self``: its ordering may differ and it may contain (or omit)
        duplicates.
        """
        raise NotImplementedError()

476 

477 

class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `~collections.abc.Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `~collections.abc.Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    # Bug fix: "_doomed_by" was previously missing from __slots__ even though
    # __init__ assigns it; that only worked because ancestor classes lack
    # __slots__ (so instances still carry a __dict__), which silently defeated
    # the declaration for that attribute.
    __slots__ = ("_chain", "_doomed_by")

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    def __iter__(self) -> Iterator[DatasetRef]:
        # Lazily walk each child results object in order.
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain], self._doomed_by)

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        # Docstring inherited.  Total is the sum over all child results.
        return sum(r.count(exact=exact, discard=discard) for r in self._chain)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited.  True if any child query has results; `any`
        # short-circuits, so later children may not be executed.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        # Docstring inherited.  Our own doom messages come first, followed by
        # each child's explanations in chain order.
        result = list(self._doomed_by)
        for r in self._chain:
            result.extend(r.explain_no_results(execute=execute))
        return result

527 

528 

class DimensionRecordQueryResults(QueryResultsBase, Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @property
    @abstractmethod
    def element(self) -> DimensionElement:
        """The dimension element whose records this query returns
        (`DimensionElement`).
        """
        raise NotImplementedError()

    @abstractmethod
    def run(self) -> DimensionRecordQueryResults:
        """Execute this query and return a results object.

        NOTE(review): the concrete semantics are not visible from this
        abstract declaration — presumably this executes the query eagerly
        and returns a results object backed by the fetched records; confirm
        against the concrete implementations.
        """
        raise NotImplementedError()