Coverage for python/lsst/daf/butler/registry/queries/_results.py: 32% of 273 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager, ExitStack, contextmanager
from typing import Any, Optional

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query]
"""Type of the query factory callable used by `DataCoordinateQueryResults`:
it accepts optional ``order_by`` column names and an optional
``(limit, offset)`` tuple, and returns a `Query`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Query | None = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `tuple` [ `int`, `int` ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of
            dimension elements). The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A new results object backed by the given query.
        """

        def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)
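
    # A minimal sketch of the difference between the two constructors,
    # assuming ``db``, ``query``, and ``graph`` are already in hand (these
    # objects are normally created only inside `Registry` methods):
    #
    #     lazy = DataCoordinateQueryResults(db, make_query, graph)
    #     eager = DataCoordinateQueryResults.from_query(db, query, graph)
    #
    # The factory form lets ``order_by``/``limit`` be pushed down into query
    # construction; the factory made by ``from_query`` ignores them, since
    # its query is already built.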

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: QueryFactoryMethod | None = None,
        query: Query | None = None,
        graph: DimensionGraph | None = None,
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.backend.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: DimensionGraph | None = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
        components: bool | None = None,
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well. If `False`, never apply patterns to
            components. If `None` (default), apply patterns to components
            only if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.

            Values other than `False` are deprecated, and only `False` will
            be supported after v26. After v27 this argument will be removed
            entirely.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have
            records as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        parent_dataset_type, components_found = self._query.backend.resolve_single_dataset_type_wildcard(
            datasetType, components=components, explicit_only=True
        )
        if not parent_dataset_type.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have only dimensions in "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{parent_dataset_type.name} has dimensions {parent_dataset_type.dimensions}, "
                f"while the input dimensions are {self.graph}."
            )
        summary = QuerySummary(
            self.graph, whereRegion=self._query.whereRegion, datasets=[parent_dataset_type]
        )
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(parent_dataset_type, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db,
            query=query,
            components=components_found,
            records=self._records,
            datasetType=parent_dataset_type,
        )
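
    # A short usage sketch (the dataset type and collection names here are
    # hypothetical; real names depend on the repository contents):
    #
    #     dataIds = registry.queryDataIds(["exposure", "detector"])
    #     for ref in dataIds.findDatasets("raw", collections=["HSC/raw/all"]):
    #         print(ref.dataId, ref.run)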

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)
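
    # The ``exact`` flag trades accuracy for speed. A sketch of the intended
    # pattern (``dataIds`` as in the earlier sketch is an assumption):
    #
    #     if not dataIds.any(execute=True, exact=False):
    #         ...  # definitely empty; skip the expensive work
    #     n_max = dataIds.count(exact=False)  # cheap upper bound
    #     n = dataIds.count(exact=True)       # runs the full query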

    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        return self._query.explain_no_results(self._db)
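
    # Typical diagnostic use (a sketch; assumes the result set came back
    # empty):
    #
    #     if not dataIds.any():
    #         for message in dataIds.explain_no_results():
    #             print(message)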

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose iteration is ordered as requested.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self``, which supports method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns a limited set of records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self``, which supports method chaining. Normally this method is
        used together with the `order_by` method.
        """
        return self._clone(limit=(limit, offset))
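
    # Chaining sketch (the dimension name is an assumption; ordering should
    # be applied before limiting for deterministic paging):
    #
    #     for dataId in registry.queryDataIds(["visit"]).order_by("-visit").limit(5):
    #         ...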


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
        """
        raise NotImplementedError()
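
    # Grouping sketch (``refs`` stands in for any ``DatasetQueryResults``
    # instance, e.g. from a hypothetical ``registry.queryDatasets(...)``):
    #
    #     for byType in refs.byParentDatasetType():
    #         print(byType.parentDatasetType.name, byType.count(exact=False))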

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results. ``query.datasetType``
        will be the parent dataset type for this object, and may not be
        `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[str | None],
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
        datasetType: DatasetType | None = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )
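
    # Component-selection sketch (the dataset type and component names here
    # are hypothetical):
    #
    #     refs = dataIds.findDatasets("calexp", collections=...)
    #     for ref in refs.withComponents(["wcs", None]):
    #         ...  # yields "calexp.wcs" component refs and parent "calexp" refs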

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return ordered results; this may be
            ``self``.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return a limited set of records; this
            may be ``self``.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. Normally this method is used together with the `order_by`
        method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when sorting `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending ordering,
        `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare the wrapped record to another record, attribute by
        attribute.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if keys are identical, negative if ``self`` is ordered before
            ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Use attrgetter rather than plain getattr because attributes
            # like "timespan.begin" are dotted paths.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                # Descending ordering: swap the operands.
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0
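
# Sorting sketch: records ordered by ascending "instrument" then descending
# "id" (the attribute names are hypothetical; real ones come from the
# dimension element schema):
#
#     key = lambda r: _DimensionRecordKey(["instrument", "id"], [True, False], r)
#     ordered = sorted(records, key=key)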


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for data IDs.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its dimension
        # record exists too, so their counts must be equal). fetch() does not
        # guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering. Note that here we only support ordering by direct
        # attributes of the element, and not by other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: list[str] = []
        ordering: list[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()
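
# End-to-end sketch of the record-query flow (the ``registry`` call and
# dimension/attribute names are assumptions about the surrounding API):
#
#     records = registry.queryDimensionRecords("detector", instrument="HSC")
#     for record in records.order_by("-id").limit(3):
#         print(record.id, record.full_name)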