# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager, ExitStack, contextmanager
from typing import Any, Optional

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`."""



class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method that creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
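
    Examples
    --------
    A typical instance comes from `Registry.queryDataIds` rather than direct
    construction; iterating then executes the query (an illustrative
    sketch)::

        dataIds = registry.queryDataIds(["visit", "detector"])
        for dataId in dataIds:
            ...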

92 """ 

93 

94 def __init__( 

95 self, 

96 db: Database, 

97 query_factory: QueryFactoryMethod, 

98 graph: DimensionGraph, 

99 *, 

100 order_by: Iterable[str] | None = None, 

101 limit: tuple[int, int | None] | None = None, 

102 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

103 ): 

104 self._db = db 

105 self._query_factory = query_factory 

106 self._graph = graph 

107 self._order_by = order_by 

108 self._limit = limit 

109 self._records = records 

110 self._cached_query: Query | None = None 

111 

112 __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query") 

113 

114 @classmethod 

115 def from_query( 

116 cls, 

117 db: Database, 

118 query: Query, 

119 graph: DimensionGraph, 

120 *, 

121 order_by: Iterable[str] | None = None, 

122 limit: tuple[int, int | None] | None = None, 

123 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

124 ) -> DataCoordinateQueryResults: 

125 """Make an instance from a pre-existing query instead of a factory. 

126 

127 Parameters 

128 ---------- 

129 db : `Database` 

130 Database engine used to execute queries. 

131 query : `Query` 

132 Low-level representation of the query that backs this result 

133 object. 

134 graph : `DimensionGraph` 

135 Dimensions used by query. 

136 order_by : `Iterable` [ `str` ], optional 

137 Optional sequence of column names used for result ordering. 

138 limit : `Tuple` [ `int`, `int` ], optional 

139 Limit for the number of returned records and optional offset. 

140 records : `Mapping`, optional 

141 A nested mapping containing `DimensionRecord` objects for all 

142 dimensions and all data IDs this query will yield. If `None` 

143 (default), `DataCoordinateIterable.hasRecords` will return `False`. 

144 The outer mapping has `str` keys (the names of dimension elements). 

145 The inner mapping has `tuple` keys representing data IDs (tuple 

146 conversions of `DataCoordinate.values()`) and `DimensionRecord` 

147 values. 

148 """ 

149 

150 def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query: 

151 return query 

152 

153 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records) 

154 

155 def __iter__(self) -> Iterator[DataCoordinate]: 

156 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db)) 

157 

158 def __repr__(self) -> str: 

159 return f"<DataCoordinate iterator with dimensions={self._graph}>" 

160 

161 def _clone( 

162 self, 

163 *, 

164 query_factory: QueryFactoryMethod | None = None, 

165 query: Query | None = None, 

166 graph: DimensionGraph | None = None, 

167 order_by: Iterable[str] | None = None, 

168 limit: tuple[int, int | None] | None = None, 

169 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

170 ) -> DataCoordinateQueryResults: 

171 """Clone this instance potentially updating some attributes.""" 

172 graph = graph if graph is not None else self._graph 

173 order_by = order_by if order_by is not None else self._order_by 

174 limit = limit if limit is not None else self._limit 

175 records = records if records is not None else self._records 

176 if query is None: 

177 query_factory = query_factory or self._query_factory 

178 return DataCoordinateQueryResults( 

179 self._db, query_factory, graph, order_by=order_by, limit=limit, records=records 

180 ) 

181 else: 

182 return DataCoordinateQueryResults.from_query( 

183 self._db, query, graph, order_by=order_by, limit=limit, records=records 

184 ) 

185 

186 @property 

187 def _query(self) -> Query: 

188 """Query representation instance (`Query`)""" 

189 if self._cached_query is None: 

190 self._cached_query = self._query_factory(self._order_by, self._limit) 

191 assert ( 

192 self._cached_query.datasetType is None 

193 ), "Query used to initialize data coordinate results should not have any datasets." 

194 return self._cached_query 

195 

196 @property 

197 def graph(self) -> DimensionGraph: 

198 # Docstring inherited from DataCoordinateIterable. 

199 return self._graph 

200 

201 def hasFull(self) -> bool: 

202 # Docstring inherited from DataCoordinateIterable. 

203 return True 

204 

205 def hasRecords(self) -> bool: 

206 # Docstring inherited from DataCoordinateIterable. 
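        # An empty dimension graph implies the trivial data ID, which has no
        # records to attach, so it is treated as already expanded.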

        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.backend.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: DimensionGraph | None = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: DatasetType | str, collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
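
        Examples
        --------
        An illustrative sketch (the dataset type and collection names are
        hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/RC2"])
            for ref in refs:
                ...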

398 """ 

399 parent_dataset_type, components = self._query.backend.resolve_single_dataset_type_wildcard( 

400 datasetType, explicit_only=True 

401 ) 

402 if not parent_dataset_type.dimensions.issubset(self.graph): 

403 raise ValueError( 

404 f"findDatasets requires that the dataset type have only dimensions in " 

405 f"the DataCoordinateQueryResult used as input to the search, but " 

406 f"{parent_dataset_type.name} has dimensions {parent_dataset_type.dimensions}, " 

407 f"while the input dimensions are {self.graph}." 

408 ) 

409 summary = QuerySummary( 

410 self.graph, whereRegion=self._query.whereRegion, datasets=[parent_dataset_type] 

411 ) 

412 builder = self._query.makeBuilder(summary) 

413 builder.joinDataset(parent_dataset_type, collections=collections, findFirst=findFirst) 

414 query = builder.finish(joinMissing=False) 

415 return ParentDatasetQueryResults( 

416 db=self._db, 

417 query=query, 

418 components=components, 

419 records=self._records, 

420 datasetType=parent_dataset_type, 

421 ) 

422 

423 def count(self, *, exact: bool = True) -> int: 

424 """Count the number of rows this query would return. 

425 

426 Parameters 

427 ---------- 

428 exact : `bool`, optional 

429 If `True`, run the full query and perform post-query filtering if 

430 needed to account for that filtering in the count. If `False`, the 

431 result may be an upper bound. 

432 

433 Returns 

434 ------- 

435 count : `int` 

436 The number of rows the query would return, or an upper bound if 

437 ``exact=False``. 

438 

439 Notes 

440 ----- 

441 This counts the number of rows returned, not the number of unique rows 

442 returned, so even with ``exact=True`` it may provide only an upper 

443 bound on the number of *deduplicated* result rows. 

444 """ 

445 return self._query.count(self._db, exact=exact) 

446 

447 def any( 

448 self, 

449 *, 

450 execute: bool = True, 

451 exact: bool = True, 

452 ) -> bool: 

453 """Test whether this query returns any results. 

454 

455 Parameters 

456 ---------- 

457 execute : `bool`, optional 

458 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

459 determined prior to execution that the query would return no rows. 

460 exact : `bool`, optional 

461 If `True`, run the full query and perform post-query filtering if 

462 needed, until at least one result row is found. If `False`, the 

463 returned result does not account for post-query filtering, and 

464 hence may be `True` even when all result rows would be filtered 

465 out. 

466 

467 Returns 

468 ------- 

469 any : `bool` 

470 `True` if the query would (or might, depending on arguments) yield 

471 result rows. `False` if it definitely would not. 

472 """ 

473 return self._query.any(self._db, execute=execute, exact=exact) 

474 

475 def explain_no_results(self) -> Iterable[str]: 

476 """Return human-readable messages that may help explain why the query 

477 yields no results. 

478 

479 Returns 

480 ------- 

481 messages : `Iterable` [ `str` ] 

482 String messages that describe reasons the query might not yield any 

483 results. 

484 

485 Notes 

486 ----- 

487 Messages related to post-query filtering are only available if the 

488 iterator has been exhausted, or if `any` or `count` was already called 

489 (with ``exact=True`` for the latter two). 

490 

491 This method first yields messages that are generated while the query is 

492 being built or filtered, but may then proceed to diagnostics generated 

493 by performing what should be inexpensive follow-up queries. Callers 

494 can short-circuit this at any time by simplying not iterating further. 
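
        Examples
        --------
        Report diagnostics when a query comes back empty (an illustrative
        sketch)::

            if not dataIds.any():
                for message in dataIds.explain_no_results():
                    print(message)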

495 """ 

496 return self._query.explain_no_results(self._db) 

497 

498 def order_by(self, *args: str) -> DataCoordinateQueryResults: 

499 """Make the iterator return ordered result. 

500 

501 Parameters 

502 ---------- 

503 *args : `str` 

504 Names of the columns/dimensions to use for ordering. Column name 

505 can be prefixed with minus (``-``) to use descending ordering. 

506 

507 Returns 

508 ------- 

509 result : `DataCoordinateQueryResults` 

510 Returns ``self`` instance which is updated to return ordered 

511 result. 

512 

513 Notes 

514 ----- 

515 This method modifies the iterator in place and returns the same 

516 instance to support method chaining. 
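
        Examples
        --------
        Descending ordering on a dimension key (an illustrative sketch)::

            for dataId in registry.queryDataIds(["visit"]).order_by("-visit"):
                ...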

517 """ 

518 return self._clone(order_by=args) 

519 

520 def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults: 

521 """Make the iterator return limited number of records. 

522 

523 Parameters 

524 ---------- 

525 limit : `int` 

526 Upper limit on the number of returned records. 

527 offset : `int` or `None` 

528 If not `None` then the number of records to skip before returning 

529 ``limit`` records. 

530 

531 Returns 

532 ------- 

533 result : `DataCoordinateQueryResults` 

534 Returns ``self`` instance which is updated to return limited set 

535 of records. 

536 

537 Notes 

538 ----- 

539 This method modifies the iterator in place and returns the same 

540 instance to support method chaining. Normally this method is used 

541 together with `order_by` method. 
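
        Examples
        --------
        Paging through ordered results (an illustrative sketch)::

            page = registry.queryDataIds(["visit"]).order_by("visit").limit(100, offset=200)
            for dataId in page:
                ...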

542 """ 

543 return self._clone(limit=(limit, offset)) 

544 

545 

546class DatasetQueryResults(Iterable[DatasetRef]): 

547 """An interface for objects that represent the results of queries for 

548 datasets. 

549 """ 

550 

551 @abstractmethod 

552 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

553 """Group results by parent dataset type. 

554 

555 Returns 

556 ------- 

557 iter : `Iterator` [ `ParentDatasetQueryResults` ] 

558 An iterator over `DatasetQueryResults` instances that are each 

559 responsible for a single parent dataset type (either just that 

560 dataset type, one or more of its component dataset types, or both). 
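
        Examples
        --------
        Process each parent dataset type separately (an illustrative
        sketch)::

            for parent_results in results.byParentDatasetType():
                print(parent_results.parentDatasetType.name)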

561 """ 

562 raise NotImplementedError() 

563 

564 @abstractmethod 

565 def materialize(self) -> AbstractContextManager[DatasetQueryResults]: 

566 """Insert this query's results into a temporary table. 

567 

568 Returns 

569 ------- 

570 context : `typing.ContextManager` [ `DatasetQueryResults` ] 

571 A context manager that ensures the temporary table is created and 

572 populated in ``__enter__`` (returning a results object backed by 

573 that table), and dropped in ``__exit__``. If ``self`` is already 

574 materialized, the context manager may do nothing (reflecting the 

575 fact that an outer context manager should already take care of 

576 everything else). 

577 """ 

578 raise NotImplementedError() 

579 

580 @abstractmethod 

581 def expanded(self) -> DatasetQueryResults: 

582 """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords` 

583 returns `True` for all data IDs in returned `DatasetRef` objects. 

584 

585 Returns 

586 ------- 

587 expanded : `DatasetQueryResults` 

588 Either a new `DatasetQueryResults` instance or ``self``, if it is 

589 already expanded. 

590 

591 Notes 

592 ----- 

593 As with `DataCoordinateQueryResults.expanded`, it may be more efficient 

594 to call `materialize` before expanding data IDs for very large result 

595 sets. 

596 """ 

597 raise NotImplementedError() 

598 

599 @abstractmethod 

600 def count(self, *, exact: bool = True) -> int: 

601 """Count the number of rows this query would return. 

602 

603 Parameters 

604 ---------- 

605 exact : `bool`, optional 

606 If `True`, run the full query and perform post-query filtering if 

607 needed to account for that filtering in the count. If `False`, the 

608 result may be an upper bound. 

609 

610 Returns 

611 ------- 

612 count : `int` 

613 The number of rows the query would return, or an upper bound if 

614 ``exact=False``. 

615 

616 Notes 

617 ----- 

618 This counts the number of rows returned, not the number of unique rows 

619 returned, so even with ``exact=True`` it may provide only an upper 

620 bound on the number of *deduplicated* result rows. 

621 """ 

622 raise NotImplementedError() 

623 

624 @abstractmethod 

625 def any( 

626 self, 

627 *, 

628 execute: bool = True, 

629 exact: bool = True, 

630 ) -> bool: 

631 """Test whether this query returns any results. 

632 

633 Parameters 

634 ---------- 

635 execute : `bool`, optional 

636 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

637 determined prior to execution that the query would return no rows. 

638 exact : `bool`, optional 

639 If `True`, run the full query and perform post-query filtering if 

640 needed, until at least one result row is found. If `False`, the 

641 returned result does not account for post-query filtering, and 

642 hence may be `True` even when all result rows would be filtered 

643 out. 

644 

645 Returns 

646 ------- 

647 any : `bool` 

648 `True` if the query would (or might, depending on arguments) yield 

649 result rows. `False` if it definitely would not. 

650 """ 

651 raise NotImplementedError() 

652 

653 @abstractmethod 

654 def explain_no_results(self) -> Iterable[str]: 

655 """Return human-readable messages that may help explain why the query 

656 yields no results. 

657 

658 Returns 

659 ------- 

660 messages : `Iterable` [ `str` ] 

661 String messages that describe reasons the query might not yield any 

662 results. 

663 

664 Notes 

665 ----- 

666 Messages related to post-query filtering are only available if the 

667 iterator has been exhausted, or if `any` or `count` was already called 

668 (with ``exact=True`` for the latter two). 

669 

670 This method first yields messages that are generated while the query is 

671 being built or filtered, but may then proceed to diagnostics generated 

672 by performing what should be inexpensive follow-up queries. Callers 

673 can short-circuit this at any time by simplying not iterating further. 

674 """ 

675 raise NotImplementedError() 

676 

677 

class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as the outer keys, ``tuple(record.dataId.values())`` as the inner
        keys, and `DimensionRecord` instances as the inner values (where
        ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[str | None],
        records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None,
        datasetType: DatasetType | None = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)



class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain], self._doomed_by
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain], self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterable[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by



class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return ordered results.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return a limited set of records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. It is normally used together with the `order_by` method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterable` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()



class _DimensionRecordKey:
    """Class for objects used as keys when ordering `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending ordering,
        `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
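
    Examples
    --------
    Sort ascending by one attribute and descending by another (an
    illustrative sketch; the attribute names depend on the dimension
    element)::

        records.sort(key=lambda r: _DimensionRecordKey(
            ["day_obs", "id"], [True, False], r))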

1023 """ 

1024 

1025 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord): 

1026 self.attributes = attributes 

1027 self.ordering = ordering 

1028 self.rec = record 

1029 

1030 def _cmp(self, other: _DimensionRecordKey) -> int: 

1031 """Compare two records using provided comparison operator. 

1032 

1033 Parameters 

1034 ---------- 

1035 other : `_DimensionRecordKey` 

1036 Key for other record. 

1037 

1038 Returns 

1039 ------- 

1040 result : `int` 

1041 0 if keys are identical, negative if ``self`` is ordered before 

1042 ``other``, positive otherwise. 

1043 """ 

1044 for attribute, ordering in zip(self.attributes, self.ordering): 

1045 # timespan.begin/end cannot use getattr 

1046 attrgetter = operator.attrgetter(attribute) 

1047 lhs = attrgetter(self.rec) 

1048 rhs = attrgetter(other.rec) 

1049 if not ordering: 

1050 lhs, rhs = rhs, lhs 

1051 if lhs != rhs: 

1052 return 1 if lhs > rhs else -1 

1053 return 0 

1054 

1055 def __lt__(self, other: _DimensionRecordKey) -> bool: 

1056 return self._cmp(other) < 0 

1057 

1058 def __gt__(self, other: _DimensionRecordKey) -> bool: 

1059 return self._cmp(other) > 0 

1060 

1061 def __eq__(self, other: Any) -> bool: 

1062 if not isinstance(other, _DimensionRecordKey): 

1063 return NotImplemented 

1064 return self._cmp(other) == 0 

1065 

1066 def __le__(self, other: _DimensionRecordKey) -> bool: 

1067 return self._cmp(other) <= 0 

1068 

1069 def __ge__(self, other: _DimensionRecordKey) -> bool: 

1070 return self._cmp(other) >= 0 

1071 

1072 

1073class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults): 

1074 """Implementation of DimensionRecordQueryResults using database query. 

1075 

1076 Parameters 

1077 ---------- 

1078 dataIds : `DataCoordinateQueryResults` 

1079 Iterator for DataIds. 

1080 recordStorage : `DimensionRecordStorage` 

1081 Instance of storage class for dimension records. 

1082 """ 

1083 

1084 def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage): 

1085 self._dataIds = dataIds 

1086 self._recordStorage = recordStorage 

1087 self._order_by: Iterable[str] = () 

1088 

1089 def __iter__(self) -> Iterator[DimensionRecord]: 

1090 # LIMIT is already applied at DataCoordinateQueryResults level 

1091 # (assumption here is that if DataId exists then dimension record 

1092 # exists too and their counts must be equal). fetch() does not 

1093 # guarantee ordering, so we need to sort records in memory below. 

1094 recordIter = self._recordStorage.fetch(self._dataIds) 

1095 if not self._order_by: 

1096 return iter(recordIter) 

1097 

1098 # Parse list of column names and build a list of attribute name for 

1099 # ordering. Note that here we only support ordering by direct 

1100 # attributes of the element, and not other elements from the dimension 

1101 # graph. 

1102 orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element) 

1103 attributes: list[str] = [] 

1104 ordering: list[bool] = [] 

1105 for column in orderBy.order_by_columns: 

1106 if column.column is None: 

1107 assert isinstance(column.element, Dimension), "Element must be a Dimension" 

1108 attributes.append(column.element.primaryKey.name) 

1109 else: 

1110 attributes.append(column.column) 

1111 ordering.append(column.ordering) 

1112 

1113 def _key(record: DimensionRecord) -> _DimensionRecordKey: 

1114 return _DimensionRecordKey(attributes, ordering, record) 

1115 

1116 records = sorted(recordIter, key=_key) 

1117 return iter(records) 

1118 

1119 def count(self, *, exact: bool = True) -> int: 

1120 # Docstring inherited from base class. 

1121 return self._dataIds.count(exact=exact) 

1122 

1123 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

1124 # Docstring inherited from base class. 

1125 return self._dataIds.any(execute=execute, exact=exact) 

1126 

1127 def order_by(self, *args: str) -> DimensionRecordQueryResults: 

1128 # Docstring inherited from base class. 

1129 self._dataIds = self._dataIds.order_by(*args) 

1130 self._order_by = args 

1131 return self 

1132 

1133 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults: 

1134 # Docstring inherited from base class. 

1135 self._dataIds = self._dataIds.limit(limit, offset) 

1136 return self 

1137 

1138 def explain_no_results(self) -> Iterable[str]: 

1139 # Docstring inherited. 

1140 return self._dataIds.explain_no_results()