Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33% (280 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
        Limit on the number of returned records, with an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does nothing; the query is not
    executed until it is iterated over (or some other operation is performed
    that involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
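
    For example, a lazy-iteration sketch (``registry`` and the dimension
    names here are assumptions for illustration, not defined in this
    module)::

        dataIds = registry.queryDataIds(["exposure", "detector"])
        for dataId in dataIds:  # the query executes on first iteration
            ...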

103 """ 

104 

105 def __init__( 

106 self, 

107 db: Database, 

108 query_factory: QueryFactoryMethod, 

109 graph: DimensionGraph, 

110 *, 

111 order_by: Optional[Iterable[str]] = None, 

112 limit: Optional[Tuple[int, Optional[int]]] = None, 

113 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

114 ): 

115 self._db = db 

116 self._query_factory = query_factory 

117 self._graph = graph 

118 self._order_by = order_by 

119 self._limit = limit 

120 self._records = records 

121 self._cached_query: Optional[Query] = None 

122 

123 __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query") 

124 

125 @classmethod 

126 def from_query( 

127 cls, 

128 db: Database, 

129 query: Query, 

130 graph: DimensionGraph, 

131 *, 

132 order_by: Optional[Iterable[str]] = None, 

133 limit: Optional[Tuple[int, Optional[int]]] = None, 

134 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

135 ) -> DataCoordinateQueryResults: 

136 """Make an instance from a pre-existing query instead of a factory. 

137 

138 Parameters 

139 ---------- 

140 db : `Database` 

141 Database engine used to execute queries. 

142 query : `Query` 

143 Low-level representation of the query that backs this result 

144 object. 

145 graph : `DimensionGraph` 

146 Dimensions used by query. 

147 order_by : `Iterable` [ `str` ], optional 

148 Optional sequence of column names used for result ordering. 

149 limit : `Tuple` [ `int`, `int` ], optional 

150 Limit for the number of returned records and optional offset. 

151 records : `Mapping`, optional 

152 A nested mapping containing `DimensionRecord` objects for all 

153 dimensions and all data IDs this query will yield. If `None` 

154 (default), `DataCoordinateIterable.hasRecords` will return `False`. 

155 The outer mapping has `str` keys (the names of dimension elements). 

156 The inner mapping has `tuple` keys representing data IDs (tuple 

157 conversions of `DataCoordinate.values()`) and `DimensionRecord` 

158 values. 

159 """ 

160 

161 def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query: 

162 return query 

163 

164 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records) 

165 

166 def __iter__(self) -> Iterator[DataCoordinate]: 

167 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db)) 

168 

169 def __repr__(self) -> str: 

170 return f"<DataCoordinate iterator with dimensions={self._graph}>" 

171 

172 def _clone( 

173 self, 

174 *, 

175 query_factory: Optional[QueryFactoryMethod] = None, 

176 query: Optional[Query] = None, 

177 graph: Optional[DimensionGraph] = None, 

178 order_by: Optional[Iterable[str]] = None, 

179 limit: Optional[Tuple[int, Optional[int]]] = None, 

180 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

181 ) -> DataCoordinateQueryResults: 

182 """Clone this instance potentially updating some attributes.""" 

183 graph = graph if graph is not None else self._graph 

184 order_by = order_by if order_by is not None else self._order_by 

185 limit = limit if limit is not None else self._limit 

186 records = records if records is not None else self._records 

187 if query is None: 

188 query_factory = query_factory or self._query_factory 

189 return DataCoordinateQueryResults( 

190 self._db, query_factory, graph, order_by=order_by, limit=limit, records=records 

191 ) 

192 else: 

193 return DataCoordinateQueryResults.from_query( 

194 self._db, query, graph, order_by=order_by, limit=limit, records=records 

195 ) 

196 

197 @property 

198 def _query(self) -> Query: 

199 """Query representation instance (`Query`)""" 

200 if self._cached_query is None: 

201 self._cached_query = self._query_factory(self._order_by, self._limit) 

202 assert ( 

203 self._cached_query.datasetType is None 

204 ), "Query used to initialize data coordinate results should not have any datasets." 

205 return self._cached_query 

206 

207 @property 

208 def graph(self) -> DimensionGraph: 

209 # Docstring inherited from DataCoordinateIterable. 

210 return self._graph 

211 

212 def hasFull(self) -> bool: 

213 # Docstring inherited from DataCoordinateIterable. 

214 return True 

215 

216 def hasRecords(self) -> bool: 

217 # Docstring inherited from DataCoordinateIterable. 

218 return self._records is not None or not self._graph 

219 

220 @contextmanager 

221 def materialize(self) -> Iterator[DataCoordinateQueryResults]: 

222 """Insert this query's results into a temporary table. 

223 

224 Returns 

225 ------- 

226 context : `typing.ContextManager` [ `DataCoordinateQueryResults` ] 

227 A context manager that ensures the temporary table is created and 

228 populated in ``__enter__`` (returning a results object backed by 

229 that table), and dropped in ``__exit__``. If ``self`` is already 

230 materialized, the context manager may do nothing (reflecting the 

231 fact that an outer context manager should already take care of 

232 everything else). 

233 

234 Notes 

235 ----- 

236 When using a very large result set to perform multiple queries (e.g. 

237 multiple calls to `subset` with different arguments, or even a single 

238 call to `expanded`), it may be much more efficient to start by 

239 materializing the query and only then performing the follow up queries. 

240 It may also be less efficient, depending on how well database engine's 

241 query optimizer can simplify those particular follow-up queries and 

242 how efficiently it caches query results even when the are not 

243 explicitly inserted into a temporary table. See `expanded` and 

244 `subset` for examples. 

245 """ 

246 with self._query.materialize(self._db) as materialized: 

247 # Note that we depend on order_by columns to be passes from Query 

248 # to MaterializedQuery, so order_by and limit are not used. 

249 yield self._clone(query=materialized) 

250 

251 def expanded(self) -> DataCoordinateQueryResults: 

252 """Return a results object for which `hasRecords` returns `True`. 

253 

254 This method may involve actually executing database queries to fetch 

255 `DimensionRecord` objects. 

256 

257 Returns 

258 ------- 

259 results : `DataCoordinateQueryResults` 

260 A results object for which `hasRecords` returns `True`. May be 

261 ``self`` if that is already the case. 

262 

263 Notes 

264 ----- 

265 For very result sets, it may be much more efficient to call 

266 `materialize` before calling `expanded`, to avoid performing the 

267 original query multiple times (as a subquery) in the follow-up queries 

268 that fetch dimension records. For example:: 

269 

270 with registry.queryDataIds(...).materialize() as tempDataIds: 

271 dataIdsWithRecords = tempDataIds.expanded() 

272 for dataId in dataIdsWithRecords: 

273 ... 

274 """ 

275 if self._records is None: 

276 records = {} 

277 for element in self.graph.elements: 

278 subset = self.subset(graph=element.graph, unique=True) 

279 records[element.name] = { 

280 tuple(record.dataId.values()): record 

281 for record in self._query.managers.dimensions[element].fetch(subset) 

282 } 

283 

284 return self._clone(query=self._query, records=records) 

285 else: 

286 return self 

287 

288 def subset( 

289 self, graph: Optional[DimensionGraph] = None, *, unique: bool = False 

290 ) -> DataCoordinateQueryResults: 

291 """Return a results object containing a subset of the dimensions of 

292 this one, and/or a unique near-subset of its rows. 

293 

294 This method may involve actually executing database queries to fetch 

295 `DimensionRecord` objects. 

296 

297 Parameters 

298 ---------- 

299 graph : `DimensionGraph`, optional 

300 Dimensions to include in the new results object. If `None`, 

301 ``self.graph`` is used. 

302 unique : `bool`, optional 

303 If `True` (`False` is default), the query should only return unique 

304 data IDs. This is implemented in the database; to obtain unique 

305 results via Python-side processing (which may be more efficient in 

306 some cases), use `toSet` to construct a `DataCoordinateSet` from 

307 this results object instead. 

308 

309 Returns 

310 ------- 

311 results : `DataCoordinateQueryResults` 

312 A results object corresponding to the given criteria. May be 

313 ``self`` if it already qualifies. 

314 

315 Raises 

316 ------ 

317 ValueError 

318 Raised when ``graph`` is not a subset of the dimension graph in 

319 this result. 

320 

321 Notes 

322 ----- 

323 This method can only return a "near-subset" of the original result rows 

324 in general because of subtleties in how spatial overlaps are 

325 implemented; see `Query.subset` for more information. 

326 

327 When calling `subset` multiple times on the same very large result set, 

328 it may be much more efficient to call `materialize` first. For 

329 example:: 

330 

331 dimensions1 = DimensionGraph(...) 

332 dimensions2 = DimensionGraph(...) 

333 with registry.queryDataIds(...).materialize() as tempDataIds: 

334 for dataId1 in tempDataIds.subset( 

335 graph=dimensions1, 

336 unique=True): 

337 ... 

338 for dataId2 in tempDataIds.subset( 

339 graph=dimensions2, 

340 unique=True): 

341 ... 

342 """ 

343 if graph is None: 

344 graph = self.graph 

345 if not graph.issubset(self.graph): 

346 raise ValueError(f"{graph} is not a subset of {self.graph}") 

347 if graph == self.graph and (not unique or self._query.isUnique()): 

348 return self 

349 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] 

350 if self._records is not None: 

351 records = {element.name: self._records[element.name] for element in graph.elements} 

352 else: 

353 records = None 

354 query = self._query.subset(graph=graph, datasets=False, unique=unique) 

355 

356 return self._clone(graph=graph, query=query, records=records) 

357 

358 def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None: 

359 # Docstring inherited from DataCoordinateIterable. 

360 sql = self._query.sql 

361 if sql is not None: 

362 fromClause = sql.alias("c") 

363 query.join( 

364 fromClause, 

365 onclause=sqlalchemy.sql.and_( 

366 *[ 

367 columns(dimension.name) == fromClause.columns[dimension.name] 

368 for dimension in self.graph.required 

369 ] 

370 ), 

371 ) 

372 

373 def findDatasets( 

374 self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True 

375 ) -> DatasetQueryResults: 

376 """Find datasets using the data IDs identified by this query. 

377 

378 Parameters 

379 ---------- 

380 datasetType : `DatasetType` or `str` 

381 Dataset type or the name of one to search for. Must have 

382 dimensions that are a subset of ``self.graph``. 

383 collections : `Any` 

384 An expression that fully or partially identifies the collections 

385 to search for the dataset, such as a `str`, `re.Pattern`, or 

386 iterable thereof. ``...`` can be used to return all collections. 

387 See :ref:`daf_butler_collection_expressions` for more information. 

388 findFirst : `bool`, optional 

389 If `True` (default), for each result data ID, only yield one 

390 `DatasetRef`, from the first collection in which a dataset of that 

391 dataset type appears (according to the order of ``collections`` 

392 passed in). If `True`, ``collections`` must not contain regular 

393 expressions and may not be ``...``. 

394 

395 Returns 

396 ------- 

397 datasets : `DatasetQueryResults` 

398 A lazy-evaluation object representing dataset query results, 

399 iterable over `DatasetRef` objects. If ``self.hasRecords()``, all 

400 nested data IDs in those dataset references will have records as 

401 well. 

402 

403 Raises 

404 ------ 

405 ValueError 

406 Raised if ``datasetType.dimensions.issubset(self.graph) is False``. 
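
        For example, a sketch of a follow-up dataset search (the dataset type
        and collection names are hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/RC2"])
            for ref in refs:
                ...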

407 """ 

408 if not isinstance(datasetType, DatasetType): 

409 storage = self._query.managers.datasets.find(datasetType) 

410 if storage is None: 

411 return ChainedDatasetQueryResults( 

412 [], 

413 doomed_by=[ 

414 f"Dataset type {datasetType!r} is not registered, so no instances of it can exist in " 

415 "any collection." 

416 ], 

417 ) 

418 else: 

419 datasetType = storage.datasetType 

420 if not datasetType.dimensions.issubset(self.graph): 

421 raise ValueError( 

422 f"findDatasets requires that the dataset type have only dimensions in " 

423 f"the DataCoordinateQueryResult used as input to the search, but " 

424 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input " 

425 f"dimensions are {self.graph}." 

426 ) 

427 if datasetType.isComponent(): 

428 # We were given a true DatasetType instance, but it's a component. 

429 components = [datasetType.component()] 

430 datasetType = datasetType.makeCompositeDatasetType() 

431 else: 

432 components = [None] 

433 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType]) 

434 builder = self._query.makeBuilder(summary) 

435 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst) 

436 query = builder.finish(joinMissing=False) 

437 return ParentDatasetQueryResults( 

438 db=self._db, query=query, components=components, records=self._records, datasetType=datasetType 

439 ) 

440 

441 def count(self, *, exact: bool = True) -> int: 

442 """Count the number of rows this query would return. 

443 

444 Parameters 

445 ---------- 

446 exact : `bool`, optional 

447 If `True`, run the full query and perform post-query filtering if 

448 needed to account for that filtering in the count. If `False`, the 

449 result may be an upper bound. 

450 

451 Returns 

452 ------- 

453 count : `int` 

454 The number of rows the query would return, or an upper bound if 

455 ``exact=False``. 

456 

457 Notes 

458 ----- 

459 This counts the number of rows returned, not the number of unique rows 

460 returned, so even with ``exact=True`` it may provide only an upper 

461 bound on the number of *deduplicated* result rows. 
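
        For example, a cheap upper bound can be requested before deciding
        whether a full materialization is worthwhile (sketch)::

            n = registry.queryDataIds(["visit"]).count(exact=False)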

462 """ 

463 return self._query.count(self._db, exact=exact) 

464 

465 def any( 

466 self, 

467 *, 

468 execute: bool = True, 

469 exact: bool = True, 

470 ) -> bool: 

471 """Test whether this query returns any results. 

472 

473 Parameters 

474 ---------- 

475 execute : `bool`, optional 

476 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

477 determined prior to execution that the query would return no rows. 

478 exact : `bool`, optional 

479 If `True`, run the full query and perform post-query filtering if 

480 needed, until at least one result row is found. If `False`, the 

481 returned result does not account for post-query filtering, and 

482 hence may be `True` even when all result rows would be filtered 

483 out. 

484 

485 Returns 

486 ------- 

487 any : `bool` 

488 `True` if the query would (or might, depending on arguments) yield 

489 result rows. `False` if it definitely would not. 

490 """ 

491 return self._query.any(self._db, execute=execute, exact=exact) 

492 

493 def explain_no_results(self) -> Iterator[str]: 

494 """Return human-readable messages that may help explain why the query 

495 yields no results. 

496 

497 Returns 

498 ------- 

499 messages : `Iterator` [ `str` ] 

500 String messages that describe reasons the query might not yield any 

501 results. 

502 

503 Notes 

504 ----- 

505 Messages related to post-query filtering are only available if the 

506 iterator has been exhausted, or if `any` or `count` was already called 

507 (with ``exact=True`` for the latter two). 

508 

509 This method first yields messages that are generated while the query is 

510 being built or filtered, but may then proceed to diagnostics generated 

511 by performing what should be inexpensive follow-up queries. Callers 

512 can short-circuit this at any time by simplying not iterating further. 
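
        For example, a common diagnostic pattern (sketch)::

            results = registry.queryDataIds(["visit"])
            if not results.any():
                for message in results.explain_no_results():
                    print(message)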

513 """ 

514 return self._query.explain_no_results(self._db) 

515 

516 def order_by(self, *args: str) -> DataCoordinateQueryResults: 

517 """Make the iterator return ordered result. 

518 

519 Parameters 

520 ---------- 

521 *args : `str` 

522 Names of the columns/dimensions to use for ordering. Column name 

523 can be prefixed with minus (``-``) to use descending ordering. 

524 

525 Returns 

526 ------- 

527 result : `DataCoordinateQueryResults` 

528 Returns ``self`` instance which is updated to return ordered 

529 result. 

530 

531 Notes 

532 ----- 

533 This method modifies the iterator in place and returns the same 

534 instance to support method chaining. 

535 """ 

536 return self._clone(order_by=args) 

537 

538 def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults: 

539 """Make the iterator return limited number of records. 

540 

541 Parameters 

542 ---------- 

543 limit : `int` 

544 Upper limit on the number of returned records. 

545 offset : `int` or `None` 

546 If not `None` then the number of records to skip before returning 

547 ``limit`` records. 

548 

549 Returns 

550 ------- 

551 result : `DataCoordinateQueryResults` 

552 Returns ``self`` instance which is updated to return limited set 

553 of records. 

554 

555 Notes 

556 ----- 

557 This method modifies the iterator in place and returns the same 

558 instance to support method chaining. Normally this method is used 

559 together with `order_by` method. 
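
        For example, chaining with `order_by` (sketch)::

            dataIds = registry.queryDataIds(["visit"]).order_by("-visit").limit(5)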

560 """ 

561 return self._clone(limit=(limit, offset)) 

562 

563 

564class DatasetQueryResults(Iterable[DatasetRef]): 

565 """An interface for objects that represent the results of queries for 

566 datasets. 
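
    For example, iterating over results grouped by parent dataset type
    (sketch; `Registry.queryDatasets` returns an instance of this
    interface)::

        results = registry.queryDatasets(..., collections=...)
        for parent_results in results.byParentDatasetType():
            print(parent_results.parentDatasetType)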

567 """ 

568 

569 @abstractmethod 

570 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

571 """Group results by parent dataset type. 

572 

573 Returns 

574 ------- 

575 iter : `Iterator` [ `ParentDatasetQueryResults` ] 

576 An iterator over `DatasetQueryResults` instances that are each 

577 responsible for a single parent dataset type (either just that 

578 dataset type, one or more of its component dataset types, or both). 

579 """ 

580 raise NotImplementedError() 

581 

582 @abstractmethod 

583 def materialize(self) -> ContextManager[DatasetQueryResults]: 

584 """Insert this query's results into a temporary table. 

585 

586 Returns 

587 ------- 

588 context : `typing.ContextManager` [ `DatasetQueryResults` ] 

589 A context manager that ensures the temporary table is created and 

590 populated in ``__enter__`` (returning a results object backed by 

591 that table), and dropped in ``__exit__``. If ``self`` is already 

592 materialized, the context manager may do nothing (reflecting the 

593 fact that an outer context manager should already take care of 

594 everything else). 

595 """ 

596 raise NotImplementedError() 

597 

598 @abstractmethod 

599 def expanded(self) -> DatasetQueryResults: 

600 """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords` 

601 returns `True` for all data IDs in returned `DatasetRef` objects. 

602 

603 Returns 

604 ------- 

605 expanded : `DatasetQueryResults` 

606 Either a new `DatasetQueryResults` instance or ``self``, if it is 

607 already expanded. 

608 

609 Notes 

610 ----- 

611 As with `DataCoordinateQueryResults.expanded`, it may be more efficient 

612 to call `materialize` before expanding data IDs for very large result 

613 sets. 

614 """ 

615 raise NotImplementedError() 

616 

617 @abstractmethod 

618 def count(self, *, exact: bool = True) -> int: 

619 """Count the number of rows this query would return. 

620 

621 Parameters 

622 ---------- 

623 exact : `bool`, optional 

624 If `True`, run the full query and perform post-query filtering if 

625 needed to account for that filtering in the count. If `False`, the 

626 result may be an upper bound. 

627 

628 Returns 

629 ------- 

630 count : `int` 

631 The number of rows the query would return, or an upper bound if 

632 ``exact=False``. 

633 

634 Notes 

635 ----- 

636 This counts the number of rows returned, not the number of unique rows 

637 returned, so even with ``exact=True`` it may provide only an upper 

638 bound on the number of *deduplicated* result rows. 

639 """ 

640 raise NotImplementedError() 

641 

642 @abstractmethod 

643 def any( 

644 self, 

645 *, 

646 execute: bool = True, 

647 exact: bool = True, 

648 ) -> bool: 

649 """Test whether this query returns any results. 

650 

651 Parameters 

652 ---------- 

653 execute : `bool`, optional 

654 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

655 determined prior to execution that the query would return no rows. 

656 exact : `bool`, optional 

657 If `True`, run the full query and perform post-query filtering if 

658 needed, until at least one result row is found. If `False`, the 

659 returned result does not account for post-query filtering, and 

660 hence may be `True` even when all result rows would be filtered 

661 out. 

662 

663 Returns 

664 ------- 

665 any : `bool` 

666 `True` if the query would (or might, depending on arguments) yield 

667 result rows. `False` if it definitely would not. 

668 """ 

669 raise NotImplementedError() 

670 

671 @abstractmethod 

672 def explain_no_results(self) -> Iterator[str]: 

673 """Return human-readable messages that may help explain why the query 

674 yields no results. 

675 

676 Returns 

677 ------- 

678 messages : `Iterator` [ `str` ] 

679 String messages that describe reasons the query might not yield any 

680 results. 

681 

682 Notes 

683 ----- 

684 Messages related to post-query filtering are only available if the 

685 iterator has been exhausted, or if `any` or `count` was already called 

686 (with ``exact=True`` for the latter two). 

687 

688 This method first yields messages that are generated while the query is 

689 being built or filtered, but may then proceed to diagnostics generated 

690 by performing what should be inexpensive follow-up queries. Callers 

691 can short-circuit this at any time by simplying not iterating further. 

692 """ 

693 raise NotImplementedError() 

694 

695 

696class ParentDatasetQueryResults(DatasetQueryResults): 

697 """An object that represents results from a query for datasets with a 

698 single parent `DatasetType`. 

699 

700 Parameters 

701 ---------- 

702 db : `Database` 

703 Database engine to execute queries against. 

704 query : `Query` 

705 Low-level query object that backs these results. ``query.datasetType`` 

706 will be the parent dataset type for this object, and may not be `None`. 

707 components : `Sequence` [ `str` or `None` ] 

708 Names of components to include in iteration. `None` may be included 

709 (at most once) to include the parent dataset type. 

710 records : `Mapping`, optional 

711 Mapping containing `DimensionRecord` objects for all dimensions and 

712 all data IDs this query will yield. If `None` (default), 

713 `DataCoordinate.hasRecords` will return `False` for all nested data 

714 IDs. This is a nested mapping with `str` names of dimension elements 

715 as outer keys, `DimensionRecord` instances as inner values, and 

716 ``tuple(record.dataId.values())`` for the inner keys / outer values 

717 (where ``record`` is the innermost `DimensionRecord` instance). 

718 datasetType : `DatasetType`, optional 

719 Parent dataset type for all datasets returned by this query. If not 

720 provided, ``query.datasetType`` be used, and must not be `None` (as it 

721 is in the case where the query is known to yield no results prior to 

722 execution). 

723 """ 

724 

725 def __init__( 

726 self, 

727 db: Database, 

728 query: Query, 

729 *, 

730 components: Sequence[Optional[str]], 

731 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

732 datasetType: Optional[DatasetType] = None, 

733 ): 

734 self._db = db 

735 self._query = query 

736 self._components = components 

737 self._records = records 

738 if datasetType is None: 

739 datasetType = query.datasetType 

740 assert datasetType is not None, "Query used to initialize dataset results must have a dataset." 

741 assert datasetType.dimensions.issubset( 

742 query.graph 

743 ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}." 

744 self._datasetType = datasetType 

745 

746 __slots__ = ("_db", "_query", "_dimensions", "_components", "_records") 

747 

748 def __iter__(self) -> Iterator[DatasetRef]: 

749 for row in self._query.rows(self._db): 

750 parentRef = self._query.extractDatasetRef(row, records=self._records) 

751 for component in self._components: 

752 if component is None: 

753 yield parentRef 

754 else: 

755 yield parentRef.makeComponentRef(component) 

756 

757 def __repr__(self) -> str: 

758 return f"<DatasetRef iterator for [components of] {self._datasetType.name}>" 

759 

760 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

761 # Docstring inherited from DatasetQueryResults. 

762 yield self 

763 

764 @contextmanager 

765 def materialize(self) -> Iterator[ParentDatasetQueryResults]: 

766 # Docstring inherited from DatasetQueryResults. 

767 with self._query.materialize(self._db) as materialized: 

768 yield ParentDatasetQueryResults( 

769 self._db, materialized, components=self._components, records=self._records 

770 ) 

771 

772 @property 

773 def parentDatasetType(self) -> DatasetType: 

774 """The parent dataset type for all datasets in this iterable 

775 (`DatasetType`). 

776 """ 

777 return self._datasetType 

778 

779 @property 

780 def dataIds(self) -> DataCoordinateQueryResults: 

781 """A lazy-evaluation object representing a query for just the data 

782 IDs of the datasets that would be returned by this query 

783 (`DataCoordinateQueryResults`). 

784 

785 The returned object is not in general `zip`-iterable with ``self``; 

786 it may be in a different order or have (or not have) duplicates. 

787 """ 

788 query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False) 

789 return DataCoordinateQueryResults.from_query( 

790 self._db, 

791 query, 

792 self.parentDatasetType.dimensions, 

793 records=self._records, 

794 ) 

795 

796 def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults: 

797 """Return a new query results object for the same parent datasets but 

798 different components. 

799 

800 components : `Sequence` [ `str` or `None` ] 

801 Names of components to include in iteration. `None` may be 

802 included (at most once) to include the parent dataset type. 
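
        For example, switching iteration from the parent dataset type to a
        single component (sketch; the component name ``"wcs"`` is
        hypothetical)::

            componentResults = parentResults.withComponents(["wcs"])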

803 """ 

804 return ParentDatasetQueryResults( 

805 self._db, self._query, records=self._records, components=components, datasetType=self._datasetType 

806 ) 

807 

808 def expanded(self) -> ParentDatasetQueryResults: 

809 # Docstring inherited from DatasetQueryResults. 

810 if self._records is None: 

811 records = self.dataIds.expanded()._records 

812 return ParentDatasetQueryResults( 

813 self._db, 

814 self._query, 

815 records=records, 

816 components=self._components, 

817 datasetType=self._datasetType, 

818 ) 

819 else: 

820 return self 

821 

822 def count(self, *, exact: bool = True) -> int: 

823 # Docstring inherited. 

824 return len(self._components) * self._query.count(self._db, exact=exact) 

825 

826 def any( 

827 self, 

828 *, 

829 execute: bool = True, 

830 exact: bool = True, 

831 ) -> bool: 

832 # Docstring inherited. 

833 return self._query.any(self._db, execute=execute, exact=exact) 

834 

835 def explain_no_results(self) -> Iterator[str]: 

836 # Docstring inherited. 

837 return self._query.explain_no_results(self._db) 

838 

839 

840class ChainedDatasetQueryResults(DatasetQueryResults): 

841 """A `DatasetQueryResults` implementation that simply chains together 

842 other results objects, each for a different parent dataset type. 

843 

844 Parameters 

845 ---------- 

846 chain : `Sequence` [ `ParentDatasetQueryResults` ] 

847 The underlying results objects this object will chain together. 

848 doomed_by : `Iterable` [ `str` ], optional 

849 A list of messages (appropriate for e.g. logging or exceptions) that 

850 explain why the query is known to return no results even before it is 

851 executed. Queries with a non-empty list will never be executed. 

852 Child results objects may also have their own list. 

853 """ 

854 

855 def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()): 

856 self._chain = chain 

857 self._doomed_by = tuple(doomed_by) 

858 

859 __slots__ = ("_chain",) 

860 

861 def __iter__(self) -> Iterator[DatasetRef]: 

862 return itertools.chain.from_iterable(self._chain) 

863 

864 def __repr__(self) -> str: 

865 return "<DatasetRef iterator for multiple dataset types>" 

866 

867 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

868 # Docstring inherited from DatasetQueryResults. 

869 return iter(self._chain) 

870 

871 @contextmanager 

872 def materialize(self) -> Iterator[ChainedDatasetQueryResults]: 

873 # Docstring inherited from DatasetQueryResults. 

874 with ExitStack() as stack: 

875 yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain]) 

876 

877 def expanded(self) -> ChainedDatasetQueryResults: 

878 # Docstring inherited from DatasetQueryResults. 

879 return ChainedDatasetQueryResults([r.expanded() for r in self._chain]) 

880 

881 def count(self, *, exact: bool = True) -> int: 

882 # Docstring inherited. 

883 return sum(r.count(exact=exact) for r in self._chain) 

884 

885 def any( 

886 self, 

887 *, 

888 execute: bool = True, 

889 exact: bool = True, 

890 ) -> bool: 

891 # Docstring inherited. 

892 return any(r.any(execute=execute, exact=exact) for r in self._chain) 

893 

894 def explain_no_results(self) -> Iterator[str]: 

895 # Docstring inherited. 

896 for r in self._chain: 

897 yield from r.explain_no_results() 

898 yield from self._doomed_by 

899 

900 

901class DimensionRecordQueryResults(Iterable[DimensionRecord]): 

902 """An interface for objects that represent the results of queries for 

903 dimension records. 

904 """ 

905 

906 @abstractmethod 

907 def count(self, *, exact: bool = True) -> int: 

908 """Count the number of rows this query would return. 

909 

910 Parameters 

911 ---------- 

912 exact : `bool`, optional 

913 If `True`, run the full query and perform post-query filtering if 

914 needed to account for that filtering in the count. If `False`, the 

915 result may be an upper bound. 

916 

917 Returns 

918 ------- 

919 count : `int` 

920 The number of rows the query would return, or an upper bound if 

921 ``exact=False``. 

922 

923 Notes 

924 ----- 

925 This counts the number of rows returned, not the number of unique rows 

926 returned, so even with ``exact=True`` it may provide only an upper 

927 bound on the number of *deduplicated* result rows. 

928 """ 

929 raise NotImplementedError() 

930 

931 @abstractmethod 

932 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

933 """Test whether this query returns any results. 

934 

935 Parameters 

936 ---------- 

937 execute : `bool`, optional 

938 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

939 determined prior to execution that the query would return no rows. 

940 exact : `bool`, optional 

941 If `True`, run the full query and perform post-query filtering if 

942 needed, until at least one result row is found. If `False`, the 

943 returned result does not account for post-query filtering, and 

944 hence may be `True` even when all result rows would be filtered 

945 out. 

946 

947 Returns 

948 ------- 

949 any : `bool` 

950 `True` if the query would (or might, depending on arguments) yield 

951 result rows. `False` if it definitely would not. 

952 """ 

953 raise NotImplementedError() 

954 

955 @abstractmethod 

956 def order_by(self, *args: str) -> DimensionRecordQueryResults: 

957 """Make the iterator return ordered result. 

958 

959 Parameters 

960 ---------- 

961 *args : `str` 

962 Names of the columns/dimensions to use for ordering. Column name 

963 can be prefixed with minus (``-``) to use descending ordering. 

964 

965 Returns 

966 ------- 

967 result : `DimensionRecordQueryResults` 

968 Returns ``self`` instance which is updated to return ordered 

969 result. 

970 

971 Notes 

972 ----- 

973 This method can modify the iterator in place and return the same 

974 instance. 

975 """ 

976 raise NotImplementedError() 

977 

978 @abstractmethod 

979 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults: 

980 """Make the iterator return limited number of records. 

981 

982 Parameters 

983 ---------- 

984 limit : `int` 

985 Upper limit on the number of returned records. 

986 offset : `int` or `None` 

987 If not `None` then the number of records to skip before returning 

988 ``limit`` records. 

989 

990 Returns 

991 ------- 

992 result : `DimensionRecordQueryResults` 

993 Returns ``self`` instance which is updated to return limited set 

994 of records. 

995 

996 Notes 

997 ----- 

998 This method can modify the iterator in place and return the same 

999 instance. Normally this method is used together with `order_by` 

1000 method. 

1001 """ 

1002 raise NotImplementedError() 

1003 

1004 @abstractmethod 

1005 def explain_no_results(self) -> Iterator[str]: 

1006 """Return human-readable messages that may help explain why the query 

1007 yields no results. 

1008 

1009 Returns 

1010 ------- 

1011 messages : `Iterator` [ `str` ] 

1012 String messages that describe reasons the query might not yield any 

1013 results. 

1014 

1015 Notes 

1016 ----- 

1017 Messages related to post-query filtering are only available if the 

1018 iterator has been exhausted, or if `any` or `count` was already called 

1019 (with ``exact=True`` for the latter two). 

1020 

1021 This method first yields messages that are generated while the query is 

1022 being built or filtered, but may then proceed to diagnostics generated 

1023 by performing what should be inexpensive follow-up queries. Callers 

1024 can short-circuit this at any time by simply not iterating further. 

1025 """ 

1026 raise NotImplementedError() 

1027 

1028 

1029class _DimensionRecordKey: 

1030 """Class for objects used as keys in ordering `DimensionRecord` instances. 

1031 

1032 Parameters 

1033 ---------- 

1034 attributes : `Sequence` [ `str` ] 

1035 Sequence of attribute names to use for comparison. 

1036 ordering : `Sequence` [ `bool` ] 

1037 Matching sequence of ordering flags, `False` for descending ordering, 

1038 `True` for ascending ordering. 

1039 record : `DimensionRecord` 

1040 `DimensionRecord` to compare to other records. 
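
    For example, a sorting sketch (``records`` and the attribute name ``"id"``
    are assumptions for illustration)::

        ordered = sorted(records, key=lambda r: _DimensionRecordKey(["id"], [True], r))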

1041 """ 

1042 

1043 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord): 

1044 self.attributes = attributes 

1045 self.ordering = ordering 

1046 self.rec = record 

1047 

1048 def _cmp(self, other: _DimensionRecordKey) -> int: 

1049 """Compare two records using provided comparison operator. 

1050 

1051 Parameters 

1052 ---------- 

1053 other : `_DimensionRecordKey` 

1054 Key for other record. 

1055 

1056 Returns 

1057 ------- 

1058 result : `int` 

1059 0 if keys are identical, negative if ``self`` is ordered before 

1060 ``other``, positive otherwise. 

1061 """ 

1062 for attribute, ordering in zip(self.attributes, self.ordering): 

1063 # timespan.begin/end cannot use getattr 

1064 attrgetter = operator.attrgetter(attribute) 

1065 lhs = attrgetter(self.rec) 

1066 rhs = attrgetter(other.rec) 

1067 if not ordering: 

1068 lhs, rhs = rhs, lhs 

1069 if lhs != rhs: 

1070 return 1 if lhs > rhs else -1 

1071 return 0 

1072 

1073 def __lt__(self, other: _DimensionRecordKey) -> bool: 

1074 return self._cmp(other) < 0 

1075 

1076 def __gt__(self, other: _DimensionRecordKey) -> bool: 

1077 return self._cmp(other) > 0 

1078 

1079 def __eq__(self, other: Any) -> bool: 

1080 if not isinstance(other, _DimensionRecordKey): 

1081 return NotImplemented 

1082 return self._cmp(other) == 0 

1083 

1084 def __le__(self, other: _DimensionRecordKey) -> bool: 

1085 return self._cmp(other) <= 0 

1086 

1087 def __ge__(self, other: _DimensionRecordKey) -> bool: 

1088 return self._cmp(other) >= 0 

1089 

1090 

1091class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults): 

1092 """Implementation of DimensionRecordQueryResults using database query. 

1093 

1094 Parameters 

1095 ---------- 

1096 dataIds : `DataCoordinateQueryResults` 

1097 Iterator for DataIds. 

1098 recordStorage : `DimensionRecordStorage` 

1099 Instance of storage class for dimension records. 
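
    For example (sketch; both arguments normally come from `Registry`
    internals rather than user code, and the column name ``"id"`` is
    hypothetical)::

        results = DatabaseDimensionRecordQueryResults(dataIds, recordStorage)
        for record in results.order_by("id").limit(10):
            ...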

1100 """ 

1101 

1102 def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage): 

1103 self._dataIds = dataIds 

1104 self._recordStorage = recordStorage 

1105 self._order_by: Iterable[str] = () 

1106 

1107 def __iter__(self) -> Iterator[DimensionRecord]: 

1108 # LIMIT is already applied at DataCoordinateQueryResults level 

1109 # (assumption here is that if DataId exists then dimension record 

1110 # exists too and their counts must be equal). fetch() does not 

1111 # guarantee ordering, so we need to sort records in memory below. 

1112 recordIter = self._recordStorage.fetch(self._dataIds) 

1113 if not self._order_by: 

1114 return iter(recordIter) 

1115 

1116 # Parse list of column names and build a list of attribute name for 

1117 # ordering. Note that here we only support ordering by direct 

1118 # attributes of the element, and not other elements from the dimension 

1119 # graph. 

1120 orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element) 

1121 attributes: List[str] = [] 

1122 ordering: List[bool] = [] 

1123 for column in orderBy.order_by_columns: 

1124 if column.column is None: 

1125 assert isinstance(column.element, Dimension), "Element must be a Dimension" 

1126 attributes.append(column.element.primaryKey.name) 

1127 else: 

1128 attributes.append(column.column) 

1129 ordering.append(column.ordering) 

1130 

1131 def _key(record: DimensionRecord) -> _DimensionRecordKey: 

1132 return _DimensionRecordKey(attributes, ordering, record) 

1133 

1134 records = sorted(recordIter, key=_key) 

1135 return iter(records) 

1136 

1137 def count(self, *, exact: bool = True) -> int: 

1138 # Docstring inherited from base class. 

1139 return self._dataIds.count(exact=exact) 

1140 

1141 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

1142 # Docstring inherited from base class. 

1143 return self._dataIds.any(execute=execute, exact=exact) 

1144 

1145 def order_by(self, *args: str) -> DimensionRecordQueryResults: 

1146 # Docstring inherited from base class. 

1147 self._dataIds = self._dataIds.order_by(*args) 

1148 self._order_by = args 

1149 return self 

1150 

1151 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults: 

1152 # Docstring inherited from base class. 

1153 self._dataIds = self._dataIds.limit(limit, offset) 

1154 return self 

1155 

1156 def explain_no_results(self) -> Iterator[str]: 

1157 # Docstring inherited. 

1158 return self._dataIds.explain_no_results()