Coverage for python/lsst/daf/butler/registry/queries/_results.py: 35%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
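
A conforming factory accepts the optional ``order_by`` and ``limit``
arguments and returns a `Query`; for example (a hypothetical sketch)::

    def make_query(
        order_by: Optional[Iterable[str]],
        limit: Optional[Tuple[int, Optional[int]]],
    ) -> Query:
        ...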

"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method that creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this class does nothing; the query is not
    executed until it is iterated over (or some other operation that
    involves iteration is performed).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
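
    For example, with a hypothetical ``registry`` and illustrative dimension
    names::

        dataIds = registry.queryDataIds(["exposure", "detector"])
        for dataId in dataIds:  # the query executes during iteration
            ...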

    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of
            dimension elements). The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A new results object backed by ``query``.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            # The pre-built query is returned as-is; the order_by and limit
            # arguments are assumed to already be reflected in it.
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``. If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a
        single call to `expanded`), it may be much more efficient to start
        by materializing the query and only then performing the follow-up
        queries. It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table. See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from
            # Query to MaterializedQuery, so order_by and limit are not used
            # here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)
        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have
            records as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
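
        For example, with illustrative dataset type and collection names
        (``dataIds`` is an instance of this class)::

            refs = dataIds.findDatasets(
                "calexp",
                collections=["HSC/runs/RC2"],
                findFirst=True,
            )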

        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: move component handling down into the managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have dimensions that are a subset of "
                f"those of the DataCoordinateQueryResults used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
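
        For example, a cheap upper bound followed by an exact count
        (``dataIds`` is an instance of this class)::

            upper_bound = dataIds.count(exact=False)
            exact_count = dataIds.count()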

        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
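
        For example, a quick emptiness check that skips post-query filtering
        (``dataIds`` is an instance of this class)::

            if not dataIds.any(exact=False):
                ...  # definitely empty, even before any filtering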

        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
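
        For example (``dataIds`` is an instance of this class)::

            if not dataIds.any():
                for message in dataIds.explain_no_results():
                    print(message)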

        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns ordered records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place; the new object supports method chaining.
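
        For example, with illustrative column names::

            for dataId in dataIds.order_by("instrument", "-visit"):
                ...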

        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns a limited set of records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place; the new object supports method chaining. It is
        normally used together with `order_by`.
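
        For example, the second page of 100 ordered results (the column name
        is illustrative)::

            page2 = dataIds.order_by("visit").limit(100, offset=100)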

        """
        return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
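
        For example, with a hypothetical ``results`` instance::

            for parent_results in results.byParentDatasetType():
                print(parent_results.parentDatasetType.name)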

        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a results object
            backed by that table), and dropped in ``__exit__``. If ``self``
            is already materialized, the context manager may do nothing
            (reflecting the fact that an outer context manager should
            already take care of everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for all data IDs in
        returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it
            is already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up
        queries. Callers can short-circuit this at any time by simply not
        iterating further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping: the outer keys are `str` names of
        dimension elements, the inner keys are data-ID tuples
        (``tuple(record.dataId.values())``, where ``record`` is the
        corresponding `DimensionRecord` instance), and the inner values are
        `DimensionRecord` instances.
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert (
            datasetType.dimensions == query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_components", "_records", "_datasetType")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets
        but different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object for the same parent datasets and the given
            components.
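
        For example, to iterate over a hypothetical ``wcs`` component
        alongside the parent datasets::

            both = results.withComponents(["wcs", None])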

        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed to account for that filtering in the count. If
            `False`, the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering
            if needed, until at least one result row is found. If `False`,
            the returned result does not account for post-query filtering,
            and hence may be `True` even when all result rows would be
            filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object that returns ordered records; this may be
            ``self``, updated in place.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object that returns a limited set of records; this may
            be ``self``, updated in place.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. Normally this method is used together with the `order_by`
        method.
        """
        raise NotImplementedError()


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for the data IDs whose records are to be fetched.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its
        # dimension record exists too, so their counts must be equal). We
        # still need to make sure that ordering is applied to the dimension
        # records as well.
        if not self._order_by:
            return iter(self._recordStorage.fetch(self._dataIds))
        else:
            # The fetch() method does not support ordering; for now do it
            # the hard way, by fetching everything into memory and ordering
            # by data ID.
            dataIds = self._dataIds.toSequence()
            rec_map = {}
            for rec in self._recordStorage.fetch(dataIds):
                rec_map[rec.dataId] = rec
            # TODO: Do we want to clean up data IDs that may be missing
            # from rec_map?
            return iter(rec_map[dataId] for dataId in dataIds)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self