Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33% (273 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit for the number of returned records and optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield. If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
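
    For example (a sketch only; the dimension names and the ``registry``
    object are assumptions, not part of this module)::

        results = registry.queryDataIds(["visit", "detector"])
        for data_id in results:  # the query only executes here, on iteration
            ...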

    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit for the number of returned records and optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield. If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`. The outer mapping has `str` keys (the names of dimension
            elements). The inner mapping has `tuple` keys representing data
            IDs (tuple conversions of `DataCoordinate.values()`) and
            `DimensionRecord` values.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.
        It may also be less efficient, depending on how well the database
        engine's query optimizer can simplify those particular follow-up
        queries and how efficiently it caches query results even when they
        are not explicitly inserted into a temporary table. See `expanded`
        and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records. For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object. If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs. This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(graph=dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(graph=dimensions2, unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in). If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
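
        Notes
        -----
        For example (a sketch; the dataset type and collection names are
        hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            refs = dataIds.findDatasets("calexp", collections=["HSC/runs/RC2"])
            for ref in refs:
                ...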

        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: move component handling down into managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type's dimensions be a subset of "
                f"those of the DataCoordinateQueryResults used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
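
        For example (a sketch of the intended usage pattern)::

            n_max = results.count(exact=False)  # cheap; may overcount
            n = results.count()  # exact, but may execute the full query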

        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
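
        For example, a cheap emptiness check before doing expensive work (a
        sketch)::

            if not results.any(exact=False):
                return  # definitely no rows; skip the expensive path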

        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
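
        For example (a sketch)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)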

        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object whose iteration yields rows in the requested
            order.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, so it can be chained with other methods, normally
        together with `limit`.
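
        For example (hypothetical dimension names)::

            for data_id in results.order_by("visit", "-detector"):
                ...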

        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns a limited set of records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, so it can be chained with other methods. It is
        normally used together with `order_by`.
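
        For example, fetching the second page of 100 ordered rows (a
        sketch)::

            page = results.order_by("visit").limit(100, offset=100)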

        """
        return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
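
        For example (a sketch)::

            for parent_results in results.byParentDatasetType():
                print(parent_results.parentDatasetType.name)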

        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results. ``query.datasetType``
        will be the parent dataset type for this object, and may not be
        `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration. `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield. If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs. This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query. If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert (
            datasetType.dimensions == query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
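
        For example, iterating over just two components of each parent
        dataset (the component names here are hypothetical)::

            for ref in refs.withComponents(["wcs", "psf"]):
                ...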

        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed. Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return ordered records; this may be
            the same instance (``self``).

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return a limited set of records; this
            may be the same instance (``self``).

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. Normally this method is used together with the `order_by`
        method.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when ordering `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending ordering,
        `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
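
    For example, sorting records by ascending ``day_obs`` and then descending
    ``seq_num`` (hypothetical attribute names)::

        records.sort(
            key=lambda r: _DimensionRecordKey(("day_obs", "seq_num"), (True, False), r)
        )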

    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare two records on the configured attributes.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys are identical, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Dotted attribute paths like "timespan.begin" cannot be fetched
            # with plain getattr, so use operator.attrgetter instead.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for data IDs.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
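
    For example (a sketch; instances are normally obtained from
    `Registry.queryDimensionRecords` rather than constructed directly)::

        records = registry.queryDimensionRecords("detector")
        for record in records.order_by("detector"):
            ...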

    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a DataId exists then its dimension
        # record exists too, so their counts must be equal). fetch() does not
        # guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering. Note that here we only support ordering by direct
        # attributes of the element, not by other elements from the dimension
        # graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: List[str] = []
        ordering: List[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self