Coverage for python/lsst/daf/butler/registry/queries/_results.py: 34%


278 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

import itertools
import operator
from abc import abstractmethod
from contextlib import ExitStack, contextmanager
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import ElementOrderByClause, QuerySummary

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit on the number of returned records, with an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(
        cls,
        db: Database,
        query: Query,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit on the number of returned records, with an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`.  The outer mapping has `str` keys (the names of
            dimension elements).  The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.
        """

        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes."""
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)
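
    # A minimal usage sketch (hypothetical ``registry``; in practice these
    # objects come from ``Registry.queryDataIds``):
    #
    #     with registry.queryDataIds(["tract", "patch"]).materialize() as ids:
    #         # Follow-up queries now run against the temporary table instead
    #         # of repeating the original query as a subquery.
    #         for dataId in ids.expanded():
    #             ...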

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is
            False``.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # Component handling below should eventually move down into the
        # managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have dimensions that are a subset of "
                f"those of the DataCoordinateQueryResults used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )
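
    # A minimal usage sketch (hypothetical ``registry`` and collection name;
    # "raw" is assumed to be a dataset type whose dimensions are a subset of
    # the query's):
    #
    #     data_ids = registry.queryDataIds(["exposure", "detector"])
    #     for ref in data_ids.findDatasets("raw", collections=["DECam/raw/all"]):
    #         ...  # each ``ref`` is a DatasetRef for the "raw" dataset type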

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)
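
    # A minimal diagnostic sketch combining ``any``, ``count``, and
    # ``explain_no_results`` (hypothetical ``results`` object):
    #
    #     if not results.any(exact=True):
    #         for message in results.explain_no_results():
    #             print(message)
    #     else:
    #         print(f"query would return up to {results.count(exact=False)} rows")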

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns ordered results.

        Notes
        -----
        This method returns a new instance rather than modifying ``self``,
        which supports method chaining.
        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object that returns a limited set of records.

        Notes
        -----
        This method returns a new instance rather than modifying ``self``,
        which supports method chaining.  It is normally used together with
        `order_by`.
        """
        return self._clone(limit=(limit, offset))
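
    # A minimal chaining sketch (hypothetical ``registry``): the ten
    # highest-numbered visits, via descending ordering plus a limit:
    #
    #     for data_id in registry.queryDataIds("visit").order_by("-visit").limit(10):
    #         ...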


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which
        `DataCoordinate.hasRecords` returns `True` for all data IDs in
        returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert (
            datasetType.dimensions == query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(
                self._db, materialized, components=self._components, records=self._records
            )

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )
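
    # A minimal component sketch (hypothetical ``results`` object and
    # component names): iterate over two components of each parent dataset
    # instead of the parent itself:
    #
    #     for ref in results.withComponents(["image", "mask"]):
    #         ...  # yields two component DatasetRefs per parent dataset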

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain])

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain])

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return ordered results; this may be
            ``self``.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object updated to return a limited set of records;
            this may be ``self``.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.  It is normally used together with `order_by`.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class _DimensionRecordKey:
    """Class for objects used as keys when sorting `DimensionRecord`
    instances.

    Parameters
    ----------
    attributes : `Sequence` [ `str` ]
        Sequence of attribute names to use for comparison.
    ordering : `Sequence` [ `bool` ]
        Matching sequence of ordering flags, `False` for descending ordering,
        `True` for ascending ordering.
    record : `DimensionRecord`
        `DimensionRecord` to compare to other records.
    """

    def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord):
        self.attributes = attributes
        self.ordering = ordering
        self.rec = record

    def _cmp(self, other: _DimensionRecordKey) -> int:
        """Compare this key's record to another key's record, attribute by
        attribute.

        Parameters
        ----------
        other : `_DimensionRecordKey`
            Key for the other record.

        Returns
        -------
        result : `int`
            0 if the keys are identical, negative if ``self`` is ordered
            before ``other``, positive otherwise.
        """
        for attribute, ordering in zip(self.attributes, self.ordering):
            # Dotted names like timespan.begin/end cannot use plain getattr,
            # so use operator.attrgetter instead.
            attrgetter = operator.attrgetter(attribute)
            lhs = attrgetter(self.rec)
            rhs = attrgetter(other.rec)
            if not ordering:
                lhs, rhs = rhs, lhs
            if lhs != rhs:
                return 1 if lhs > rhs else -1
        return 0

    def __lt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) < 0

    def __gt__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) > 0

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, _DimensionRecordKey):
            return NotImplemented
        return self._cmp(other) == 0

    def __le__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) <= 0

    def __ge__(self, other: _DimensionRecordKey) -> bool:
        return self._cmp(other) >= 0
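
# A minimal sorting sketch using this key class directly (hypothetical
# ``records`` iterable of DimensionRecord instances with ``instrument`` and
# ``id`` attributes): ascending instrument, then descending id.
#
#     ordered = sorted(
#         records,
#         key=lambda rec: _DimensionRecordKey(["instrument", "id"], [True, False], rec),
#     )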



class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` backed by a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterable of data IDs whose records are to be fetched.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a data ID exists then its dimension
        # record exists too, so their counts must be equal).  fetch() does
        # not guarantee ordering, so we need to sort records in memory below.
        recordIter = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(recordIter)

        # Parse the list of column names and build a list of attribute names
        # for ordering.  Note that here we only support ordering by direct
        # attributes of the element, and not by other elements from the
        # dimension graph.
        orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attributes: List[str] = []
        ordering: List[bool] = []
        for column in orderBy.order_by_columns:
            if column.column is None:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attributes.append(column.element.primaryKey.name)
            else:
                attributes.append(column.column)
            ordering.append(column.ordering)

        def _key(record: DimensionRecord) -> _DimensionRecordKey:
            return _DimensionRecordKey(attributes, ordering, record)

        records = sorted(recordIter, key=_key)
        return iter(records)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()
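
# A minimal usage sketch via `Registry.queryDimensionRecords` (hypothetical
# ``registry``): detector records ordered by ascending id, first five only:
#
#     records = registry.queryDimensionRecords("detector")
#     for record in records.order_by("id").limit(5):
#         ...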