Coverage for python/lsst/daf/butler/registry/queries/_results.py: 30%

282 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-24 23:50 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "ChainedDatasetQueryResults", 

25 "DatabaseDimensionRecordQueryResults", 

26 "DataCoordinateQueryResults", 

27 "DatasetQueryResults", 

28 "DimensionRecordQueryResults", 

29 "ParentDatasetQueryResults", 

30) 

31 

32import itertools 

33import operator 

34from abc import abstractmethod 

35from contextlib import ExitStack, contextmanager 

36from typing import ( 

37 Any, 

38 Callable, 

39 ContextManager, 

40 Iterable, 

41 Iterator, 

42 List, 

43 Mapping, 

44 Optional, 

45 Sequence, 

46 Tuple, 

47 Union, 

48) 

49 

50import sqlalchemy 

51 

52from ...core import ( 

53 DataCoordinate, 

54 DataCoordinateIterable, 

55 DatasetRef, 

56 DatasetType, 

57 Dimension, 

58 DimensionGraph, 

59 DimensionRecord, 

60 SimpleQuery, 

61) 

62from ..interfaces import Database, DimensionRecordStorage 

63from ._query import Query 

64from ._structs import ElementOrderByClause, QuerySummary 

65 

QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.

The callable receives an optional iterable of ORDER BY column names and an
optional ``(limit, offset)`` tuple, and returns a `Query`.
"""

69 

70 

class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method which creates an instance of `Query` class.
    graph : `DimensionGraph`
        Dimensions used by query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit for the number of returned records and optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """

    def __init__(
        self,
        db: Database,
        query_factory: QueryFactoryMethod,
        graph: DimensionGraph,
        *,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        # Lazily-built Query; constructed on first access to `_query`.
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

124 

125 @classmethod 

126 def from_query( 

127 cls, 

128 db: Database, 

129 query: Query, 

130 graph: DimensionGraph, 

131 *, 

132 order_by: Optional[Iterable[str]] = None, 

133 limit: Optional[Tuple[int, Optional[int]]] = None, 

134 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

135 ) -> DataCoordinateQueryResults: 

136 """Make an instance from a pre-existing query instead of a factory. 

137 

138 Parameters 

139 ---------- 

140 db : `Database` 

141 Database engine used to execute queries. 

142 query : `Query` 

143 Low-level representation of the query that backs this result 

144 object. 

145 graph : `DimensionGraph` 

146 Dimensions used by query. 

147 order_by : `Iterable` [ `str` ], optional 

148 Optional sequence of column names used for result ordering. 

149 limit : `Tuple` [ `int`, `int` ], optional 

150 Limit for the number of returned records and optional offset. 

151 records : `Mapping`, optional 

152 A nested mapping containing `DimensionRecord` objects for all 

153 dimensions and all data IDs this query will yield. If `None` 

154 (default), `DataCoordinateIterable.hasRecords` will return `False`. 

155 The outer mapping has `str` keys (the names of dimension elements). 

156 The inner mapping has `tuple` keys representing data IDs (tuple 

157 conversions of `DataCoordinate.values()`) and `DimensionRecord` 

158 values. 

159 """ 

160 

161 def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query: 

162 return query 

163 

164 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records) 

165 

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Execute the query lazily; each returned row is converted into a
        # DataCoordinate, attaching dimension records when available.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

168 

169 def __repr__(self) -> str: 

170 return f"<DataCoordinate iterator with dimensions={self._graph}>" 

171 

    def _clone(
        self,
        *,
        query_factory: Optional[QueryFactoryMethod] = None,
        query: Optional[Query] = None,
        graph: Optional[DimensionGraph] = None,
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinateQueryResults:
        """Clone this instance potentially updating some attributes.

        Any argument left as `None` keeps the corresponding attribute of
        ``self``.  If ``query`` is given it takes precedence over
        ``query_factory``.
        """
        # Fall back to this instance's values for anything not overridden.
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(
                self._db, query_factory, graph, order_by=order_by, limit=limit, records=records
            )
        else:
            # A concrete query is wrapped via from_query, whose factory
            # ignores order_by/limit (they are assumed baked into the query).
            return DataCoordinateQueryResults.from_query(
                self._db, query, graph, order_by=order_by, limit=limit, records=records
            )

196 

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            # Build the query on first use, applying the ordering and limit
            # configured on this results object.
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert (
                self._cached_query.datasetType is None
            ), "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

206 

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

211 

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # Query results always carry full data IDs.
        return True

215 

216 def hasRecords(self) -> bool: 

217 # Docstring inherited from DataCoordinateIterable. 

218 return self._records is not None or not self._graph 

219 

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well database engine's
        query optimizer can simplify those particular follow-up queries and
        how efficiently it caches query results even when they are not
        explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns to be passed from Query
            # to MaterializedQuery, so order_by and limit are not used.
            yield self._clone(query=materialized)

250 

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                # Fetch records for each element only for the (unique) data
                # IDs this query yields.
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

287 

    def subset(
        self, graph: Optional[DimensionGraph] = None, *, unique: bool = False
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``graph`` is not a subset of the dimension graph in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        # Nothing to do if the dimensions are unchanged and either uniqueness
        # was not requested or the underlying query is already unique.
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Keep only the records for elements still present in the
            # narrowed graph.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

357 

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            # Join the caller's query against this query's SQL (as a
            # subquery aliased "c"), equating each required dimension column.
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(
                    *[
                        columns(dimension.name) == fromClause.columns[dimension.name]
                        for dimension in self.graph.required
                    ]
                ),
            )

372 

    def findDatasets(
        self, datasetType: Union[DatasetType, str], collections: Any, *, findFirst: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `DatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        """
        if not isinstance(datasetType, DatasetType):
            # Given a name: look up the registered dataset type.  An unknown
            # name dooms the query rather than raising.
            storage = self._query.managers.datasets.find(datasetType)
            if storage is None:
                return ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type {datasetType!r} found, so no instances can "
                        "exist in any collection."
                    ],
                )
            else:
                datasetType = storage.datasetType
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(
                f"findDatasets requires that the dataset type have only dimensions in "
                f"the DataCoordinateQueryResult used as input to the search, but "
                f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                f"dimensions are {self.graph}."
            )
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        # Build a new query that joins in the dataset search.
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(
            db=self._db, query=query, components=components, records=self._records, datasetType=datasetType
        )

442 

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        return self._query.count(self._db, exact=exact)

466 

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        return self._query.any(self._db, execute=execute, exact=exact)

494 

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        return self._query.explain_no_results(self._db)

517 

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered result.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object configured to return ordered results.

        Notes
        -----
        This method returns a new instance (``self`` is not modified), which
        supports method chaining.
        """
        return self._clone(order_by=args)

539 

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None` then the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A new results object configured to return a limited set of
            records.

        Notes
        -----
        This method returns a new instance (``self`` is not modified), which
        supports method chaining.  Normally this method is used together with
        the `order_by` method.
        """
        return self._clone(limit=(limit, offset))

564 

565 

class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already called
        (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query is
        being built or filtered, but may then proceed to diagnostics generated
        by performing what should be inexpensive follow-up queries.  Callers
        can short-circuit this at any time by simply not iterating further.
        """
        raise NotImplementedError()

696 

697 

class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """

    def __init__(
        self,
        db: Database,
        query: Query,
        *,
        components: Sequence[Optional[str]],
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
        datasetType: Optional[DatasetType] = None,
    ):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
            assert datasetType is not None, "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions.issubset(
            query.graph
        ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    # Fixed: the original tuple listed "_dimensions" (never assigned anywhere
    # in this class) and omitted "_datasetType" (assigned in __init__).  That
    # only worked because the base class defines no __slots__, so instances
    # still get a __dict__; the declaration now matches the attributes
    # actually used.
    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

749 

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            # One parent ref per row, expanded to one ref per requested
            # component (None means the parent dataset type itself).
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

758 

759 def __repr__(self) -> str: 

760 return f"<DatasetRef iterator for [components of] {self._datasetType.name}>" 

761 

762 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

763 # Docstring inherited from DatasetQueryResults. 

764 yield self 

765 

766 @contextmanager 

767 def materialize(self) -> Iterator[ParentDatasetQueryResults]: 

768 # Docstring inherited from DatasetQueryResults. 

769 with self._query.materialize(self._db) as materialized: 

770 yield ParentDatasetQueryResults( 

771 self._db, materialized, components=self._components, records=self._records 

772 ) 

773 

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

780 

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        # Strip the dataset join from the query, keeping only the parent
        # dataset type's dimensions.
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

797 

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A results object backed by the same query and records, iterating
            over the requested components instead.
        """
        return ParentDatasetQueryResults(
            self._db, self._query, records=self._records, components=components, datasetType=self._datasetType
        )

809 

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            # Delegate the record-fetching to the data-ID results object,
            # then reuse its record mapping here.
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(
                self._db,
                self._query,
                records=records,
                components=self._components,
                datasetType=self._datasetType,
            )
        else:
            return self

823 

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        # Each query row yields one ref per requested component.
        return len(self._components) * self._query.count(self._db, exact=exact)

827 

    def any(
        self,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

836 

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)

840 

841 

class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    # Fixed: "_doomed_by" was missing from __slots__ even though __init__
    # assigns it; the omission was masked only because the base class defines
    # no __slots__, leaving instances with a __dict__.
    __slots__ = ("_chain", "_doomed_by")

862 

863 def __iter__(self) -> Iterator[DatasetRef]: 

864 return itertools.chain.from_iterable(self._chain) 

865 

866 def __repr__(self) -> str: 

867 return "<DatasetRef iterator for multiple dataset types>" 

868 

869 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

870 # Docstring inherited from DatasetQueryResults. 

871 return iter(self._chain) 

872 

873 @contextmanager 

874 def materialize(self) -> Iterator[ChainedDatasetQueryResults]: 

875 # Docstring inherited from DatasetQueryResults. 

876 with ExitStack() as stack: 

877 yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain]) 

878 

879 def expanded(self) -> ChainedDatasetQueryResults: 

880 # Docstring inherited from DatasetQueryResults. 

881 return ChainedDatasetQueryResults([r.expanded() for r in self._chain]) 

882 

883 def count(self, *, exact: bool = True) -> int: 

884 # Docstring inherited. 

885 return sum(r.count(exact=exact) for r in self._chain) 

886 

887 def any( 

888 self, 

889 *, 

890 execute: bool = True, 

891 exact: bool = True, 

892 ) -> bool: 

893 # Docstring inherited. 

894 return any(r.any(execute=execute, exact=exact) for r in self._chain) 

895 

896 def explain_no_results(self) -> Iterator[str]: 

897 # Docstring inherited. 

898 for r in self._chain: 

899 yield from r.explain_no_results() 

900 yield from self._doomed_by 

901 

902 

class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """Interface for objects representing the results of dimension-record
    queries.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  With `False`,
            the result may only be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound when
            ``exact=False``.

        Notes
        -----
        The count reflects rows returned, not unique rows, so even with
        ``exact=True`` it is only an upper bound on the number of
        *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query whenever it cannot
            be determined up front that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and apply post-query filtering if
            needed, until at least one result row is found.  With `False` the
            answer ignores post-query filtering and may therefore be `True`
            even when every result row would be filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows; `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions used for ordering.  Prefix a
            column name with minus (``-``) for descending order.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return ordered results.

        Notes
        -----
        Implementations may modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            ``self``, updated to return a limited set of records.

        Notes
        -----
        Implementations may modify the iterator in place and return the same
        instance.  Typically used together with `order_by`.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages describing reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available once the
        iterator has been exhausted, or after `any` or `count` was called
        (with ``exact=True`` for those two).

        Messages generated while the query was being built or filtered are
        yielded first; further diagnostics may then be produced by running
        what should be inexpensive follow-up queries.  Callers can
        short-circuit this at any time by simply not iterating further.
        """
        raise NotImplementedError()

1029 

1030 

1031class _DimensionRecordKey: 

1032 """Class for objects used as keys in ordering `DimensionRecord` instances. 

1033 

1034 Parameters 

1035 ---------- 

1036 attributes : `Sequence` [ `str` ] 

1037 Sequence of attribute names to use for comparison. 

1038 ordering : `Sequence` [ `bool` ] 

1039 Matching sequence of ordering flags, `False` for descending ordering, 

1040 `True` for ascending ordering. 

1041 record : `DimensionRecord` 

1042 `DimensionRecord` to compare to other records. 

1043 """ 

1044 

1045 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord): 

1046 self.attributes = attributes 

1047 self.ordering = ordering 

1048 self.rec = record 

1049 

1050 def _cmp(self, other: _DimensionRecordKey) -> int: 

1051 """Compare two records using provided comparison operator. 

1052 

1053 Parameters 

1054 ---------- 

1055 other : `_DimensionRecordKey` 

1056 Key for other record. 

1057 

1058 Returns 

1059 ------- 

1060 result : `int` 

1061 0 if keys are identical, negative if ``self`` is ordered before 

1062 ``other``, positive otherwise. 

1063 """ 

1064 for attribute, ordering in zip(self.attributes, self.ordering): 

1065 # timespan.begin/end cannot use getattr 

1066 attrgetter = operator.attrgetter(attribute) 

1067 lhs = attrgetter(self.rec) 

1068 rhs = attrgetter(other.rec) 

1069 if not ordering: 

1070 lhs, rhs = rhs, lhs 

1071 if lhs != rhs: 

1072 return 1 if lhs > rhs else -1 

1073 return 0 

1074 

1075 def __lt__(self, other: _DimensionRecordKey) -> bool: 

1076 return self._cmp(other) < 0 

1077 

1078 def __gt__(self, other: _DimensionRecordKey) -> bool: 

1079 return self._cmp(other) > 0 

1080 

1081 def __eq__(self, other: Any) -> bool: 

1082 if not isinstance(other, _DimensionRecordKey): 

1083 return NotImplemented 

1084 return self._cmp(other) == 0 

1085 

1086 def __le__(self, other: _DimensionRecordKey) -> bool: 

1087 return self._cmp(other) <= 0 

1088 

1089 def __ge__(self, other: _DimensionRecordKey) -> bool: 

1090 return self._cmp(other) >= 0 

1091 

1092 

class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Database-backed implementation of `DimensionRecordQueryResults`.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Iterator for DataIds.
    recordStorage : `DimensionRecordStorage`
        Instance of storage class for dimension records.
    """

    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at DataCoordinateQueryResults level
        # (assumption here is that if DataId exists then dimension record
        # exists too and their counts must be equal).  fetch() does not
        # guarantee ordering, so records are sorted in memory below.
        fetched = self._recordStorage.fetch(self._dataIds)
        if not self._order_by:
            return iter(fetched)

        # Translate the ORDER BY column names into record attribute names
        # plus ascending/descending flags.  Only direct attributes of this
        # element are supported here, not other elements from the dimension
        # graph.
        clause = ElementOrderByClause(self._order_by, self._recordStorage.element)
        attr_names: List[str] = []
        directions: List[bool] = []
        for column in clause.order_by_columns:
            if column.column is not None:
                attr_names.append(column.column)
            else:
                assert isinstance(column.element, Dimension), "Element must be a Dimension"
                attr_names.append(column.element.primaryKey.name)
            directions.append(column.ordering)

        ordered = sorted(
            fetched,
            key=lambda record: _DimensionRecordKey(attr_names, directions, record),
        )
        return iter(ordered)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        # Remember the column names so __iter__ can sort in memory too.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._dataIds.explain_no_results()