Coverage for python/lsst/daf/butler/registry/queries/_results.py: 33%

281 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-09-27 08:58 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "ChainedDatasetQueryResults", 

25 "DatabaseDimensionRecordQueryResults", 

26 "DataCoordinateQueryResults", 

27 "DatasetQueryResults", 

28 "DimensionRecordQueryResults", 

29 "ParentDatasetQueryResults", 

30) 

31 

32import itertools 

33import operator 

34from abc import abstractmethod 

35from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence 

36from contextlib import AbstractContextManager, ExitStack, contextmanager 

37from typing import Any, Optional 

38 

39import sqlalchemy 

40 

41from ...core import ( 

42 DataCoordinate, 

43 DataCoordinateIterable, 

44 DatasetRef, 

45 DatasetType, 

46 Dimension, 

47 DimensionGraph, 

48 DimensionRecord, 

49 SimpleQuery, 

50) 

51from ..interfaces import Database, DimensionRecordStorage 

52from ._query import Query 

53from ._structs import ElementOrderByClause, QuerySummary 

54 

55QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[tuple[int, Optional[int]]]], Query] 

56"""Type of a query factory method type used by DataCoordinateQueryResults. 

57""" 

58 

59 

60class DataCoordinateQueryResults(DataCoordinateIterable): 

61 """An enhanced implementation of `DataCoordinateIterable` that represents 

62 data IDs retrieved from a database query. 

63 

64 Parameters 

65 ---------- 

66 db : `Database` 

67 Database engine used to execute queries. 

68 query_factory : `QueryFactoryMethod` 

69 Method which creates an instance of `Query` class. 

70 graph : `DimensionGraph` 

71 Dimensions used by query. 

72 order_by : `Iterable` [ `str` ], optional 

73 Optional sequence of column names used for result ordering. 

74 limit : `Tuple` [ `int`, `int` ], optional 

75 Limit for the number of returned records and optional offset. 

76 records : `Mapping`, optional 

77 A nested mapping containing `DimensionRecord` objects for all 

78 dimensions and all data IDs this query will yield. If `None` 

79 (default), `DataCoordinateIterable.hasRecords` will return `False`. 

80 The outer mapping has `str` keys (the names of dimension elements). 

81 The inner mapping has `tuple` keys representing data IDs (tuple 

82 conversions of `DataCoordinate.values()`) and `DimensionRecord` values. 

83 

84 Notes 

85 ----- 

86 Constructing an instance of this does nothing; the query is not executed 

87 until it is iterated over (or some other operation is performed that 

88 involves iteration). 

89 

90 Instances should generally only be constructed by `Registry` methods or the 

91 methods of other query result objects. 

92 """ 

93 

94 def __init__( 

95 self, 

96 db: Database, 

97 query_factory: QueryFactoryMethod, 

98 graph: DimensionGraph, 

99 *, 

100 order_by: Iterable[str] | None = None, 

101 limit: tuple[int, int | None] | None = None, 

102 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

103 ): 

104 self._db = db 

105 self._query_factory = query_factory 

106 self._graph = graph 

107 self._order_by = order_by 

108 self._limit = limit 

109 self._records = records 

110 self._cached_query: Query | None = None 

111 

112 __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query") 

113 

114 @classmethod 

115 def from_query( 

116 cls, 

117 db: Database, 

118 query: Query, 

119 graph: DimensionGraph, 

120 *, 

121 order_by: Iterable[str] | None = None, 

122 limit: tuple[int, int | None] | None = None, 

123 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

124 ) -> DataCoordinateQueryResults: 

125 """Make an instance from a pre-existing query instead of a factory. 

126 

127 Parameters 

128 ---------- 

129 db : `Database` 

130 Database engine used to execute queries. 

131 query : `Query` 

132 Low-level representation of the query that backs this result 

133 object. 

134 graph : `DimensionGraph` 

135 Dimensions used by query. 

136 order_by : `Iterable` [ `str` ], optional 

137 Optional sequence of column names used for result ordering. 

138 limit : `Tuple` [ `int`, `int` ], optional 

139 Limit for the number of returned records and optional offset. 

140 records : `Mapping`, optional 

141 A nested mapping containing `DimensionRecord` objects for all 

142 dimensions and all data IDs this query will yield. If `None` 

143 (default), `DataCoordinateIterable.hasRecords` will return `False`. 

144 The outer mapping has `str` keys (the names of dimension elements). 

145 The inner mapping has `tuple` keys representing data IDs (tuple 

146 conversions of `DataCoordinate.values()`) and `DimensionRecord` 

147 values. 

148 """ 

149 

150 def factory(order_by: Iterable[str] | None, limit: tuple[int, int | None] | None) -> Query: 

151 return query 

152 

153 return DataCoordinateQueryResults(db, factory, graph, order_by=order_by, limit=limit, records=records) 

154 

155 def __iter__(self) -> Iterator[DataCoordinate]: 

156 return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db)) 

157 

158 def __repr__(self) -> str: 

159 return f"<DataCoordinate iterator with dimensions={self._graph}>" 

160 

161 def _clone( 

162 self, 

163 *, 

164 query_factory: QueryFactoryMethod | None = None, 

165 query: Query | None = None, 

166 graph: DimensionGraph | None = None, 

167 order_by: Iterable[str] | None = None, 

168 limit: tuple[int, int | None] | None = None, 

169 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

170 ) -> DataCoordinateQueryResults: 

171 """Clone this instance potentially updating some attributes.""" 

172 graph = graph if graph is not None else self._graph 

173 order_by = order_by if order_by is not None else self._order_by 

174 limit = limit if limit is not None else self._limit 

175 records = records if records is not None else self._records 

176 if query is None: 

177 query_factory = query_factory or self._query_factory 

178 return DataCoordinateQueryResults( 

179 self._db, query_factory, graph, order_by=order_by, limit=limit, records=records 

180 ) 

181 else: 

182 return DataCoordinateQueryResults.from_query( 

183 self._db, query, graph, order_by=order_by, limit=limit, records=records 

184 ) 

185 

186 @property 

187 def _query(self) -> Query: 

188 """Query representation instance (`Query`)""" 

189 if self._cached_query is None: 

190 self._cached_query = self._query_factory(self._order_by, self._limit) 

191 assert ( 

192 self._cached_query.datasetType is None 

193 ), "Query used to initialize data coordinate results should not have any datasets." 

194 return self._cached_query 

195 

196 @property 

197 def graph(self) -> DimensionGraph: 

198 # Docstring inherited from DataCoordinateIterable. 

199 return self._graph 

200 

201 def hasFull(self) -> bool: 

202 # Docstring inherited from DataCoordinateIterable. 

203 return True 

204 

205 def hasRecords(self) -> bool: 

206 # Docstring inherited from DataCoordinateIterable. 

207 return self._records is not None or not self._graph 

208 

209 @contextmanager 

210 def materialize(self) -> Iterator[DataCoordinateQueryResults]: 

211 """Insert this query's results into a temporary table. 

212 

213 Returns 

214 ------- 

215 context : `typing.ContextManager` [ `DataCoordinateQueryResults` ] 

216 A context manager that ensures the temporary table is created and 

217 populated in ``__enter__`` (returning a results object backed by 

218 that table), and dropped in ``__exit__``. If ``self`` is already 

219 materialized, the context manager may do nothing (reflecting the 

220 fact that an outer context manager should already take care of 

221 everything else). 

222 

223 Notes 

224 ----- 

225 When using a very large result set to perform multiple queries (e.g. 

226 multiple calls to `subset` with different arguments, or even a single 

227 call to `expanded`), it may be much more efficient to start by 

228 materializing the query and only then performing the follow up queries. 

229 It may also be less efficient, depending on how well database engine's 

230 query optimizer can simplify those particular follow-up queries and 

231 how efficiently it caches query results even when the are not 

232 explicitly inserted into a temporary table. See `expanded` and 

233 `subset` for examples. 

234 """ 

235 with self._query.materialize(self._db) as materialized: 

236 # Note that we depend on order_by columns to be passes from Query 

237 # to MaterializedQuery, so order_by and limit are not used. 

238 yield self._clone(query=materialized) 

239 

240 def expanded(self) -> DataCoordinateQueryResults: 

241 """Return a results object for which `hasRecords` returns `True`. 

242 

243 This method may involve actually executing database queries to fetch 

244 `DimensionRecord` objects. 

245 

246 Returns 

247 ------- 

248 results : `DataCoordinateQueryResults` 

249 A results object for which `hasRecords` returns `True`. May be 

250 ``self`` if that is already the case. 

251 

252 Notes 

253 ----- 

254 For very result sets, it may be much more efficient to call 

255 `materialize` before calling `expanded`, to avoid performing the 

256 original query multiple times (as a subquery) in the follow-up queries 

257 that fetch dimension records. For example:: 

258 

259 with registry.queryDataIds(...).materialize() as tempDataIds: 

260 dataIdsWithRecords = tempDataIds.expanded() 

261 for dataId in dataIdsWithRecords: 

262 ... 

263 """ 

264 if self._records is None: 

265 records = {} 

266 for element in self.graph.elements: 

267 subset = self.subset(graph=element.graph, unique=True) 

268 records[element.name] = { 

269 tuple(record.dataId.values()): record 

270 for record in self._query.backend.managers.dimensions[element].fetch(subset) 

271 } 

272 

273 return self._clone(query=self._query, records=records) 

274 else: 

275 return self 

276 

277 def subset( 

278 self, graph: DimensionGraph | None = None, *, unique: bool = False 

279 ) -> DataCoordinateQueryResults: 

280 """Return a results object containing a subset of the dimensions of 

281 this one, and/or a unique near-subset of its rows. 

282 

283 This method may involve actually executing database queries to fetch 

284 `DimensionRecord` objects. 

285 

286 Parameters 

287 ---------- 

288 graph : `DimensionGraph`, optional 

289 Dimensions to include in the new results object. If `None`, 

290 ``self.graph`` is used. 

291 unique : `bool`, optional 

292 If `True` (`False` is default), the query should only return unique 

293 data IDs. This is implemented in the database; to obtain unique 

294 results via Python-side processing (which may be more efficient in 

295 some cases), use `toSet` to construct a `DataCoordinateSet` from 

296 this results object instead. 

297 

298 Returns 

299 ------- 

300 results : `DataCoordinateQueryResults` 

301 A results object corresponding to the given criteria. May be 

302 ``self`` if it already qualifies. 

303 

304 Raises 

305 ------ 

306 ValueError 

307 Raised when ``graph`` is not a subset of the dimension graph in 

308 this result. 

309 

310 Notes 

311 ----- 

312 This method can only return a "near-subset" of the original result rows 

313 in general because of subtleties in how spatial overlaps are 

314 implemented; see `Query.subset` for more information. 

315 

316 When calling `subset` multiple times on the same very large result set, 

317 it may be much more efficient to call `materialize` first. For 

318 example:: 

319 

320 dimensions1 = DimensionGraph(...) 

321 dimensions2 = DimensionGraph(...) 

322 with registry.queryDataIds(...).materialize() as tempDataIds: 

323 for dataId1 in tempDataIds.subset( 

324 graph=dimensions1, 

325 unique=True): 

326 ... 

327 for dataId2 in tempDataIds.subset( 

328 graph=dimensions2, 

329 unique=True): 

330 ... 

331 """ 

332 if graph is None: 

333 graph = self.graph 

334 if not graph.issubset(self.graph): 

335 raise ValueError(f"{graph} is not a subset of {self.graph}") 

336 if graph == self.graph and (not unique or self._query.isUnique()): 

337 return self 

338 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None 

339 if self._records is not None: 

340 records = {element.name: self._records[element.name] for element in graph.elements} 

341 else: 

342 records = None 

343 query = self._query.subset(graph=graph, datasets=False, unique=unique) 

344 

345 return self._clone(graph=graph, query=query, records=records) 

346 

347 def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None: 

348 # Docstring inherited from DataCoordinateIterable. 

349 sql = self._query.sql 

350 if sql is not None: 

351 fromClause = sql.alias("c") 

352 query.join( 

353 fromClause, 

354 onclause=sqlalchemy.sql.and_( 

355 *[ 

356 columns(dimension.name) == fromClause.columns[dimension.name] 

357 for dimension in self.graph.required 

358 ] 

359 ), 

360 ) 

361 

362 def findDatasets( 

363 self, datasetType: DatasetType | str, collections: Any, *, findFirst: bool = True 

364 ) -> DatasetQueryResults: 

365 """Find datasets using the data IDs identified by this query. 

366 

367 Parameters 

368 ---------- 

369 datasetType : `DatasetType` or `str` 

370 Dataset type or the name of one to search for. Must have 

371 dimensions that are a subset of ``self.graph``. 

372 collections : `Any` 

373 An expression that fully or partially identifies the collections 

374 to search for the dataset, such as a `str`, `re.Pattern`, or 

375 iterable thereof. ``...`` can be used to return all collections. 

376 See :ref:`daf_butler_collection_expressions` for more information. 

377 findFirst : `bool`, optional 

378 If `True` (default), for each result data ID, only yield one 

379 `DatasetRef`, from the first collection in which a dataset of that 

380 dataset type appears (according to the order of ``collections`` 

381 passed in). If `True`, ``collections`` must not contain regular 

382 expressions and may not be ``...``. 

383 

384 Returns 

385 ------- 

386 datasets : `DatasetQueryResults` 

387 A lazy-evaluation object representing dataset query results, 

388 iterable over `DatasetRef` objects. If ``self.hasRecords()``, all 

389 nested data IDs in those dataset references will have records as 

390 well. 

391 

392 Raises 

393 ------ 

394 ValueError 

395 Raised if ``datasetType.dimensions.issubset(self.graph) is False``. 

396 """ 

397 if not isinstance(datasetType, DatasetType): 

398 storage = self._query.backend.managers.datasets.find(datasetType) 

399 if storage is None: 

400 return ChainedDatasetQueryResults( 

401 [], 

402 doomed_by=[ 

403 f"Dataset type {datasetType!r} is not registered, so no instances of it can exist in " 

404 "any collection." 

405 ], 

406 ) 

407 else: 

408 datasetType = storage.datasetType 

409 if not datasetType.dimensions.issubset(self.graph): 

410 raise ValueError( 

411 f"findDatasets requires that the dataset type have only dimensions in " 

412 f"the DataCoordinateQueryResult used as input to the search, but " 

413 f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input " 

414 f"dimensions are {self.graph}." 

415 ) 

416 if datasetType.isComponent(): 

417 # We were given a true DatasetType instance, but it's a component. 

418 components = [datasetType.component()] 

419 datasetType = datasetType.makeCompositeDatasetType() 

420 else: 

421 components = [None] 

422 summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType]) 

423 builder = self._query.makeBuilder(summary) 

424 builder.joinDataset(datasetType, collections=collections, findFirst=findFirst) 

425 query = builder.finish(joinMissing=False) 

426 return ParentDatasetQueryResults( 

427 db=self._db, query=query, components=components, records=self._records, datasetType=datasetType 

428 ) 

429 

430 def count(self, *, exact: bool = True) -> int: 

431 """Count the number of rows this query would return. 

432 

433 Parameters 

434 ---------- 

435 exact : `bool`, optional 

436 If `True`, run the full query and perform post-query filtering if 

437 needed to account for that filtering in the count. If `False`, the 

438 result may be an upper bound. 

439 

440 Returns 

441 ------- 

442 count : `int` 

443 The number of rows the query would return, or an upper bound if 

444 ``exact=False``. 

445 

446 Notes 

447 ----- 

448 This counts the number of rows returned, not the number of unique rows 

449 returned, so even with ``exact=True`` it may provide only an upper 

450 bound on the number of *deduplicated* result rows. 

451 """ 

452 return self._query.count(self._db, exact=exact) 

453 

454 def any( 

455 self, 

456 *, 

457 execute: bool = True, 

458 exact: bool = True, 

459 ) -> bool: 

460 """Test whether this query returns any results. 

461 

462 Parameters 

463 ---------- 

464 execute : `bool`, optional 

465 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

466 determined prior to execution that the query would return no rows. 

467 exact : `bool`, optional 

468 If `True`, run the full query and perform post-query filtering if 

469 needed, until at least one result row is found. If `False`, the 

470 returned result does not account for post-query filtering, and 

471 hence may be `True` even when all result rows would be filtered 

472 out. 

473 

474 Returns 

475 ------- 

476 any : `bool` 

477 `True` if the query would (or might, depending on arguments) yield 

478 result rows. `False` if it definitely would not. 

479 """ 

480 return self._query.any(self._db, execute=execute, exact=exact) 

481 

482 def explain_no_results(self) -> Iterator[str]: 

483 """Return human-readable messages that may help explain why the query 

484 yields no results. 

485 

486 Returns 

487 ------- 

488 messages : `Iterator` [ `str` ] 

489 String messages that describe reasons the query might not yield any 

490 results. 

491 

492 Notes 

493 ----- 

494 Messages related to post-query filtering are only available if the 

495 iterator has been exhausted, or if `any` or `count` was already called 

496 (with ``exact=True`` for the latter two). 

497 

498 This method first yields messages that are generated while the query is 

499 being built or filtered, but may then proceed to diagnostics generated 

500 by performing what should be inexpensive follow-up queries. Callers 

501 can short-circuit this at any time by simplying not iterating further. 

502 """ 

503 return self._query.explain_no_results(self._db) 

504 

505 def order_by(self, *args: str) -> DataCoordinateQueryResults: 

506 """Make the iterator return ordered result. 

507 

508 Parameters 

509 ---------- 

510 *args : `str` 

511 Names of the columns/dimensions to use for ordering. Column name 

512 can be prefixed with minus (``-``) to use descending ordering. 

513 

514 Returns 

515 ------- 

516 result : `DataCoordinateQueryResults` 

517 Returns ``self`` instance which is updated to return ordered 

518 result. 

519 

520 Notes 

521 ----- 

522 This method modifies the iterator in place and returns the same 

523 instance to support method chaining. 

524 """ 

525 return self._clone(order_by=args) 

526 

527 def limit(self, limit: int, offset: int | None = None) -> DataCoordinateQueryResults: 

528 """Make the iterator return limited number of records. 

529 

530 Parameters 

531 ---------- 

532 limit : `int` 

533 Upper limit on the number of returned records. 

534 offset : `int` or `None` 

535 If not `None` then the number of records to skip before returning 

536 ``limit`` records. 

537 

538 Returns 

539 ------- 

540 result : `DataCoordinateQueryResults` 

541 Returns ``self`` instance which is updated to return limited set 

542 of records. 

543 

544 Notes 

545 ----- 

546 This method modifies the iterator in place and returns the same 

547 instance to support method chaining. Normally this method is used 

548 together with `order_by` method. 

549 """ 

550 return self._clone(limit=(limit, offset)) 

551 

552 

553class DatasetQueryResults(Iterable[DatasetRef]): 

554 """An interface for objects that represent the results of queries for 

555 datasets. 

556 """ 

557 

558 @abstractmethod 

559 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

560 """Group results by parent dataset type. 

561 

562 Returns 

563 ------- 

564 iter : `Iterator` [ `ParentDatasetQueryResults` ] 

565 An iterator over `DatasetQueryResults` instances that are each 

566 responsible for a single parent dataset type (either just that 

567 dataset type, one or more of its component dataset types, or both). 

568 """ 

569 raise NotImplementedError() 

570 

571 @abstractmethod 

572 def materialize(self) -> AbstractContextManager[DatasetQueryResults]: 

573 """Insert this query's results into a temporary table. 

574 

575 Returns 

576 ------- 

577 context : `typing.ContextManager` [ `DatasetQueryResults` ] 

578 A context manager that ensures the temporary table is created and 

579 populated in ``__enter__`` (returning a results object backed by 

580 that table), and dropped in ``__exit__``. If ``self`` is already 

581 materialized, the context manager may do nothing (reflecting the 

582 fact that an outer context manager should already take care of 

583 everything else). 

584 """ 

585 raise NotImplementedError() 

586 

587 @abstractmethod 

588 def expanded(self) -> DatasetQueryResults: 

589 """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords` 

590 returns `True` for all data IDs in returned `DatasetRef` objects. 

591 

592 Returns 

593 ------- 

594 expanded : `DatasetQueryResults` 

595 Either a new `DatasetQueryResults` instance or ``self``, if it is 

596 already expanded. 

597 

598 Notes 

599 ----- 

600 As with `DataCoordinateQueryResults.expanded`, it may be more efficient 

601 to call `materialize` before expanding data IDs for very large result 

602 sets. 

603 """ 

604 raise NotImplementedError() 

605 

606 @abstractmethod 

607 def count(self, *, exact: bool = True) -> int: 

608 """Count the number of rows this query would return. 

609 

610 Parameters 

611 ---------- 

612 exact : `bool`, optional 

613 If `True`, run the full query and perform post-query filtering if 

614 needed to account for that filtering in the count. If `False`, the 

615 result may be an upper bound. 

616 

617 Returns 

618 ------- 

619 count : `int` 

620 The number of rows the query would return, or an upper bound if 

621 ``exact=False``. 

622 

623 Notes 

624 ----- 

625 This counts the number of rows returned, not the number of unique rows 

626 returned, so even with ``exact=True`` it may provide only an upper 

627 bound on the number of *deduplicated* result rows. 

628 """ 

629 raise NotImplementedError() 

630 

631 @abstractmethod 

632 def any( 

633 self, 

634 *, 

635 execute: bool = True, 

636 exact: bool = True, 

637 ) -> bool: 

638 """Test whether this query returns any results. 

639 

640 Parameters 

641 ---------- 

642 execute : `bool`, optional 

643 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

644 determined prior to execution that the query would return no rows. 

645 exact : `bool`, optional 

646 If `True`, run the full query and perform post-query filtering if 

647 needed, until at least one result row is found. If `False`, the 

648 returned result does not account for post-query filtering, and 

649 hence may be `True` even when all result rows would be filtered 

650 out. 

651 

652 Returns 

653 ------- 

654 any : `bool` 

655 `True` if the query would (or might, depending on arguments) yield 

656 result rows. `False` if it definitely would not. 

657 """ 

658 raise NotImplementedError() 

659 

660 @abstractmethod 

661 def explain_no_results(self) -> Iterator[str]: 

662 """Return human-readable messages that may help explain why the query 

663 yields no results. 

664 

665 Returns 

666 ------- 

667 messages : `Iterator` [ `str` ] 

668 String messages that describe reasons the query might not yield any 

669 results. 

670 

671 Notes 

672 ----- 

673 Messages related to post-query filtering are only available if the 

674 iterator has been exhausted, or if `any` or `count` was already called 

675 (with ``exact=True`` for the latter two). 

676 

677 This method first yields messages that are generated while the query is 

678 being built or filtered, but may then proceed to diagnostics generated 

679 by performing what should be inexpensive follow-up queries. Callers 

680 can short-circuit this at any time by simplying not iterating further. 

681 """ 

682 raise NotImplementedError() 

683 

684 

685class ParentDatasetQueryResults(DatasetQueryResults): 

686 """An object that represents results from a query for datasets with a 

687 single parent `DatasetType`. 

688 

689 Parameters 

690 ---------- 

691 db : `Database` 

692 Database engine to execute queries against. 

693 query : `Query` 

694 Low-level query object that backs these results. ``query.datasetType`` 

695 will be the parent dataset type for this object, and may not be `None`. 

696 components : `Sequence` [ `str` or `None` ] 

697 Names of components to include in iteration. `None` may be included 

698 (at most once) to include the parent dataset type. 

699 records : `Mapping`, optional 

700 Mapping containing `DimensionRecord` objects for all dimensions and 

701 all data IDs this query will yield. If `None` (default), 

702 `DataCoordinate.hasRecords` will return `False` for all nested data 

703 IDs. This is a nested mapping with `str` names of dimension elements 

704 as outer keys, `DimensionRecord` instances as inner values, and 

705 ``tuple(record.dataId.values())`` for the inner keys / outer values 

706 (where ``record`` is the innermost `DimensionRecord` instance). 

707 datasetType : `DatasetType`, optional 

708 Parent dataset type for all datasets returned by this query. If not 

709 provided, ``query.datasetType`` be used, and must not be `None` (as it 

710 is in the case where the query is known to yield no results prior to 

711 execution). 

712 """ 

713 

714 def __init__( 

715 self, 

716 db: Database, 

717 query: Query, 

718 *, 

719 components: Sequence[str | None], 

720 records: Mapping[str, Mapping[tuple, DimensionRecord]] | None = None, 

721 datasetType: DatasetType | None = None, 

722 ): 

723 self._db = db 

724 self._query = query 

725 self._components = components 

726 self._records = records 

727 if datasetType is None: 

728 datasetType = query.datasetType 

729 assert datasetType is not None, "Query used to initialize dataset results must have a dataset." 

730 assert datasetType.dimensions.issubset( 

731 query.graph 

732 ), f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}." 

733 self._datasetType = datasetType 

734 

735 __slots__ = ("_db", "_query", "_dimensions", "_components", "_records") 

736 

737 def __iter__(self) -> Iterator[DatasetRef]: 

738 for row in self._query.rows(self._db): 

739 parentRef = self._query.extractDatasetRef(row, records=self._records) 

740 for component in self._components: 

741 if component is None: 

742 yield parentRef 

743 else: 

744 yield parentRef.makeComponentRef(component) 

745 

746 def __repr__(self) -> str: 

747 return f"<DatasetRef iterator for [components of] {self._datasetType.name}>" 

748 

749 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

750 # Docstring inherited from DatasetQueryResults. 

751 yield self 

752 

753 @contextmanager 

754 def materialize(self) -> Iterator[ParentDatasetQueryResults]: 

755 # Docstring inherited from DatasetQueryResults. 

756 with self._query.materialize(self._db) as materialized: 

757 yield ParentDatasetQueryResults( 

758 self._db, materialized, components=self._components, records=self._records 

759 ) 

760 

761 @property 

762 def parentDatasetType(self) -> DatasetType: 

763 """The parent dataset type for all datasets in this iterable 

764 (`DatasetType`). 

765 """ 

766 return self._datasetType 

767 

768 @property 

769 def dataIds(self) -> DataCoordinateQueryResults: 

770 """A lazy-evaluation object representing a query for just the data 

771 IDs of the datasets that would be returned by this query 

772 (`DataCoordinateQueryResults`). 

773 

774 The returned object is not in general `zip`-iterable with ``self``; 

775 it may be in a different order or have (or not have) duplicates. 

776 """ 

777 query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False) 

778 return DataCoordinateQueryResults.from_query( 

779 self._db, 

780 query, 

781 self.parentDatasetType.dimensions, 

782 records=self._records, 

783 ) 

784 

785 def withComponents(self, components: Sequence[str | None]) -> ParentDatasetQueryResults: 

786 """Return a new query results object for the same parent datasets but 

787 different components. 

788 

789 components : `Sequence` [ `str` or `None` ] 

790 Names of components to include in iteration. `None` may be 

791 included (at most once) to include the parent dataset type. 

792 """ 

793 return ParentDatasetQueryResults( 

794 self._db, self._query, records=self._records, components=components, datasetType=self._datasetType 

795 ) 

796 

797 def expanded(self) -> ParentDatasetQueryResults: 

798 # Docstring inherited from DatasetQueryResults. 

799 if self._records is None: 

800 records = self.dataIds.expanded()._records 

801 return ParentDatasetQueryResults( 

802 self._db, 

803 self._query, 

804 records=records, 

805 components=self._components, 

806 datasetType=self._datasetType, 

807 ) 

808 else: 

809 return self 

810 

811 def count(self, *, exact: bool = True) -> int: 

812 # Docstring inherited. 

813 return len(self._components) * self._query.count(self._db, exact=exact) 

814 

815 def any( 

816 self, 

817 *, 

818 execute: bool = True, 

819 exact: bool = True, 

820 ) -> bool: 

821 # Docstring inherited. 

822 return self._query.any(self._db, execute=execute, exact=exact) 

823 

824 def explain_no_results(self) -> Iterator[str]: 

825 # Docstring inherited. 

826 return self._query.explain_no_results(self._db) 

827 

828 

829class ChainedDatasetQueryResults(DatasetQueryResults): 

830 """A `DatasetQueryResults` implementation that simply chains together 

831 other results objects, each for a different parent dataset type. 

832 

833 Parameters 

834 ---------- 

835 chain : `Sequence` [ `ParentDatasetQueryResults` ] 

836 The underlying results objects this object will chain together. 

837 doomed_by : `Iterable` [ `str` ], optional 

838 A list of messages (appropriate for e.g. logging or exceptions) that 

839 explain why the query is known to return no results even before it is 

840 executed. Queries with a non-empty list will never be executed. 

841 Child results objects may also have their own list. 

842 """ 

843 

844 def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()): 

845 self._chain = chain 

846 self._doomed_by = tuple(doomed_by) 

847 

848 __slots__ = ("_chain",) 

849 

850 def __iter__(self) -> Iterator[DatasetRef]: 

851 return itertools.chain.from_iterable(self._chain) 

852 

853 def __repr__(self) -> str: 

854 return "<DatasetRef iterator for multiple dataset types>" 

855 

856 def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: 

857 # Docstring inherited from DatasetQueryResults. 

858 return iter(self._chain) 

859 

860 @contextmanager 

861 def materialize(self) -> Iterator[ChainedDatasetQueryResults]: 

862 # Docstring inherited from DatasetQueryResults. 

863 with ExitStack() as stack: 

864 yield ChainedDatasetQueryResults([stack.enter_context(r.materialize()) for r in self._chain]) 

865 

866 def expanded(self) -> ChainedDatasetQueryResults: 

867 # Docstring inherited from DatasetQueryResults. 

868 return ChainedDatasetQueryResults([r.expanded() for r in self._chain]) 

869 

870 def count(self, *, exact: bool = True) -> int: 

871 # Docstring inherited. 

872 return sum(r.count(exact=exact) for r in self._chain) 

873 

874 def any( 

875 self, 

876 *, 

877 execute: bool = True, 

878 exact: bool = True, 

879 ) -> bool: 

880 # Docstring inherited. 

881 return any(r.any(execute=execute, exact=exact) for r in self._chain) 

882 

883 def explain_no_results(self) -> Iterator[str]: 

884 # Docstring inherited. 

885 for r in self._chain: 

886 yield from r.explain_no_results() 

887 yield from self._doomed_by 

888 

889 

890class DimensionRecordQueryResults(Iterable[DimensionRecord]): 

891 """An interface for objects that represent the results of queries for 

892 dimension records. 

893 """ 

894 

895 @abstractmethod 

896 def count(self, *, exact: bool = True) -> int: 

897 """Count the number of rows this query would return. 

898 

899 Parameters 

900 ---------- 

901 exact : `bool`, optional 

902 If `True`, run the full query and perform post-query filtering if 

903 needed to account for that filtering in the count. If `False`, the 

904 result may be an upper bound. 

905 

906 Returns 

907 ------- 

908 count : `int` 

909 The number of rows the query would return, or an upper bound if 

910 ``exact=False``. 

911 

912 Notes 

913 ----- 

914 This counts the number of rows returned, not the number of unique rows 

915 returned, so even with ``exact=True`` it may provide only an upper 

916 bound on the number of *deduplicated* result rows. 

917 """ 

918 raise NotImplementedError() 

919 

920 @abstractmethod 

921 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

922 """Test whether this query returns any results. 

923 

924 Parameters 

925 ---------- 

926 execute : `bool`, optional 

927 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

928 determined prior to execution that the query would return no rows. 

929 exact : `bool`, optional 

930 If `True`, run the full query and perform post-query filtering if 

931 needed, until at least one result row is found. If `False`, the 

932 returned result does not account for post-query filtering, and 

933 hence may be `True` even when all result rows would be filtered 

934 out. 

935 

936 Returns 

937 ------- 

938 any : `bool` 

939 `True` if the query would (or might, depending on arguments) yield 

940 result rows. `False` if it definitely would not. 

941 """ 

942 raise NotImplementedError() 

943 

944 @abstractmethod 

945 def order_by(self, *args: str) -> DimensionRecordQueryResults: 

946 """Make the iterator return ordered result. 

947 

948 Parameters 

949 ---------- 

950 *args : `str` 

951 Names of the columns/dimensions to use for ordering. Column name 

952 can be prefixed with minus (``-``) to use descending ordering. 

953 

954 Returns 

955 ------- 

956 result : `DimensionRecordQueryResults` 

957 Returns ``self`` instance which is updated to return ordered 

958 result. 

959 

960 Notes 

961 ----- 

962 This method can modify the iterator in place and return the same 

963 instance. 

964 """ 

965 raise NotImplementedError() 

966 

967 @abstractmethod 

968 def limit(self, limit: int, offset: int | None = None) -> DimensionRecordQueryResults: 

969 """Make the iterator return limited number of records. 

970 

971 Parameters 

972 ---------- 

973 limit : `int` 

974 Upper limit on the number of returned records. 

975 offset : `int` or `None` 

976 If not `None` then the number of records to skip before returning 

977 ``limit`` records. 

978 

979 Returns 

980 ------- 

981 result : `DimensionRecordQueryResults` 

982 Returns ``self`` instance which is updated to return limited set 

983 of records. 

984 

985 Notes 

986 ----- 

987 This method can modify the iterator in place and return the same 

988 instance. Normally this method is used together with `order_by` 

989 method. 

990 """ 

991 raise NotImplementedError() 

992 

993 @abstractmethod 

994 def explain_no_results(self) -> Iterator[str]: 

995 """Return human-readable messages that may help explain why the query 

996 yields no results. 

997 

998 Returns 

999 ------- 

1000 messages : `Iterator` [ `str` ] 

1001 String messages that describe reasons the query might not yield any 

1002 results. 

1003 

1004 Notes 

1005 ----- 

1006 Messages related to post-query filtering are only available if the 

1007 iterator has been exhausted, or if `any` or `count` was already called 

1008 (with ``exact=True`` for the latter two). 

1009 

1010 This method first yields messages that are generated while the query is 

1011 being built or filtered, but may then proceed to diagnostics generated 

1012 by performing what should be inexpensive follow-up queries. Callers 

1013 can short-circuit this at any time by simply not iterating further. 

1014 """ 

1015 raise NotImplementedError() 

1016 

1017 

1018class _DimensionRecordKey: 

1019 """Class for objects used as keys in ordering `DimensionRecord` instances. 

1020 

1021 Parameters 

1022 ---------- 

1023 attributes : `Sequence` [ `str` ] 

1024 Sequence of attribute names to use for comparison. 

1025 ordering : `Sequence` [ `bool` ] 

1026 Matching sequence of ordering flags, `False` for descending ordering, 

1027 `True` for ascending ordering. 

1028 record : `DimensionRecord` 

1029 `DimensionRecord` to compare to other records. 

1030 """ 

1031 

1032 def __init__(self, attributes: Sequence[str], ordering: Sequence[bool], record: DimensionRecord): 

1033 self.attributes = attributes 

1034 self.ordering = ordering 

1035 self.rec = record 

1036 

1037 def _cmp(self, other: _DimensionRecordKey) -> int: 

1038 """Compare two records using provided comparison operator. 

1039 

1040 Parameters 

1041 ---------- 

1042 other : `_DimensionRecordKey` 

1043 Key for other record. 

1044 

1045 Returns 

1046 ------- 

1047 result : `int` 

1048 0 if keys are identical, negative if ``self`` is ordered before 

1049 ``other``, positive otherwise. 

1050 """ 

1051 for attribute, ordering in zip(self.attributes, self.ordering): 

1052 # timespan.begin/end cannot use getattr 

1053 attrgetter = operator.attrgetter(attribute) 

1054 lhs = attrgetter(self.rec) 

1055 rhs = attrgetter(other.rec) 

1056 if not ordering: 

1057 lhs, rhs = rhs, lhs 

1058 if lhs != rhs: 

1059 return 1 if lhs > rhs else -1 

1060 return 0 

1061 

1062 def __lt__(self, other: _DimensionRecordKey) -> bool: 

1063 return self._cmp(other) < 0 

1064 

1065 def __gt__(self, other: _DimensionRecordKey) -> bool: 

1066 return self._cmp(other) > 0 

1067 

1068 def __eq__(self, other: Any) -> bool: 

1069 if not isinstance(other, _DimensionRecordKey): 

1070 return NotImplemented 

1071 return self._cmp(other) == 0 

1072 

1073 def __le__(self, other: _DimensionRecordKey) -> bool: 

1074 return self._cmp(other) <= 0 

1075 

1076 def __ge__(self, other: _DimensionRecordKey) -> bool: 

1077 return self._cmp(other) >= 0 

1078 

1079 

1080class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults): 

1081 """Implementation of DimensionRecordQueryResults using database query. 

1082 

1083 Parameters 

1084 ---------- 

1085 dataIds : `DataCoordinateQueryResults` 

1086 Iterator for DataIds. 

1087 recordStorage : `DimensionRecordStorage` 

1088 Instance of storage class for dimension records. 

1089 """ 

1090 

1091 def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage): 

1092 self._dataIds = dataIds 

1093 self._recordStorage = recordStorage 

1094 self._order_by: Iterable[str] = () 

1095 

1096 def __iter__(self) -> Iterator[DimensionRecord]: 

1097 # LIMIT is already applied at DataCoordinateQueryResults level 

1098 # (assumption here is that if DataId exists then dimension record 

1099 # exists too and their counts must be equal). fetch() does not 

1100 # guarantee ordering, so we need to sort records in memory below. 

1101 recordIter = self._recordStorage.fetch(self._dataIds) 

1102 if not self._order_by: 

1103 return iter(recordIter) 

1104 

1105 # Parse list of column names and build a list of attribute name for 

1106 # ordering. Note that here we only support ordering by direct 

1107 # attributes of the element, and not other elements from the dimension 

1108 # graph. 

1109 orderBy = ElementOrderByClause(self._order_by, self._recordStorage.element) 

1110 attributes: list[str] = [] 

1111 ordering: list[bool] = [] 

1112 for column in orderBy.order_by_columns: 

1113 if column.column is None: 

1114 assert isinstance(column.element, Dimension), "Element must be a Dimension" 

1115 attributes.append(column.element.primaryKey.name) 

1116 else: 

1117 attributes.append(column.column) 

1118 ordering.append(column.ordering) 

1119 

1120 def _key(record: DimensionRecord) -> _DimensionRecordKey: 

1121 return _DimensionRecordKey(attributes, ordering, record) 

1122 

1123 records = sorted(recordIter, key=_key) 

1124 return iter(records) 

1125 

1126 def count(self, *, exact: bool = True) -> int: 

1127 # Docstring inherited from base class. 

1128 return self._dataIds.count(exact=exact) 

1129 

1130 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

1131 # Docstring inherited from base class. 

1132 return self._dataIds.any(execute=execute, exact=exact) 

1133 

1134 def order_by(self, *args: str) -> DimensionRecordQueryResults: 

1135 # Docstring inherited from base class. 

1136 self._dataIds = self._dataIds.order_by(*args) 

1137 self._order_by = args 

1138 return self 

1139 

1140 def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults: 

1141 # Docstring inherited from base class. 

1142 self._dataIds = self._dataIds.limit(limit, offset) 

1143 return self 

1144 

1145 def explain_no_results(self) -> Iterator[str]: 

1146 # Docstring inherited. 

1147 return self._dataIds.explain_no_results()