Coverage for python/lsst/daf/butler/registry/queries/_results.py: 35%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = (
    "ChainedDatasetQueryResults",
    "DatabaseDimensionRecordQueryResults",
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "ParentDatasetQueryResults",
)

from abc import abstractmethod
from contextlib import contextmanager, ExitStack
import itertools
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import sqlalchemy

from ...core import (
    DataCoordinate,
    DataCoordinateIterable,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionRecord,
    SimpleQuery,
)
from ..interfaces import Database, DimensionRecordStorage
from ._query import Query
from ._structs import QuerySummary


QueryFactoryMethod = Callable[[Optional[Iterable[str]], Optional[Tuple[int, Optional[int]]]], Query]
"""Type of a query factory method used by `DataCoordinateQueryResults`.
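
A conforming factory accepts optional ``order_by`` column names and an
optional ``(limit, offset)`` tuple and returns a `Query`.  For example, a
minimal sketch (``make_query`` is a hypothetical stand-in for whatever
builds the underlying `Query`)::

    def factory(order_by: Optional[Iterable[str]],
                limit: Optional[Tuple[int, Optional[int]]]) -> Query:
        return make_query(order_by=order_by, limit=limit)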

"""


class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query_factory : `QueryFactoryMethod`
        Method that creates an instance of the `Query` class.
    graph : `DimensionGraph`
        Dimensions used by the query.
    order_by : `Iterable` [ `str` ], optional
        Optional sequence of column names used for result ordering.
    limit : `Tuple` [ `int`, `int` ], optional
        Limit on the number of returned records and an optional offset.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `DataCoordinateIterable.hasRecords` will return `False`.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord`
        values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or
    the methods of other query result objects.
    """
    def __init__(self,
                 db: Database,
                 query_factory: QueryFactoryMethod,
                 graph: DimensionGraph,
                 *,
                 order_by: Optional[Iterable[str]] = None,
                 limit: Optional[Tuple[int, Optional[int]]] = None,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query_factory = query_factory
        self._graph = graph
        self._order_by = order_by
        self._limit = limit
        self._records = records
        self._cached_query: Optional[Query] = None

    __slots__ = ("_db", "_query_factory", "_graph", "_order_by", "_limit", "_records", "_cached_query")

    @classmethod
    def from_query(cls,
                   db: Database,
                   query: Query,
                   graph: DimensionGraph,
                   *,
                   order_by: Optional[Iterable[str]] = None,
                   limit: Optional[Tuple[int, Optional[int]]] = None,
                   records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None
                   ) -> DataCoordinateQueryResults:
        """Make an instance from a pre-existing query instead of a factory.

        Parameters
        ----------
        db : `Database`
            Database engine used to execute queries.
        query : `Query`
            Low-level representation of the query that backs this result
            object.
        graph : `DimensionGraph`
            Dimensions used by the query.
        order_by : `Iterable` [ `str` ], optional
            Optional sequence of column names used for result ordering.
        limit : `Tuple` [ `int`, `int` ], optional
            Limit on the number of returned records and an optional offset.
        records : `Mapping`, optional
            A nested mapping containing `DimensionRecord` objects for all
            dimensions and all data IDs this query will yield.  If `None`
            (default), `DataCoordinateIterable.hasRecords` will return
            `False`.  The outer mapping has `str` keys (the names of
            dimension elements).  The inner mapping has `tuple` keys
            representing data IDs (tuple conversions of
            `DataCoordinate.values()`) and `DimensionRecord` values.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A new results object backed by the given query.
        """
        def factory(order_by: Optional[Iterable[str]], limit: Optional[Tuple[int, Optional[int]]]) -> Query:
            # The pre-built query is returned as-is; ordering and limits are
            # assumed to have been applied when it was constructed.
            return query

        return DataCoordinateQueryResults(db, factory, graph, order_by=order_by,
                                          limit=limit, records=records)

    def __iter__(self) -> Iterator[DataCoordinate]:
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    def __repr__(self) -> str:
        return f"<DataCoordinate iterator with dimensions={self._graph}>"

    def _clone(self, *,
               query_factory: Optional[QueryFactoryMethod] = None,
               query: Optional[Query] = None,
               graph: Optional[DimensionGraph] = None,
               order_by: Optional[Iterable[str]] = None,
               limit: Optional[Tuple[int, Optional[int]]] = None,
               records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
               ) -> DataCoordinateQueryResults:
        """Clone this instance, potentially updating some attributes.
        """
        graph = graph if graph is not None else self._graph
        order_by = order_by if order_by is not None else self._order_by
        limit = limit if limit is not None else self._limit
        records = records if records is not None else self._records
        if query is None:
            query_factory = query_factory or self._query_factory
            return DataCoordinateQueryResults(self._db, query_factory, graph,
                                              order_by=order_by, limit=limit, records=records)
        else:
            return DataCoordinateQueryResults.from_query(self._db, query, graph,
                                                         order_by=order_by, limit=limit,
                                                         records=records)

    @property
    def _query(self) -> Query:
        """Query representation instance (`Query`)."""
        if self._cached_query is None:
            self._cached_query = self._query_factory(self._order_by, self._limit)
            assert self._cached_query.datasetType is None, \
                "Query used to initialize data coordinate results should not have any datasets."
        return self._cached_query

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return self._records is not None or not self._graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            # Note that we depend on order_by columns being passed from Query
            # to MaterializedQuery, so order_by and limit are not used here.
            yield self._clone(query=materialized)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }

            return self._clone(query=self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        query = self._query.subset(graph=graph, datasets=False, unique=unique)

        return self._clone(graph=graph, query=query, records=records)

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have records
            as well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
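
        For example, a sketch of typical use (the dataset type and collection
        names here are hypothetical)::

            dataIds = registry.queryDataIds(["visit", "detector"])
            for ref in dataIds.findDatasets("calexp",
                                            collections=["HSC/runs/RC2"]):
                ...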

        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: move component handling down into managers.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(f"findDatasets requires that the dataset type have the same dimensions as "
                             f"the DataCoordinateQueryResult used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
        builder = self._query.makeBuilder(summary)
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records, datasetType=datasetType)

    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
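
        For example, a cheap upper bound can guide whether to materialize
        before further processing (a sketch; ``results`` stands for this
        object)::

            if results.count(exact=False) > 100000:
                ...  # consider calling materialize() first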

        """
        return self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
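
        For example, a quick existence check that avoids running the full
        query (a sketch; ``results`` stands for this object)::

            if not results.any(exact=False):
                print("query is doomed; see explain_no_results()")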

        """
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
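
        For example, a diagnostic pattern that pairs this with `any` (a
        sketch; ``results`` stands for this object)::

            if not results.any():
                for message in results.explain_no_results():
                    print(message)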

        """
        return self._query.explain_no_results(self._db)

    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object that returns ordered results.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, which supports method chaining.
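
        For example (a sketch; the dimension name is hypothetical)::

            results = registry.queryDataIds(["visit"])
            for dataId in results.order_by("-visit").limit(5):
                ...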

        """
        return self._clone(order_by=args)

    def limit(self, limit: int, offset: Optional[int] = None) -> DataCoordinateQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            A results object that returns a limited set of records.

        Notes
        -----
        This method returns a new results object rather than modifying
        ``self`` in place, which supports method chaining.  Normally it is
        used together with the `order_by` method.
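
        For example, skipping the first 20 records and returning the next 10
        (a sketch; ``results`` stands for this object)::

            page = results.order_by("visit").limit(10, offset=20)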

        """
        return self._clone(limit=(limit, offset))


class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or
            both).
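
        For example, a sketch of iterating over the per-parent groups
        (``results`` stands for this object)::

            for parentResults in results.byParentDatasetType():
                print(parentResults.parentDatasetType.name)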

        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more
        efficient to call `materialize` before expanding data IDs for very
        large result sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if the
        iterator has been exhausted, or if `any` or `count` was already
        called (with ``exact=True`` for the latter two).

        This method first yields messages that are generated while the query
        is being built or filtered, but may then proceed to diagnostics
        generated by performing what should be inexpensive follow-up queries.
        Callers can short-circuit this at any time by simply not iterating
        further.
        """
        raise NotImplementedError()


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.
        ``query.datasetType`` will be the parent dataset type for this
        object, and may not be `None`.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping with `str` names of dimension elements
        as outer keys, `DimensionRecord` instances as inner values, and
        ``tuple(record.dataId.values())`` for the inner keys / outer values
        (where ``record`` is the innermost `DimensionRecord` instance).
    datasetType : `DatasetType`, optional
        Parent dataset type for all datasets returned by this query.  If not
        provided, ``query.datasetType`` will be used, and must not be `None`
        (as it is in the case where the query is known to yield no results
        prior to execution).
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                 datasetType: Optional[DatasetType] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        if datasetType is None:
            datasetType = query.datasetType
        assert datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert datasetType.dimensions == query.graph, \
            f"Query dimensions {query.graph} do not match dataset type dimensions {datasetType.dimensions}."
        self._datasetType = datasetType

    __slots__ = ("_db", "_query", "_datasetType", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        return self._datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        query = self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False)
        return DataCoordinateQueryResults.from_query(
            self._db,
            query,
            self.parentDatasetType.dimensions,
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.
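
        For example, a sketch that switches iteration to a single
        hypothetical component::

            componentResults = results.withComponents(["wcs"])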

        """
        return ParentDatasetQueryResults(self._db, self._query, records=self._records,
                                         components=components, datasetType=self._datasetType)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components,
                                             datasetType=self._datasetType)
        else:
            return self

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return len(self._components) * self._query.count(self._db, exact=exact)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return self._query.any(self._db, execute=execute, exact=exact)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        return self._query.explain_no_results(self._db)


class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    __slots__ = ("_chain", "_doomed_by")

    def __iter__(self) -> Iterator[DatasetRef]:
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            # Propagate doomed_by so diagnostic messages are not lost when
            # materializing.
            yield ChainedDatasetQueryResults(
                [stack.enter_context(r.materialize()) for r in self._chain],
                doomed_by=self._doomed_by,
            )

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain],
                                          doomed_by=self._doomed_by)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact) for r in self._chain)

    def any(
        self, *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self) -> Iterator[str]:
        # Docstring inherited.
        for r in self._chain:
            yield from r.explain_no_results()
        yield from self._doomed_by


class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @abstractmethod
    def count(self, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering.  A column
            name can be prefixed with a minus (``-``) to use descending
            ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object that returns ordered results.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        """Make the iterator return a limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`
            If not `None`, the number of records to skip before returning
            ``limit`` records.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            A results object that returns a limited set of records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.  Normally it is used together with the `order_by` method.
        """
        raise NotImplementedError()


class DatabaseDimensionRecordQueryResults(DimensionRecordQueryResults):
    """Implementation of `DimensionRecordQueryResults` using a database
    query.

    Parameters
    ----------
    dataIds : `DataCoordinateQueryResults`
        Query results object holding the data IDs whose records are to be
        fetched.
    recordStorage : `DimensionRecordStorage`
        Instance of the storage class for dimension records.
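
    A sketch of typical use, via `Registry.queryDimensionRecords` (the
    dimension and instrument names here are hypothetical)::

        records = registry.queryDimensionRecords("detector", instrument="HSC")
        for record in records.order_by("-detector").limit(5):
            ...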

    """
    def __init__(self, dataIds: DataCoordinateQueryResults, recordStorage: DimensionRecordStorage):
        self._dataIds = dataIds
        self._recordStorage = recordStorage
        self._order_by: Iterable[str] = ()

    def __iter__(self) -> Iterator[DimensionRecord]:
        # LIMIT is already applied at the DataCoordinateQueryResults level
        # (the assumption here is that if a DataId exists then the dimension
        # record exists too, and their counts must be equal).  We still need
        # to make sure that ordering is applied to dimension records as well.
        if not self._order_by:
            return iter(self._recordStorage.fetch(self._dataIds))
        else:
            # The fetch() method does not support ordering, so for now do it
            # the hard way, by fetching everything into memory and ordering
            # by DataId.
            dataIds = self._dataIds.toSequence()
            rec_map = {}
            for rec in self._recordStorage.fetch(dataIds):
                rec_map[rec.dataId] = rec
            # TODO: Do we want to clean up dataIds that may be missing?
            return iter(rec_map[dataId] for dataId in dataIds)

    def count(self, *, exact: bool = True) -> int:
        # Docstring inherited from base class.
        return self._dataIds.count(exact=exact)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited from base class.
        return self._dataIds.any(execute=execute, exact=exact)

    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.order_by(*args)
        self._order_by = args
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> DimensionRecordQueryResults:
        # Docstring inherited from base class.
        self._dataIds = self._dataIds.limit(limit, offset)
        return self