Coverage for python/lsst/daf/butler/registry/queries/_query.py: 17% (204 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ()

from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from contextlib import contextmanager
from typing import Any, cast, final

from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
)
from ..wildcards import CollectionWildcard
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader

@final
class Query:
    """A general-purpose representation of a registry query.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        The dimensions that span the query and are used to join its relations
        together.
    backend : `QueryBackend`
        Backend object used to create the query and new ones derived from it.
    context : `QueryContext`
        Context manager that holds relation engines and database connections
        for the query.
    relation : `Relation`
        The relation tree representation of the query as a series of operations
        on tables.
    governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ]
        Constraints on governor dimensions encoded in this query's relation.
        This is a mapping from governor dimension name to sets of values that
        dimension may take.
    is_deferred : `bool`
        If `True`, modifier methods that return a related `Query` object should
        not immediately execute the new query.
    has_record_columns : `bool` or `DimensionElement`
        Whether this query's relation already includes columns for all or some
        dimension element records: `True` means all elements in ``dimensions``
        either have records present in ``record_caches`` or all columns present
        in ``relation``, while a specific `DimensionElement` means that element
        does.
    record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
            `~collections.abc.Mapping` \
            [ `DataCoordinate`, `DimensionRecord` ] ], optional
        Cached dimension record values, organized first by dimension element
        and then by data ID.

    Notes
    -----
    Iterating over a `Query` yields mappings from `ColumnTag` to the associated
    value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
    `iter_dimension_records` methods can be used to instead iterate over
    various butler primitives derived from these rows.

    Iterating over a `Query` may or may not execute database queries again each
    time, depending on the state of its relation tree - see `Query.run` for
    details.

    Query is immutable; all methods that might appear to modify it in place
    actually return a new object (though many attributes will be shared).

    Query is currently (still) an internal-to-Registry object, with only the
    "QueryResults" classes that are backed by it directly exposed to users. It
    has been designed with the intent that it will eventually play a larger
    role, either as the main query result object in a redesigned query
    interface, or a "power user" result option that accompanies simpler
    replacements for the current "QueryResults" objects.
    """

    def __init__(
        self,
        dimensions: DimensionGraph,
        backend: QueryBackend[QueryContext],
        context: QueryContext,
        relation: Relation,
        governor_constraints: Mapping[str, Set[str]],
        is_deferred: bool,
        has_record_columns: bool | DimensionElement,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ):
        self._dimensions = dimensions
        self._backend = backend
        self._context = context
        self._relation = relation
        self._governor_constraints = governor_constraints
        self._is_deferred = is_deferred
        self._has_record_columns = has_record_columns
        self._record_caches = record_caches if record_caches is not None else {}

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions that span the query and are used to join its
        relations together (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def relation(self) -> Relation:
        """The relation tree representation of the query as a series of
        operations on tables (`Relation`).
        """
        return self._relation

    @property
    def has_record_columns(self) -> bool | DimensionElement:
        """Whether this query's relation already includes columns for all or
        some dimension element records (`bool` or `DimensionElement`).
        """
        return self._has_record_columns

    @property
    def backend(self) -> QueryBackend[QueryContext]:
        """Backend object used to create the query and new ones derived from it
        (`QueryBackend`).
        """
        return self._backend

    @contextmanager
    def open_context(self) -> Iterator[None]:
        """Return a context manager that ensures a database connection is
        established and temporary tables and cursors have a defined lifetime.

        Returns
        -------
        context : `contextlib.AbstractContextManager`
            Context manager with no return value.
        """
        if self._context.is_open:
            yield
        else:
            with self._context:
                yield

    def __str__(self) -> str:
        return str(self._relation)

    def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
        return iter(self._context.fetch_iterable(self._relation))
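    # Example (sketch only; ``query`` is assumed to be an existing Query built
    # by registry internals, with the "visit" dimension key among its columns).
    # Raw rows are mappings from ColumnTag to value, and ``open_context`` keeps
    # the database connection and any temporary tables alive while they are
    # streamed:
    #
    #     with query.open_context():
    #         for row in query:
    #             visit_id = row[DimensionKeyColumnTag("visit")]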

    def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
        """Return an iterator that converts result rows to data IDs.

        Parameters
        ----------
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
            Iterator that yields data IDs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        return (reader.read(row) for row in self)
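    # Example (sketch only; assumes ``query`` spans the visit and detector
    # dimensions).  A ColumnError is raised up front if the relation lacks the
    # dimension key columns the reader needs:
    #
    #     with query.open_context():
    #         for data_id in query.iter_data_ids():
    #             print(data_id["visit"], data_id["detector"])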

    def iter_dataset_refs(
        self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
    ) -> Iterator[DatasetRef]:
        """Return an iterator that converts result rows to dataset references.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        components : `~collections.abc.Sequence` [ `None` or `str` ]
            Which component dataset types to construct refs for from each row
            representing a parent; `None` for the parent itself.

        Returns
        -------
        refs : `~collections.abc.Iterator` [ `DatasetRef` ]
            Iterator that yields (resolved) dataset references.
        """
        reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            parent_ref = reader.read(row)
            for component in components:
                if component is None:
                    yield parent_ref
                else:
                    yield parent_ref.makeComponentRef(component)
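    # Example (sketch only; ``raw_dataset_type`` is a hypothetical parent
    # DatasetType for which a dataset search has already been joined in, e.g.
    # via ``find_datasets``):
    #
    #     with query.open_context():
    #         for ref in query.iter_dataset_refs(raw_dataset_type):
    #             print(ref.datasetType.name, ref.dataId, ref.run)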

    def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
        """Return an iterator that converts result rows to dimension records.

        Parameters
        ----------
        element : `DimensionElement`, optional
            Dimension element whose records will be returned. If not provided,
            `has_record_columns` must be a `DimensionElement` instance.

        Returns
        -------
        records : `~collections.abc.Iterator` [ `DimensionRecord` ]
            Iterator that yields dimension records.
        """
        if element is None:
            match self._has_record_columns:
                case True | False:
                    raise ValueError("No default dimension element in query; 'element' must be given.")
                case only_element_with_records:
                    element = only_element_with_records
        if (cache := self._record_caches.get(element)) is not None:
            return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
        else:
            reader = DimensionRecordReader(element)
            if not (reader.columns_required <= self.relation.columns):
                raise ColumnError(
                    f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                    f"for records of element {element.name}."
                )
            return (reader.read(row) for row in self)
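    # Example (sketch only; assumes ``query`` already carries record columns
    # or a record cache for the "detector" element, e.g. after
    # ``with_record_columns``):
    #
    #     detector = query.dimensions.universe["detector"]
    #     with query.open_context():
    #         for record in query.iter_dimension_records(detector):
    #             print(record)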

    def run(self) -> Query:
        """Execute the query and hold its results in memory.

        Returns
        -------
        executed : `Query`
            New query that holds the query results.

        Notes
        -----
        Iterating over the results of a query that has been `run` will always
        iterate over an existing container, while iterating over a query that
        has not been run will result in executing at least some of the query
        each time.

        Running a query also sets its `is_deferred` flag to `False`, which will
        cause new queries constructed by its methods to be run immediately,
        unless ``defer=True`` is passed to the factory method. After a query
        has been run, factory methods will also tend to prefer to apply new
        operations (e.g. `with_only_column`, `sliced`, `sorted`) via Python
        code acting on the existing container rather than going back to SQL,
        which can be less efficient overall than applying operations to a
        deferred query and executing them all only at the end.

        Running a query is represented in terms of relations by adding a
        `~lsst.daf.relation.Materialization` marker relation in the iteration
        engine and then processing the relation tree; this attaches the
        container of rows to that new relation to short-circuit any future
        processing of the tree and lock changes to the tree upstream of it.
        This is very different from the SQL-engine
        `~lsst.daf.relation.Materialization` added to the tree by the
        `materialize` method from a user perspective, though it has a similar
        representation in the relation tree.
        """
        relation = (
            # Make a new relation that definitely ends in the iteration engine
            # (this does nothing if it already does).
            self.relation.transferred_to(self._context.iteration_engine)
            # Make the new relation save its rows to an in-memory Python
            # collection in relation.payload when processed.
            .materialized(name_prefix="run")
        )
        # Actually process the relation, simplifying out trivial relations,
        # executing any SQL queries, and saving results to relation.payload.
        # We discard the simplified relation that's returned, because we want
        # the new query to have any extra diagnostic information contained in
        # the original.
        self._context.process(relation)
        return self._copy(relation, False)
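    # Example (sketch only): ``run`` executes the tree once and pins the rows
    # in memory, so later iteration and derived queries typically work on the
    # local container rather than re-executing SQL:
    #
    #     executed = query.run()
    #     n_rows = executed.count()
    #     first_ten = executed.sliced(0, 10)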

    def materialized(self, defer_postprocessing: bool = True) -> Query:
        """Materialize the results of this query in its context's preferred
        engine.

        Usually this means inserting the results into a temporary table in a
        database.

        Parameters
        ----------
        defer_postprocessing : `bool`, optional
            If `True`, do not execute operations that occur in the context's
            `QueryContext.iteration_engine` up front; instead insert and
            execute a materialization upstream of them (e.g. via a SQL
            ``INSERT INTO ... SELECT`` statement, with no fetching to the
            client) and execute the postprocessing operations when iterating
            over the query results. If `False`, and iteration-engine
            postprocessing operations exist, run the full query, execute them
            now, and upload the results.
            If the relation is already in the preferred engine, this option
            is ignored and the materialization will not involve fetching rows
            to the iteration engine at all. If the relation has already been
            materialized in the iteration engine (i.e. via `run`), then this
            option is again ignored and an upload of the existing rows will
            be performed.

        Returns
        -------
        materialized : `Query`
            Modified query with the same row-and-column content with a
            materialization in ``self.context.preferred_engine``.
        """
        if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
            relation, stripped = self._context.strip_postprocessing(self._relation)
            if relation.engine == self._context.preferred_engine:
                # We got all the way to the engine we want to materialize in.
                # Apply that operation to the tree, process it (which actually
                # creates a temporary table and populates it), and then reapply
                # the stripped operations.
                relation = relation.materialized()
                self._context.process(relation)
                for operation in stripped:
                    relation = operation.apply(
                        relation, transfer=True, preferred_engine=self._context.iteration_engine
                    )
                return self._copy(relation, True)
        # Either defer_postprocessing=False, or attempting to strip off unary
        # operations until we got to the preferred engine didn't work, because
        # this tree doesn't actually involve the preferred engine. So we just
        # transfer to the preferred engine first, and then materialize,
        # process, and return.
        relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
        self._context.process(relation)
        return self._copy(relation, True)
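    # Example (sketch only; the dataset type and collection name are
    # hypothetical): materializing into the preferred (SQL) engine is useful
    # when the same intermediate result feeds several later joins, as in
    # quantum-graph generation:
    #
    #     with query.open_context():
    #         temp = query.materialized()  # temporary table in the database
    #         with_raws = temp.find_datasets(raw_dataset_type, collections=["HSC/raw/all"])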

    def projected(
        self,
        dimensions: Iterable[Dimension | str] | None = None,
        unique: bool = True,
        columns: Iterable[ColumnTag] | None = None,
        defer: bool | None = None,
        drop_postprocessing: bool = False,
        keep_record_columns: bool = True,
    ) -> Query:
        """Return a modified `Query` with a subset of this one's columns.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ], \
                optional
            Dimensions to include in the new query. Will be expanded to
            include all required and implied dependencies. Must be a subset of
            ``self.dimensions``. If not provided, ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (default) deduplicate rows after dropping columns.
        columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
            Additional dataset or dimension record columns to include in the
            query. Dimension key columns added here are ignored unless they
            extend beyond the key columns implied by the ``dimensions``
            argument (which is an error).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").
        drop_postprocessing : `bool`, optional
            Drop any iteration-engine operations that depend on columns that
            are being removed (e.g. region-overlap tests when region columns
            are being dropped), making it more likely that projection and
            deduplication could be performed in the preferred engine, where
            they may be more efficient.
        keep_record_columns : `bool`, optional
            If `True` (default) and this query `has_record_columns`, implicitly
            add any of those to ``columns`` whose dimension element is in the
            given ``dimensions``.

        Returns
        -------
        query : `Query`
            New query with the requested columns only, optionally deduplicated.

        Notes
        -----
        Dataset columns are dropped from the new query unless passed via the
        ``columns`` argument. All other columns are by default preserved.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if the columns to include in the new query are not all
            present in the current query.
        """
        if dimensions is None:
            dimensions = set(self._dimensions)
        else:
            dimensions = set(dimensions)
        if columns is not None:
            dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
        dimensions = self._dimensions.universe.extract(dimensions)
        if columns is None:
            columns = set()
        else:
            columns = set(columns)
        columns.update(DimensionKeyColumnTag.generate(dimensions.names))
        if keep_record_columns:
            if self._has_record_columns is True:
                for element in dimensions.elements:
                    if element not in self._record_caches:
                        columns.update(element.RecordClass.fields.columns)
            elif self._has_record_columns in dimensions.elements:
                element = cast(DimensionElement, self._has_record_columns)
                columns.update(element.RecordClass.fields.columns)
        if drop_postprocessing:
            relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
            # Dropping postprocessing Calculations could cause other columns
            # we had otherwise intended to keep to be dropped as well.
            columns &= relation.columns
        else:
            relation = self._relation
        relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
        if unique:
            relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer, dimensions=dimensions)
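    # Example (sketch only; assumes ``query`` spans visit and detector):
    # narrowing to just the visit dimension and deduplicating the rows.
    #
    #     visits_only = query.projected(dimensions=["visit"], unique=True)
    #     with visits_only.open_context():
    #         visit_ids = {data_id["visit"] for data_id in visits_only.iter_data_ids()}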

    def with_record_columns(
        self, dimension_element: DimensionElement | None = None, defer: bool | None = None
    ) -> Query:
        """Return a modified `Query` with additional dimension record columns
        and/or caches.

        Parameters
        ----------
        dimension_element : `DimensionElement`, optional
            Single element to add record columns for, or `None` (default) to
            add them for all elements in `dimensions`.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested record columns either in the relation
            or (when possible) available via record caching.

        Notes
        -----
        Adding dimension record columns is fundamentally different from adding
        new dimension key columns or dataset columns, because it is purely an
        addition of columns, not rows - we can always join in a dimension
        element table (if it has not already been included) on keys already
        present in the current relation, confident that there is exactly one
        row in the dimension element table for each row in the current
        relation.
        """
        if self._has_record_columns is True or self._has_record_columns == dimension_element:
            return self
        record_caches = dict(self._record_caches)
        columns_required: set[ColumnTag] = set()
        for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
            if element in record_caches:
                continue
            if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
                record_caches[element] = cache
            else:
                columns_required.update(element.RecordClass.fields.columns.keys())
        # Modify the relation we have to remove any projections that dropped
        # columns we now want, as long as the relation's behavior is otherwise
        # unchanged.
        columns_required -= self._relation.columns
        relation, columns_found = self._context.restore_columns(self._relation, columns_required)
        columns_required.difference_update(columns_found)
        if columns_required:
            relation = self._backend.make_dimension_relation(
                self._dimensions,
                columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this operation
                # does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
            )
        return self._chain(
            relation,
            defer=defer,
            has_record_columns=True if dimension_element is None else dimension_element,
            record_caches=record_caches,
        )
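    # Example (sketch only): attaching record columns or caches so that data
    # IDs come back expanded with their dimension records.
    #
    #     expanded = query.with_record_columns()
    #     with expanded.open_context():
    #         for data_id in expanded.iter_data_ids():
    #             assert data_id.hasRecords()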

    def find_datasets(
        self,
        dataset_type: DatasetType,
        collections: Any,
        *,
        find_first: bool = True,
        columns: Set[str] = frozenset(("dataset_id", "run")),
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that includes a search for datasets of the
        given type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to search for. May not be a component.
        collections
            Collection search path or pattern. Must be a single collection
            name or ordered sequence if ``find_first=True``. See
            :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default) search collections in order until the first
            match for each data ID is found. If `False`, return all matches in
            all collections.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include in the new query. Options include:

            - ``dataset_id``: the unique identifier of the dataset. The type
              is implementation-dependent. Never nullable. Included by
              default.

            - ``ingest_date``: the date and time the dataset was added to the
              data repository.

            - ``run``: the foreign key column to the `~CollectionType.RUN`
              collection holding the dataset (not necessarily the collection
              name). The type is dependent on the collection manager
              implementation. Included by default.

            - ``collection``: the foreign key column to the collection in
              which the dataset was actually found in this search. The type is
              dependent on the collection manager implementation. This may
              differ from ``run`` if the dataset is present in a matching
              `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
              collection, which means the same dataset may also appear multiple
              times in the query results.

            - ``timespan``: the validity range for datasets found in a
              `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
              collection types.

            The default columns (``dataset_id`` and ``run``) are sufficient to
            enable `iter_dataset_refs`, which also takes care of translating
            the internal ``RUN`` collection key into its public name.

            Setting this to an empty set while passing ``find_first=False``
            will return a query that is constrained by dataset existence in
            some matching collection but that does not actually return which
            datasets existed.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested dataset columns, constrained by the
            existence of datasets of this type in the given collection.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if a dataset search is already present in this query and
            this is a find-first search.
        ValueError
            Raised if the given dataset type's dimensions are not a subset of
            the current query's dimensions.
        """
        if find_first and DatasetColumnTag.filter_from(self._relation.columns):
            raise ColumnError(
                "Cannot search for datasets with find_first=True "
                "on a query that already includes dataset columns."
            )
        #
        # TODO: it'd be nice to do a QueryContext.restore_columns call here or
        # similar, to look for dataset-constraint joins already present in the
        # relation and expand them to include dataset-result columns as well,
        # instead of doing a possibly-redundant join here. But that would
        # require pushing relation usage down further into
        # DatasetStorageManager.make_relation, so that it doesn't need to be
        # given the columns, and then giving the relation system the ability to
        # simplify-away redundant joins when they only provide columns that
        # aren't ultimately used. The right time to look into that is probably
        # when investigating whether the base QueryBackend should be
        # responsible for producing an "abstract" relation tree of some sort,
        # with the subclasses only responsible for filling it in with payloads
        # (and possibly replacing some leaves with new sub-trees) when
        # "processed" (or in some other "prepare" step).
        #
        # This is a low priority for three reasons:
        # - there's some chance the database's query optimizer will simplify
        #   away these redundant joins;
        # - at present, the main use of this code path is in QG generation,
        #   where we materialize the initial data ID query into a temp table
        #   and hence can't go back and "recover" those dataset columns anyway;
        #
        if not (dataset_type.dimensions <= self._dimensions):
            raise ValueError(
                "Cannot find datasets from a query unless the dataset type's dimensions "
                f"({dataset_type.dimensions}, for {dataset_type.name}) are a subset of the query's "
                f"({self._dimensions})."
            )
        columns = set(columns)
        columns.add("dataset_id")
        collections = CollectionWildcard.from_expression(collections)
        if find_first:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            dataset_type,
            collections,
            governor_constraints=self._governor_constraints,
            allow_calibration_collections=False,  # TODO
            rejections=rejections,
        )
        if not collection_records:
            relation = self._relation.join(
                self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
            )
        elif find_first:
            relation = self._backend.make_dataset_search_relation(
                dataset_type, collection_records, columns, self._context, join_to=self._relation
            )
        else:
            dataset_relation = self._backend.make_dataset_query_relation(
                dataset_type, collection_records, columns, self._context
            )
            relation = self.relation.join(dataset_relation)
        return self._chain(relation, defer=defer)
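    # Example (sketch only; the dataset type and collection names are
    # hypothetical): a find-first search over an ordered collection path.
    #
    #     with_raws = query.find_datasets(
    #         raw_dataset_type, collections=["HSC/raw/all"], find_first=True
    #     )
    #     with with_raws.open_context():
    #         refs = list(with_raws.iter_dataset_refs(raw_dataset_type))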

    def sliced(
        self,
        start: int = 0,
        stop: int | None = None,
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that takes a slice of this one's rows.

        Parameters
        ----------
        start : `int`, optional
            First index to include, inclusive.
        stop : `int` or `None`, optional
            One past the last index to include (i.e. exclusive).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested slice.

        Notes
        -----
        This operation must be implemented in the iteration engine if there are
        postprocessing operations, which may be much less efficient than
        performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
        in SQL).

        Since query row order is usually arbitrary, it usually makes sense to
        call `sorted` before calling `sliced` to make the results
        deterministic. This is not checked because there are some contexts
        where getting an arbitrary subset of the results of a given size
        still makes sense.
        """
        return self._chain(self._relation[start:stop], defer)

    def sorted(
        self,
        order_by: Iterable[SortTerm],
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that sorts this one's rows.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `SortTerm` ]
            Expressions to sort by.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested sorting.

        Notes
        -----
        The ``order_by`` expression can include references to dimension record
        columns that were not present in the original relation; this is
        similar to calling `with_record_columns` for those columns first (but
        in this case column requests cannot be satisfied by record caches).
        All other columns referenced must be present in the query already.
        """
        op = Sort(tuple(order_by))
        columns_required = set(op.columns_required)
        columns_required.difference_update(self._relation.columns)
        if columns_required:
            relation, columns_found = self._context.restore_columns(self._relation, columns_required)
            columns_required.difference_update(columns_found)
            if columns_required:
                try:
                    relation = self._backend.make_dimension_relation(
                        self._dimensions,
                        columns_required,
                        self._context,
                        initial_relation=relation,
                        # Don't permit joins to use any columns beyond those in
                        # the original relation, as that would change what this
                        # operation does.
                        initial_join_max_columns=frozenset(self._relation.columns),
                        governor_constraints=self._governor_constraints,
                    )
                except ColumnError as err:
                    raise ColumnError(
                        "Cannot sort by columns that were not included in the original query or "
                        "fully resolved by its dimensions."
                    ) from err
        else:
            relation = self._relation
        relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer)
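    # Example (sketch only; the exact SortTerm construction depends on the
    # lsst.daf.relation column-expression API): sort before slicing so the
    # page of rows returned is deterministic.
    #
    #     from lsst.daf.relation import ColumnExpression
    #
    #     visit_term = SortTerm(ColumnExpression.reference(DimensionKeyColumnTag("visit")))
    #     first_page = query.sorted([visit_term]).sliced(0, 100)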

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in this query.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the query, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        if self._relation.min_rows == self._relation.max_rows:
            return self._relation.max_rows
        return self._context.count(self._relation, exact=exact, discard=discard)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this query has any result rows at all.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the query has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        TypeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        if self._relation.min_rows > 0:
            return True
        if self._relation.max_rows == 0:
            return False
        if execute:
            return self._context.any(self._relation, execute=execute, exact=exact)
        elif not exact:
            return True
        raise TypeError("Cannot obtain exact results without executing the query.")
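    # Example (sketch only): cheap existence and size checks that avoid
    # fetching rows when the relation tree already bounds the answer.
    #
    #     if not query.any(execute=True, exact=False):
    #         ...  # no rows even before any client-side filtering
    #     approximate_size = query.count(exact=False)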

    def explain_no_results(self, execute: bool = True) -> list[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the query to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        # First try without actually executing any queries.
        diagnostics = Diagnostics.run(self._relation)
        if diagnostics.is_doomed:
            return diagnostics.messages
        if execute:
            # Try again, running LIMIT 1 queries as we walk back down the tree
            # to look for relations with no rows:
            diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
            if diagnostics.is_doomed:
                return diagnostics.messages
        return []
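    # Example (sketch only): when a query comes back empty, the diagnostics
    # can point at the part of the tree that produced zero rows.
    #
    #     if not query.any():
    #         for message in query.explain_no_results():
    #             print(message)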

    def _copy(
        self,
        relation: Relation,
        is_deferred: bool,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified copy of this query with some attributes replaced.

        See class docs for parameter documentation; the only difference here
        is that the defaults are the values ``self`` was constructed with.
        """
        return Query(
            dimensions=self._dimensions if dimensions is None else dimensions,
            backend=self._backend,
            context=self._context,
            relation=relation,
            governor_constraints=(
                governor_constraints if governor_constraints is not None else self._governor_constraints
            ),
            is_deferred=is_deferred,
            has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
            record_caches=self._record_caches if record_caches is None else record_caches,
        )

    def _chain(
        self,
        relation: Relation,
        defer: bool | None,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified query with a new relation while handling the
        ubiquitous ``defer`` parameter's logic.

        Parameters
        ----------
        relation : `Relation`
            Relation for the new query.
        defer : `bool`
            If `False`, run the new query immediately. If `True`, do not. If
            `None`, the ``defer`` option passed when making ``self`` is used
            (this option is "sticky").
        dimensions : `DimensionGraph`, optional
            See class docs.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ], optional
            See class docs.
        has_record_columns : `bool` or `DimensionElement`, optional
            See class docs.
        record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
                `~collections.abc.Mapping` \
                [ `DataCoordinate`, `DimensionRecord` ] ], optional
            See class docs.

        Returns
        -------
        chained : `Query`
            Modified query, or ``self`` if no modifications were actually
            requested.
        """
        if defer is None:
            defer = self._is_deferred
        if (
            relation is self._relation
            and dimensions is None
            and defer == self._is_deferred
            and record_caches is None
            and has_record_columns is None
            and governor_constraints is None
        ):
            return self
        result = self._copy(
            relation,
            is_deferred=True,
            governor_constraints=governor_constraints,
            dimensions=dimensions,
            has_record_columns=has_record_columns,
            record_caches=record_caches,
        )
        if not defer:
            result = result.run()
        return result