# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ()

from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from contextlib import contextmanager
from typing import Any, cast, final

from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
)
from ..wildcards import CollectionWildcard
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader


@final
class Query:
    """A general-purpose representation of a registry query.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        The dimensions that span the query and are used to join its relations
        together.
    backend : `QueryBackend`
        Backend object used to create the query and new ones derived from it.
    context : `QueryContext`
        Context manager that holds relation engines and database connections
        for the query.
    relation : `Relation`
        The relation tree representation of the query as a series of operations
        on tables.
    governor_constraints : `Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]
        Constraints on governor dimensions encoded in this query's relation.
        This is a mapping from governor dimension name to the set of values
        that dimension may take.
    is_deferred : `bool`
        If `True`, modifier methods that return a related `Query` object should
        not immediately execute the new query.
    has_record_columns : `bool` or `DimensionElement`
        Whether this query's relation already includes columns for all or some
        dimension element records: `True` means every element in ``dimensions``
        either has records present in ``record_caches`` or has all of its
        columns present in ``relation``, while a specific `DimensionElement`
        means that only that element does.
    record_caches : `Mapping` [ `DimensionElement`, `Mapping` [ `DataCoordinate`, `DimensionRecord` ] ], optional
        Cached dimension record values, organized first by dimension element
        and then by data ID.

    Notes
    -----
    Iterating over a `Query` yields mappings from `ColumnTag` to the associated
    value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
    `iter_dimension_records` methods can be used to instead iterate over
    various butler primitives derived from these rows.

    Iterating over a `Query` may or may not execute database queries again each
    time, depending on the state of its relation tree; see `Query.run` for
    details.

    Query is immutable; all methods that might appear to modify it in place
    actually return a new object (though many attributes will be shared).

    Query is currently (still) an internal-to-Registry object, with only the
    "QueryResults" classes that are backed by it directly exposed to users. It
    has been designed with the intent that it will eventually play a larger
    role, either as the main query result object in a redesigned query
    interface, or as a "power user" result option that accompanies simpler
    replacements for the current "QueryResults" objects.
    """

    def __init__(
        self,
        dimensions: DimensionGraph,
        backend: QueryBackend[QueryContext],
        context: QueryContext,
        relation: Relation,
        governor_constraints: Mapping[str, Set[str]],
        is_deferred: bool,
        has_record_columns: bool | DimensionElement,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ):
        self._dimensions = dimensions
        self._backend = backend
        self._context = context
        self._relation = relation
        self._governor_constraints = governor_constraints
        self._is_deferred = is_deferred
        self._has_record_columns = has_record_columns
        self._record_caches = record_caches if record_caches is not None else {}

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions that span the query and are used to join its
        relations together (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def relation(self) -> Relation:
        """The relation tree representation of the query as a series of
        operations on tables (`Relation`).
        """
        return self._relation

    @property
    def has_record_columns(self) -> bool | DimensionElement:
        """Whether this query's relation already includes columns for all or
        some dimension element records (`bool` or `DimensionElement`).
        """
        return self._has_record_columns

    @property
    def backend(self) -> QueryBackend[QueryContext]:
        """Backend object used to create the query and new ones derived from it
        (`QueryBackend`).
        """
        return self._backend

    @contextmanager
    def open_context(self) -> Iterator[None]:
        """Return a context manager that ensures a database connection is
        established and temporary tables and cursors have a defined lifetime.

        Returns
        -------
        context : `contextlib.AbstractContextManager`
            Context manager with no return value.
        """
        if self._context.is_open:
            yield
        else:
            with self._context:
                yield

    def __str__(self) -> str:
        return str(self._relation)

    def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
        return iter(self._context.fetch_iterable(self._relation))
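
    # Example (illustrative sketch; how ``query`` is obtained is assumed, not
    # shown here): each row is a mapping keyed by `ColumnTag` instances such
    # as `DimensionKeyColumnTag`.
    #
    #     with query.open_context():
    #         for row in query:
    #             visit = row[DimensionKeyColumnTag("visit")]
    #             detector = row[DimensionKeyColumnTag("detector")]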

    def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
        """Return an iterator that converts result rows to data IDs.

        Parameters
        ----------
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
            Iterator that yields data IDs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        return (reader.read(row) for row in self)
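
    # Example (illustrative sketch; assumes the query spans the visit and
    # detector dimensions): convert result rows to `DataCoordinate` objects
    # instead of raw column mappings.
    #
    #     for data_id in query.iter_data_ids():
    #         print(data_id["visit"], data_id["detector"])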

    def iter_dataset_refs(
        self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
    ) -> Iterator[DatasetRef]:
        """Return an iterator that converts result rows to dataset references.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        components : `~collections.abc.Sequence` [ `None` or `str` ]
            Which component dataset types to construct refs for from each row
            representing a parent; `None` for the parent itself.

        Returns
        -------
        refs : `~collections.abc.Iterator` [ `DatasetRef` ]
            Iterator that yields (resolved) dataset references.
        """
        reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            parent_ref = reader.read(row)
            for component in components:
                if component is None:
                    yield parent_ref
                else:
                    yield parent_ref.makeComponentRef(component)
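
    # Example (illustrative sketch; ``raw_type`` and the collection name are
    # placeholders): add a dataset search first, then read back refs.
    #
    #     query = query.find_datasets(raw_type, collections="some_run")
    #     for ref in query.iter_dataset_refs(raw_type):
    #         print(ref.dataId, ref.run)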

    def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
        """Return an iterator that converts result rows to dimension records.

        Parameters
        ----------
        element : `DimensionElement`, optional
            Dimension element whose records will be returned. If not provided,
            `has_record_columns` must be a `DimensionElement` instance.

        Returns
        -------
        records : `~collections.abc.Iterator` [ `DimensionRecord` ]
            Iterator that yields dimension records.
        """
        if element is None:
            match self._has_record_columns:
                case True | False:
                    raise ValueError("No default dimension element in query; 'element' must be given.")
                case only_element_with_records:
                    element = only_element_with_records
        if (cache := self._record_caches.get(element)) is not None:
            return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
        else:
            reader = DimensionRecordReader(element)
            if not (reader.columns_required <= self.relation.columns):
                raise ColumnError(
                    f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                    f"for records of element {element.name}."
                )
            return (reader.read(row) for row in self)
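
    # Example (illustrative sketch): fetch detector records, assuming the
    # record columns (or a record cache) are made available first via
    # `with_record_columns`.
    #
    #     detector = query.dimensions.universe["detector"]
    #     for record in query.with_record_columns(detector).iter_dimension_records(detector):
    #         print(record.dataId)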

    def run(self) -> Query:
        """Execute the query and hold its results in memory.

        Returns
        -------
        executed : `Query`
            New query that holds the query results.

        Notes
        -----
        Iterating over the results of a query that has been `run` will always
        iterate over an existing container, while iterating over a query that
        has not been run will result in executing at least some of the query
        each time.

        Running a query also sets its `is_deferred` flag to `False`, which will
        cause new queries constructed by its methods to be run immediately,
        unless ``defer=True`` is passed to the factory method. After a query
        has been run, factory methods will also tend to prefer to apply new
        operations (e.g. `with_only_columns`, `sliced`, `sorted`) via Python
        code acting on the existing container rather than going back to SQL,
        which can be less efficient overall than applying operations to a
        deferred query and executing them all only at the end.

        Running a query is represented in terms of relations by adding a
        `~lsst.daf.relation.Materialization` marker relation in the iteration
        engine and then processing the relation tree; this attaches the
        container of rows to that new relation to short-circuit any future
        processing of the tree and lock changes to the tree upstream of it.
        This is very different from the SQL-engine
        `~lsst.daf.relation.Materialization` added to the tree by the
        `materialize` method from a user perspective, though it has a similar
        representation in the relation tree.
        """
        relation = (
            # Make a new relation that definitely ends in the iteration engine
            # (this does nothing if it already does).
            self.relation.transferred_to(self._context.iteration_engine)
            # Make the new relation save its rows to an in-memory Python
            # collection in relation.payload when processed.
            .materialized(name_prefix="run")
        )
        # Actually process the relation, simplifying out trivial relations,
        # executing any SQL queries, and saving results to relation.payload.
        # We discard the simplified relation that's returned, because we want
        # the new query to have any extra diagnostic information contained in
        # the original.
        self._context.process(relation)
        return self._copy(relation, False)
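
    # Example (illustrative sketch): execute once and reuse the cached rows.
    #
    #     query = query.run()
    #     n_rows = query.count()  # cheap once the rows are held in memory
    #     data_ids = list(query.iter_data_ids())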

    def materialized(self, defer_postprocessing: bool = True) -> Query:
        """Materialize the results of this query in its context's preferred
        engine.

        Usually this means inserting the results into a temporary table in a
        database.

        Parameters
        ----------
        defer_postprocessing : `bool`, optional
            If `True`, do not execute operations that occur in the context's
            `QueryContext.iteration_engine` up front; instead insert and
            execute a materialization upstream of them (e.g. via a SQL
            ``INSERT INTO ... SELECT`` statement, with no fetching to the
            client) and execute the postprocessing operations when iterating
            over the query results. If `False`, and iteration-engine
            postprocessing operations exist, run the full query, execute them
            now, and upload the results.
            If the relation is already in the preferred engine, this option
            is ignored and the materialization will not involve fetching rows
            to the iteration engine at all. If the relation has already been
            materialized in the iteration engine (i.e. via `run`), then this
            option is again ignored and an upload of the existing rows will
            be performed.

        Returns
        -------
        materialized : `Query`
            Modified query with the same row-and-column content, with a
            materialization in ``self.context.preferred_engine``.
        """
        if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
            relation, stripped = self._context.strip_postprocessing(self._relation)
            if relation.engine == self._context.preferred_engine:
                # We got all the way to the engine we want to materialize in.
                # Apply that operation to the tree, process it (which actually
                # creates a temporary table and populates it), and then reapply
                # the stripped operations.
                relation = relation.materialized()
                self._context.process(relation)
                for operation in stripped:
                    relation = operation.apply(
                        relation, transfer=True, preferred_engine=self._context.iteration_engine
                    )
                return self._copy(relation, True)
        # Either defer_postprocessing=False, or attempting to strip off unary
        # operations until we got to the preferred engine didn't work, because
        # this tree doesn't actually involve the preferred engine. So we just
        # transfer to the preferred engine first, and then materialize,
        # process, and return.
        relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
        self._context.process(relation)
        return self._copy(relation, True)
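
    # Example (illustrative sketch; dataset type and collection names are
    # placeholders): materialize into the preferred engine (usually a
    # temporary database table) so later operations build on it.
    #
    #     with query.open_context():
    #         query = query.materialized()
    #         query = query.find_datasets(flat_type, collections="some_run", find_first=True)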

    def projected(
        self,
        dimensions: Iterable[Dimension | str] | None = None,
        unique: bool = True,
        columns: Iterable[ColumnTag] | None = None,
        defer: bool | None = None,
        drop_postprocessing: bool = False,
        keep_record_columns: bool = True,
    ) -> Query:
        """Return a modified `Query` with a subset of this one's columns.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ], optional
            Dimensions to include in the new query. Will be expanded to
            include all required and implied dependencies. Must be a subset of
            ``self.dimensions``. If not provided, ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (default) deduplicate rows after dropping columns.
        columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
            Additional dataset or dimension record columns to include in the
            query. Dimension key columns added here are ignored unless they
            extend beyond the key columns implied by the ``dimensions``
            argument (which is an error).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").
        drop_postprocessing : `bool`, optional
            Drop any iteration-engine operations that depend on columns that
            are being removed (e.g. region-overlap tests when region columns
            are being dropped), making it more likely that projection and
            deduplication could be performed in the preferred engine, where
            they may be more efficient.
        keep_record_columns : `bool`, optional
            If `True` (default) and this query `has_record_columns`, implicitly
            add any of those to ``columns`` whose dimension element is in the
            given ``dimensions``.

        Returns
        -------
        query : `Query`
            New query with the requested columns only, optionally deduplicated.

        Notes
        -----
        Dataset columns are dropped from the new query unless passed via the
        ``columns`` argument. All other columns are by default preserved.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if the columns to include in the new query are not all
            present in the current query.
        """
        if dimensions is None:
            dimensions = set(self._dimensions)
        else:
            dimensions = set(dimensions)
            if columns is not None:
                dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
        dimensions = self._dimensions.universe.extract(dimensions)
        if columns is None:
            columns = set()
        else:
            columns = set(columns)
        columns.update(DimensionKeyColumnTag.generate(dimensions.names))
        if keep_record_columns:
            if self._has_record_columns is True:
                for element in dimensions.elements:
                    if element not in self._record_caches:
                        columns.update(element.RecordClass.fields.columns)
            elif self._has_record_columns in dimensions.elements:
                element = cast(DimensionElement, self._has_record_columns)
                columns.update(element.RecordClass.fields.columns)
        if drop_postprocessing:
            relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
            # Dropping postprocessing Calculations could cause other columns
            # we had otherwise intended to keep to be dropped as well.
            columns &= relation.columns
        else:
            relation = self._relation
        relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
        if unique:
            relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer, dimensions=dimensions)
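
    # Example (illustrative sketch): project a visit+detector query down to
    # unique visit data IDs.
    #
    #     visit_only = query.projected(["visit"], unique=True)
    #     visits = set(visit_only.iter_data_ids())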

    def with_record_columns(
        self, dimension_element: DimensionElement | None = None, defer: bool | None = None
    ) -> Query:
        """Return a modified `Query` with additional dimension record columns
        and/or caches.

        Parameters
        ----------
        dimension_element : `DimensionElement`, optional
            Single element to add record columns for, or `None` (default) to
            add them for all elements in `dimensions`.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested record columns either in the relation
            or (when possible) available via record caching.

        Notes
        -----
        Adding dimension record columns is fundamentally different from adding
        new dimension key columns or dataset columns, because it is purely an
        addition of columns, not rows - we can always join in a dimension
        element table (if it has not already been included) on keys already
        present in the current relation, confident that there is exactly one
        row in the dimension element table for each row in the current
        relation.
        """
        if self._has_record_columns is True or self._has_record_columns == dimension_element:
            return self
        record_caches = dict(self._record_caches)
        columns_required: set[ColumnTag] = set()
        for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
            if element in record_caches:
                continue
            if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
                record_caches[element] = cache
            else:
                columns_required.update(element.RecordClass.fields.columns.keys())
        # Modify the relation we have to remove any projections that dropped
        # columns we now want, as long as the relation's behavior is otherwise
        # unchanged.
        columns_required -= self._relation.columns
        relation, columns_found = self._context.restore_columns(self._relation, columns_required)
        columns_required.difference_update(columns_found)
        if columns_required:
            relation = self._backend.make_dimension_relation(
                self._dimensions,
                columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this operation
                # does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
            )
        return self._chain(
            relation,
            defer=defer,
            has_record_columns=True if dimension_element is None else dimension_element,
            record_caches=record_caches,
        )
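
    # Example (illustrative sketch): make the data IDs "expanded", i.e. able
    # to carry their dimension records.
    #
    #     query = query.with_record_columns()
    #     for data_id in query.iter_data_ids():
    #         assert data_id.hasRecords()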

    def find_datasets(
        self,
        dataset_type: DatasetType,
        collections: Any,
        *,
        find_first: bool = True,
        columns: Set[str] = frozenset(("dataset_id", "run")),
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that includes a search for datasets of the
        given type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to search for. May not be a component.
        collections
            Collection search path or pattern. Must be a single collection
            name or an ordered sequence if ``find_first=True``. See
            :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default) search collections in order until the first
            match for each data ID is found. If `False`, return all matches in
            all collections.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include in the new query. Options include

            - ``dataset_id``: the unique identifier of the dataset. The type
              is implementation-dependent. Never nullable. Included by
              default.

            - ``ingest_date``: the date and time the dataset was added to the
              data repository.

            - ``run``: the foreign key column to the `~CollectionType.RUN`
              collection holding the dataset (not necessarily the collection
              name). The type is dependent on the collection manager
              implementation. Included by default.

            - ``collection``: the foreign key column to the collection in
              which the dataset was actually found by this search. The type is
              dependent on the collection manager implementation. This may
              differ from ``run`` if the dataset is present in a matching
              `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
              collection, which means the same dataset may also appear multiple
              times in the query results.

            - ``timespan``: the validity range for datasets found in a
              `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
              collection types.

            The default columns (``dataset_id`` and ``run``) are sufficient to
            enable `iter_dataset_refs`, which also takes care of translating
            the internal ``RUN`` collection key into its public name.

            Setting this to an empty set while passing ``find_first=False``
            will return a query that is constrained by dataset existence in
            some matching collection but does not actually return which
            datasets existed.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested dataset columns, constrained by the
            existence of datasets of this type in the given collection.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if a dataset search is already present in this query and
            this is a find-first search.
        ValueError
            Raised if the given dataset type's dimensions are not a subset of
            the current query's dimensions.
        """
        if find_first and DatasetColumnTag.filter_from(self._relation.columns):
            raise ColumnError(
                "Cannot search for datasets with find_first=True "
                "on a query that already includes dataset columns."
            )
        #
        # TODO: it'd be nice to do a QueryContext.restore_columns call here or
        # similar, to look for dataset-constraint joins already present in the
        # relation and expand them to include dataset-result columns as well,
        # instead of doing a possibly-redundant join here. But that would
        # require pushing relation usage down further into
        # DatasetStorageManager.make_relation, so that it doesn't need to be
        # given the columns, and then giving the relation system the ability to
        # simplify away redundant joins when they only provide columns that
        # aren't ultimately used. The right time to look into that is probably
        # when investigating whether the base QueryBackend should be
        # responsible for producing an "abstract" relation tree of some sort,
        # with the subclasses only responsible for filling it in with payloads
        # (and possibly replacing some leaves with new sub-trees) when
        # "processed" (or in some other "prepare" step).
        #
        # This is a low priority for three reasons:
        # - there's some chance the database's query optimizer will simplify
        #   away these redundant joins;
        # - at present, the main use of this code path is in QG generation,
        #   where we materialize the initial data ID query into a temp table
        #   and hence can't go back and "recover" those dataset columns anyway;
        #
        if not (dataset_type.dimensions <= self._dimensions):
            raise ValueError(
                "Cannot find datasets from a query unless the dataset type's dimensions "
                f"({dataset_type.dimensions}, for {dataset_type.name}) are a subset of the query's "
                f"({self._dimensions})."
            )
        columns = set(columns)
        columns.add("dataset_id")
        collections = CollectionWildcard.from_expression(collections)
        if find_first:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            dataset_type,
            collections,
            governor_constraints=self._governor_constraints,
            allow_calibration_collections=False,  # TODO
            rejections=rejections,
        )
        if not collection_records:
            relation = self._relation.join(
                self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
            )
        elif find_first:
            relation = self._backend.make_dataset_search_relation(
                dataset_type, collection_records, columns, self._context, join_to=self._relation
            )
        else:
            dataset_relation = self._backend.make_dataset_query_relation(
                dataset_type, collection_records, columns, self._context
            )
            relation = self.relation.join(dataset_relation)
        return self._chain(relation, defer=defer)
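
    # Example (illustrative sketch; the dataset type and collection names are
    # placeholders): constrain the query by, and return, matching datasets.
    #
    #     query = query.find_datasets(calexp_type, ["run/a", "run/b"], find_first=True)
    #     refs = list(query.iter_dataset_refs(calexp_type))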

    def sliced(
        self,
        start: int = 0,
        stop: int | None = None,
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that takes a slice of this one's rows.

        Parameters
        ----------
        start : `int`, optional
            First index to include, inclusive.
        stop : `int` or `None`, optional
            One past the last index to include (i.e. exclusive).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested slice.

        Notes
        -----
        This operation must be implemented in the iteration engine if there are
        postprocessing operations, which may be much less efficient than
        performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
        in SQL).

        Since query row order is usually arbitrary, it usually makes sense to
        call `sorted` before calling `sliced` to make the results
        deterministic. This is not checked because there are some contexts
        where getting an arbitrary subset of the results of a given size
        still makes sense.
        """
        return self._chain(self._relation[start:stop], defer)
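
    # Example (illustrative sketch): take an arbitrary subset of ten rows;
    # combine with `sorted` (below) when a deterministic subset is needed.
    #
    #     first_ten = query.sliced(0, 10)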

    def sorted(
        self,
        order_by: Iterable[SortTerm],
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that sorts this one's rows.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `SortTerm` ]
            Expressions to sort by.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested sorting.

        Notes
        -----
        The ``order_by`` expression can include references to dimension record
        columns that were not present in the original relation; this is
        similar to calling `with_record_columns` for those columns first (but
        in this case column requests cannot be satisfied by record caches).
        All other columns referenced must be present in the query already.
        """
        op = Sort(tuple(order_by))
        columns_required = set(op.columns_required)
        columns_required.difference_update(self._relation.columns)
        if columns_required:
            relation, columns_found = self._context.restore_columns(self._relation, columns_required)
            columns_required.difference_update(columns_found)
            if columns_required:
                try:
                    relation = self._backend.make_dimension_relation(
                        self._dimensions,
                        columns_required,
                        self._context,
                        initial_relation=relation,
                        # Don't permit joins to use any columns beyond those in
                        # the original relation, as that would change what this
                        # operation does.
                        initial_join_max_columns=frozenset(self._relation.columns),
                        governor_constraints=self._governor_constraints,
                    )
                except ColumnError as err:
                    raise ColumnError(
                        "Cannot sort by columns that were not included in the original query or "
                        "fully resolved by its dimensions."
                    ) from err
        else:
            relation = self._relation
        relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer)
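
    # Example (illustrative sketch; the `ColumnExpression` usage is an
    # assumption about the lsst.daf.relation API): sort by visit, then slice.
    #
    #     from lsst.daf.relation import ColumnExpression
    #
    #     term = SortTerm(ColumnExpression.reference(DimensionKeyColumnTag("visit")))
    #     query = query.sorted([term]).sliced(0, 100)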

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in this query.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the query, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        if self._relation.min_rows == self._relation.max_rows:
            return self._relation.max_rows
        return self._context.count(self._relation, exact=exact, discard=discard)
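
    # Example (illustrative sketch): a fast upper bound versus an exact count.
    #
    #     upper_bound = query.count(exact=False)
    #     exact_count = query.count(exact=True, discard=True)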

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this query has any result rows at all.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the query has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        TypeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        if self._relation.min_rows > 0:
            return True
        if self._relation.max_rows == 0:
            return False
        if execute:
            return self._context.any(self._relation, execute=execute, exact=exact)
        elif not exact:
            return True
        raise TypeError("Cannot obtain exact results without executing the query.")
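
    # Example (illustrative sketch): cheap existence check before doing more
    # expensive work.
    #
    #     if query.any(execute=True, exact=False):
    #         rows = list(query)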

    def explain_no_results(self, execute: bool = True) -> list[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the query to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `list` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        # First try without actually executing any queries.
        diagnostics = Diagnostics.run(self._relation)
        if diagnostics.is_doomed:
            return diagnostics.messages
        if execute:
            # Try again, running LIMIT 1 queries as we walk back down the tree
            # to look for relations with no rows:
            diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
            if diagnostics.is_doomed:
                return diagnostics.messages
        return []
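
    # Example (illustrative sketch): report why a query came back empty.
    #
    #     if not query.any():
    #         for message in query.explain_no_results():
    #             print(message)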

    def _copy(
        self,
        relation: Relation,
        is_deferred: bool,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified copy of this query with some attributes replaced.

        See class docs for parameter documentation; the only difference here
        is that the defaults are the values ``self`` was constructed with.
        """
        return Query(
            dimensions=self._dimensions if dimensions is None else dimensions,
            backend=self._backend,
            context=self._context,
            relation=relation,
            governor_constraints=(
                governor_constraints if governor_constraints is not None else self._governor_constraints
            ),
            is_deferred=is_deferred,
            has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
            record_caches=self._record_caches if record_caches is None else record_caches,
        )

    def _chain(
        self,
        relation: Relation,
        defer: bool | None,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified query with a new relation while handling the
        ubiquitous ``defer`` parameter's logic.

        Parameters
        ----------
        relation : `Relation`
            Relation for the new query.
        defer : `bool`
            If `False`, run the new query immediately. If `True`, do not. If
            `None`, the ``defer`` option passed when making ``self`` is used
            (this option is "sticky").
        dimensions : `DimensionGraph`, optional
            See class docs.
        governor_constraints : `Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ], optional
            See class docs.
        has_record_columns : `bool` or `DimensionElement`, optional
            See class docs.
        record_caches : `Mapping` [ `DimensionElement`, `Mapping` [ `DataCoordinate`, `DimensionRecord` ] ], optional
            See class docs.

        Returns
        -------
        chained : `Query`
            Modified query, or ``self`` if no modifications were actually
            requested.
        """
        if defer is None:
            defer = self._is_deferred
        if (
            relation is self._relation
            and dimensions is None
            and defer == self._is_deferred
            and record_caches is None
            and has_record_columns is None
            and governor_constraints is None
        ):
            return self
        result = self._copy(
            relation,
            is_deferred=True,
            governor_constraints=governor_constraints,
            dimensions=dimensions,
            has_record_columns=has_record_columns,
            record_caches=record_caches,
        )
        if not defer:
            result = result.run()
        return result