Coverage for python/lsst/daf/butler/registry/queries/_query.py: 14%

258 statements  

coverage.py v7.4.4, created at 2024-04-19 10:53 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = () 

30 

31import itertools 

32from collections.abc import Iterable, Iterator, Mapping, Sequence, Set 

33from contextlib import contextmanager 

34from typing import Any, cast, final 

35 

36from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm 

37 

38from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag 

39from ..._dataset_ref import DatasetRef 

40from ..._dataset_type import DatasetType 

41from ...dimensions import ( 

42 DataCoordinate, 

43 DimensionElement, 

44 DimensionGroup, 

45 DimensionRecord, 

46 DimensionRecordSet, 

47) 

48from .._collection_type import CollectionType 

49from ..wildcards import CollectionWildcard 

50from ._query_backend import QueryBackend 

51from ._query_context import QueryContext 

52from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader 

53 

54 

55@final 

56class Query: 

57 """A general-purpose representation of a registry query. 

58 

59 Parameters 

60 ---------- 

61 dimensions : `DimensionGroup` 

62 The dimensions that span the query and are used to join its relations 

63 together. 

64 backend : `QueryBackend` 

65 Backend object used to create the query and new ones derived from it. 

66 context : `QueryContext` 

67 Context manager that holds relation engines and database connections 

68 for the query. 

69 relation : `Relation` 

70 The relation tree representation of the query as a series of operations 

71 on tables. 

72 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

73 `~collections.abc.Set` [ `str` ] ] 

74 Constraints on governor dimensions encoded in this query's relation. 

75 This is a mapping from governor dimension name to sets of values that 

76 dimension may take. 

77 is_deferred : `bool` 

78 If `True`, modifier methods that return a related `Query` object should 

79 not immediately execute the new query. 

80 has_record_columns : `bool` or `DimensionElement` 

81 Whether this query's relation already includes columns for all or some 

82 dimension element records: `True` means all elements in ``dimensions`` 

83 either have records present in ``record_caches`` or all columns present 

84 in ``relation``, while a specific `DimensionElement` means that element 

85 does. 

86 record_caches : `~collections.abc.Mapping` [ `str`, \ 

87 `DimensionRecordSet` ], optional 

88 Cached dimension record values. 

89 

90 Notes 

91 ----- 

92 Iterating over a `Query` yields mappings from `ColumnTag` to the associated 

93 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and 

94 `iter_dimension_records` methods can be used to instead iterate over 

95 various butler primitives derived from these rows. 

96 

97 Iterating over a `Query` may or may not execute database queries again each 

98 time, depending on the state of its relation tree - see `Query.run` for 

99 details. 

100 

101 Query is immutable; all methods that might appear to modify it in place 

102 actually return a new object (though many attributes will be shared). 

103 

104 Query is currently (still) an internal-to-Registry object, with only the 

105 "QueryResults" classes that are backed by it directly exposed to users. It 

106 has been designed with the intent that it will eventually play a larger 

107 role, either as the main query result object in a redesigned query 

108 interface, or a "power user" result option that accompanies simpler 

109 replacements for the current "QueryResults" objects. 

110 """ 

111 
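# --- Illustrative sketch (added for clarity; not part of the original module):
# how the row-level iteration described in the Notes above is typically
# consumed. ``query`` is an assumed, already-constructed Query, and the "visit"
# dimension name is hypothetical.
#
#     with query.open_context():
#         for row in query:  # each row maps ColumnTag -> value
#             visit_id = row[DimensionKeyColumnTag("visit")]
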

112 def __init__( 

113 self, 

114 dimensions: DimensionGroup, 

115 backend: QueryBackend[QueryContext], 

116 context: QueryContext, 

117 relation: Relation, 

118 governor_constraints: Mapping[str, Set[str]], 

119 is_deferred: bool, 

120 has_record_columns: bool | DimensionElement, 

121 record_caches: Mapping[str, DimensionRecordSet] | None = None, 

122 ): 

123 self._dimensions = dimensions 

124 self._backend = backend 

125 self._context = context 

126 self._relation = relation 

127 self._governor_constraints = governor_constraints 

128 self._is_deferred = is_deferred 

129 self._has_record_columns = has_record_columns 

130 self._record_caches = record_caches if record_caches is not None else {} 

131 

132 @property 

133 def dimensions(self) -> DimensionGroup: 

134 """The dimensions that span the query and are used to join its 

135 relations together (`DimensionGroup`). 

136 """ 

137 return self._dimensions 

138 

139 @property 

140 def relation(self) -> Relation: 

141 """The relation tree representation of the query as a series of 

142 operations on tables (`Relation`). 

143 """ 

144 return self._relation 

145 

146 @property 

147 def has_record_columns(self) -> bool | DimensionElement: 

148 """Whether this query's relation already includes columns for all or 

149 some dimension element records (`bool` or `DimensionElement`). 

150 """ 

151 return self._has_record_columns 

152 

153 @property 

154 def backend(self) -> QueryBackend[QueryContext]: 

155 """Backend object used to create the query and new ones derived from it 

156 (`QueryBackend`). 

157 """ 

158 return self._backend 

159 

160 @contextmanager 

161 def open_context(self) -> Iterator[None]: 

162 """Return a context manager that ensures a database connection is 

163 established, temporary tables and cursors have a defined lifetime, 

164 and client-side caching is turned on. 

165 

166 Returns 

167 ------- 

168 context : `contextlib.AbstractContextManager` 

169 Context manager with no return value. 

170 """ 

171 with self._backend.caching_context(): 

172 if self._context.is_open: 

173 yield 

174 else: 

175 with self._context: 

176 yield 

177 
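# --- Illustrative sketch (added; not in the original source): open_context is
# useful when several result iterations should share one database connection
# and caching context. ``query`` and ``some_element`` are assumptions here.
#
#     with query.open_context():
#         ids = list(query.iter_data_ids())
#         records = list(query.iter_dimension_records(some_element))
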

178 def __str__(self) -> str: 

179 return str(self._relation) 

180 

181 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]: 

182 return iter(self._context.fetch_iterable(self._relation)) 

183 

184 def iter_data_ids(self, dimensions: DimensionGroup | None = None) -> Iterator[DataCoordinate]: 

185 """Return an iterator that converts result rows to data IDs. 

186 

187 Parameters 

188 ---------- 

189 dimensions : `DimensionGroup`, optional 

190 Dimensions of the data IDs to return. If not provided, 

191 ``self.dimensions`` is used. 

192 

193 Returns 

194 ------- 

195 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ] 

196 Iterator that yields data IDs. 

197 """ 

198 if dimensions is None: 

199 dimensions = self._dimensions 

200 reader = DataCoordinateReader.make( 

201 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

202 ) 

203 if not (reader.columns_required <= self.relation.columns): 

204 raise ColumnError( 

205 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

206 f"for data IDs with dimensions {dimensions}." 

207 ) 

208 with self.backend.caching_context(): 

209 for row in self: 

210 yield reader.read(row) 

211 
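# --- Illustrative sketch (added): iter_data_ids can return data IDs for a
# subset of the query's dimensions. The dimension names are hypothetical.
#
#     subset = query.dimensions.universe.conform(["instrument", "visit"])
#     for data_id in query.iter_data_ids(subset):
#         ...  # DataCoordinate with just the requested dimensions
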

212 def iter_dataset_refs( 

213 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,) 

214 ) -> Iterator[DatasetRef]: 

215 """Return an iterator that converts result rows to dataset references. 

216 

217 Parameters 

218 ---------- 

219 dataset_type : `DatasetType` 

220 The parent dataset type to yield references for. 

221 components : `~collections.abc.Sequence` [ `None` or `str` ] 

222 Which component dataset types to construct refs for from each row 

223 representing a parent; `None` for the parent itself. 

224 

225 Returns 

226 ------- 

227 refs : `~collections.abc.Iterator` [ `DatasetRef` ] 

228 Iterator that yields (resolved) dataset references. 

229 """ 

230 reader = DatasetRefReader( 

231 dataset_type, 

232 translate_collection=self._backend.get_collection_name, 

233 records=self._has_record_columns is True, 

234 record_caches=self._record_caches, 

235 ) 

236 if not (reader.columns_required <= self.relation.columns): 

237 raise ColumnError( 

238 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

239 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

240 ) 

241 with self.backend.caching_context(): 

242 for row in self: 

243 parent_ref = reader.read(row) 

244 for component in components: 

245 if component is None: 

246 yield parent_ref 

247 else: 

248 yield parent_ref.makeComponentRef(component) 

249 
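# --- Illustrative sketch (added): yielding a parent ref and one component ref
# per result row. ``flat_dataset_type`` and the "wcs" component name are
# hypothetical and used only for illustration.
#
#     for ref in query.iter_dataset_refs(flat_dataset_type, components=(None, "wcs")):
#         ...  # parent DatasetRef, then its "wcs" component ref
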

250 def iter_data_ids_and_dataset_refs( 

251 self, dataset_type: DatasetType, dimensions: DimensionGroup | None = None 

252 ) -> Iterator[tuple[DataCoordinate, DatasetRef]]: 

253 """Iterate over pairs of data IDs and dataset refs. 

254 

255 This permits the data ID dimensions to differ from the dataset 

256 dimensions. 

257 

258 Parameters 

259 ---------- 

260 dataset_type : `DatasetType` 

261 The parent dataset type to yield references for. 

262 dimensions : `DimensionGroup`, optional 

263 Dimensions of the data IDs to return. If not provided, 

264 ``self.dimensions`` is used. 

265 

266 Returns 

267 ------- 

268 pairs : `~collections.abc.Iterator` [ `tuple` [ `DataCoordinate`, 

269 `DatasetRef` ] ] 

270 An iterator over (data ID, dataset reference) pairs. 

271 """ 

272 if dimensions is None: 

273 dimensions = self._dimensions 

274 data_id_reader = DataCoordinateReader.make( 

275 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

276 ) 

277 dataset_reader = DatasetRefReader( 

278 dataset_type, 

279 translate_collection=self._backend.get_collection_name, 

280 records=self._has_record_columns is True, 

281 record_caches=self._record_caches, 

282 ) 

283 if not (data_id_reader.columns_required <= self.relation.columns): 

284 raise ColumnError( 

285 f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} " 

286 f"for data IDs with dimensions {dimensions}." 

287 ) 

288 if not (dataset_reader.columns_required <= self.relation.columns): 

289 raise ColumnError( 

290 f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} " 

291 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

292 ) 

293 with self.backend.caching_context(): 

294 for row in self: 

295 yield (data_id_reader.read(row), dataset_reader.read(row)) 

296 

297 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]: 

298 """Return an iterator that converts result rows to dimension records. 

299 

300 Parameters 

301 ---------- 

302 element : `DimensionElement`, optional 

303 Dimension element whose records will be returned. If not provided, 

304 `has_record_columns` must be a `DimensionElement` instance. 

305 

306 Returns 

307 ------- 

308 records : `~collections.abc.Iterator` [ `DimensionRecord` ] 

309 Iterator that yields dimension records. 

310 """ 

311 if element is None: 

312 match self._has_record_columns: 

313 case True | False: 

314 raise ValueError("No default dimension element in query; 'element' must be given.") 

315 case only_element_with_records: 

316 element = only_element_with_records 

317 if (cache := self._record_caches.get(element.name)) is not None: 

318 for data_id in self.iter_data_ids(element.minimal_group): 

319 yield cache.find(data_id) 

320 else: 

321 reader = DimensionRecordReader(element) 

322 if not (reader.columns_required <= self.relation.columns): 

323 raise ColumnError( 

324 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

325 f"for records of element {element.name}." 

326 ) 

327 with self._backend.caching_context(): 

328 for row in self: 

329 yield reader.read(row) 

330 
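# --- Illustrative sketch (added): reading records for a single element. The
# "visit" element name is hypothetical; the element object is looked up from
# the query's dimension universe, as the method itself does.
#
#     visit_element = query.dimensions.universe["visit"]
#     for record in query.iter_dimension_records(visit_element):
#         ...  # DimensionRecord instances, served from the cache when available
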

331 def run(self) -> Query: 

332 """Execute the query and hold its results in memory. 

333 

334 Returns 

335 ------- 

336 executed : `Query` 

337 New query that holds the query results. 

338 

339 Notes 

340 ----- 

341 Iterating over the results of a query that has been `run` will always 

342 iterate over an existing container, while iterating over a query that 

343 has not been run will result in executing at least some of the query 

344 each time. 

345 

346 Running a query also sets its `is_deferred` flag to `False`, which will 

347 cause new queries constructed by its methods to be run immediately, 

348 unless ``defer=True`` is passed to the factory method. After a query 

349 has been run, factory methods will also tend to prefer to apply new 

350 operations (e.g. `with_only_column`, `sliced`, `sorted`) via Python 

351 code acting on the existing container rather than going back to SQL, 

352 which can be less efficient overall than applying operations to a 

353 deferred query and executing them all only at the end. 

354 

355 Running a query is represented in terms of relations by adding a 

356 `~lsst.daf.relation.Materialization` marker relation in the iteration 

357 engine and then processing the relation tree; this attaches the 

358 container of rows to that new relation to short-circuit any future 

359 processing of the tree and lock changes to the tree upstream of it. 

360 This is very different from the SQL-engine 

361 `~lsst.daf.relation.Materialization` added to the tree by the 

362 `materialize` method from a user perspective, though it has a similar 

363 representation in the relation tree. 

364 """ 

365 relation = ( 

366 # Make a new relation that definitely ends in the iteration engine 

367 # (this does nothing if it already does). 

368 self.relation.transferred_to(self._context.iteration_engine) 

369 # Make the new relation save its rows to an in-memory Python 

370 # collection in relation.payload when processed. 

371 .materialized(name_prefix="run") 

372 ) 

373 # Actually process the relation, simplifying out trivial relations, 

374 # executing any SQL queries, and saving results to relation.payload. 

375 # We discard the simplified relation that's returned, because we want 

376 # the new query to have any extra diagnostic information contained in 

377 # the original. 

378 self._context.process(relation) 

379 return self._copy(relation, False) 

380 
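# --- Illustrative sketch (added): the difference between a deferred query and
# one that has been run, per the Notes above. ``query`` is assumed deferred.
#
#     executed = query.run()   # rows now held in an in-memory container
#     first = list(executed)   # iterates the container, no new SQL
#     second = list(executed)  # same rows again, still no new SQL
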

381 def materialized(self, defer_postprocessing: bool = True) -> Query: 

382 """Materialize the results of this query in its context's preferred 

383 engine. 

384 

385 Usually this means inserting the results into a temporary table in a 

386 database. 

387 

388 Parameters 

389 ---------- 

390 defer_postprocessing : `bool`, optional 

391 If `True`, do not execute operations that occur in the context's 

392 `QueryContext.iteration_engine` up front; instead insert and 

393 execute a materialization upstream of them (e.g. via a SQL 

394 ``INSERT INTO ... SELECT`` statement, with no fetching to the 

395 client) and execute the postprocessing operations when iterating 

396 over the query results. If `False`, and iteration-engine 

397 postprocessing operations exist, run the full query, execute them 

398 now, and upload the results. 

399 If the relation is already in the preferred engine, this option 

400 is ignored and the materialization will not involve fetching rows 

401 to the iteration engine at all. If the relation has already been 

402 materialized in the iteration engine (i.e. via `run`), then this 

403 option is again ignored and an upload of the existing rows will 

404 be performed. 

405 

406 Returns 

407 ------- 

408 materialized : `Query` 

409 Modified query with the same row-and-column content with a 

410 materialization in ``self.context.preferred_engine``. 

411 """ 

412 if defer_postprocessing or self.relation.engine == self._context.preferred_engine: 

413 relation, stripped = self._context.strip_postprocessing(self._relation) 

414 if relation.engine == self._context.preferred_engine: 

415 # We got all the way to the engine we want to materialize in. 

416 # Apply that operation to the tree, process it (which actually 

417 # creates a temporary table and populates it), and then reapply 

418 # the stripped operations. 

419 relation = relation.materialized() 

420 self._context.process(relation) 

421 for operation in stripped: 

422 relation = operation.apply( 

423 relation, transfer=True, preferred_engine=self._context.iteration_engine 

424 ) 

425 return self._copy(relation, True) 

426 # Either defer_postprocessing=False, or attempting to strip off unary 

427 # operations until we got to the preferred engine didn't work, because 

428 # this tree doesn't actually involve the preferred engine. So we just 

429 # transfer to the preferred engine first, and then materialize, 

430 # process, and return. 

431 relation = self._relation.transferred_to(self._context.preferred_engine).materialized() 

432 self._context.process(relation) 

433 return self._copy(relation, True) 

434 
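# --- Illustrative sketch (added): materializing into the preferred engine
# (usually a temporary database table) before applying further operations, so
# downstream queries select from that table rather than re-running the
# original query. Dimension names are hypothetical.
#
#     base = query.materialized()
#     narrowed = base.projected(["tract", "patch"], unique=True)
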

435 def projected( 

436 self, 

437 dimensions: DimensionGroup | Iterable[str] | None = None, 

438 unique: bool = True, 

439 columns: Iterable[ColumnTag] | None = None, 

440 defer: bool | None = None, 

441 drop_postprocessing: bool = False, 

442 keep_record_columns: bool = True, 

443 ) -> Query: 

444 """Return a modified `Query` with a subset of this one's columns. 

445 

446 Parameters 

447 ---------- 

448 dimensions : `DimensionGroup` or `~collections.abc.Iterable` [ `str` ], \ 

449 optional 

450 Dimensions to include in the new query. Will be expanded to 

451 include all required and implied dependencies. Must be a subset of 

452 ``self.dimensions``. If not provided, ``self.dimensions`` is used. 

453 unique : `bool`, optional 

454 If `True` (default) deduplicate rows after dropping columns. 

455 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional 

456 Additional dataset or dimension record columns to include in the 

457 query. Dimension key columns added here are redundant and hence 

458 ignored, unless they extend beyond the key columns implied by the 

459 ``dimensions`` argument, in which case an error is raised. 

460 defer : `bool`, optional 

461 If `False`, run the new query immediately. If `True`, do not. If 

462 `None` (default), the ``defer`` option passed when making ``self`` 

463 is used (this option is "sticky"). 

464 drop_postprocessing : `bool`, optional 

465 Drop any iteration-engine operations that depend on columns that 

466 are being removed (e.g. region-overlap tests when region columns 

467 are being dropped), making it more likely that projection and 

468 deduplication could be performed in the preferred engine, where 

469 they may be more efficient. 

470 keep_record_columns : `bool`, optional 

471 If `True` (default) and this query `has_record_columns`, implicitly 

472 add any of those to ``columns`` whose dimension element is in the 

473 given ``dimensions``. 

474 

475 Returns 

476 ------- 

477 query : `Query` 

478 New query with the requested columns only, optionally deduplicated. 

479 

480 Notes 

481 ----- 

482 Dataset columns are dropped from the new query unless passed via the 

483 ``columns`` argument. All other columns are by default preserved. 

484 

485 Raises 

486 ------ 

487 lsst.daf.relation.ColumnError 

488 Raised if the columns to include in the new query are not all 

489 present in the current query. 

490 """ 

491 match dimensions: 

492 case None: 

493 dimensions = set(self._dimensions.names) 

494 case DimensionGroup(): 

495 dimensions = set(dimensions.names) 

496 case iterable: 

497 dimensions = set(iterable) 

498 if columns is not None: 

499 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns)) 

500 dimensions = self._dimensions.universe.conform(dimensions) 

501 if columns is None: 

502 columns = set() 

503 else: 

504 columns = set(columns) 

505 columns.update(DimensionKeyColumnTag.generate(dimensions.names)) 

506 if keep_record_columns: 

507 if self._has_record_columns is True: 

508 for element_name in dimensions.elements: 

509 if element_name not in self._record_caches: 

510 columns.update(self.dimensions.universe[element_name].RecordClass.fields.columns) 

511 elif self._has_record_columns in dimensions.elements: 

512 element = cast(DimensionElement, self._has_record_columns) 

513 columns.update(element.RecordClass.fields.columns) 

514 if drop_postprocessing: 

515 relation = self._context.drop_invalidated_postprocessing(self._relation, columns) 

516 # Dropping postprocessing Calculations could cause other columns 

517 # we had otherwise intended to keep to be dropped as well. 

518 columns &= relation.columns 

519 else: 

520 relation = self._relation 

521 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine) 

522 if unique: 

523 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine) 

524 return self._chain(relation, defer, dimensions=dimensions) 

525 
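# --- Illustrative sketch (added): projecting to a dimension subset with
# deduplication, as described above. Dimension names are hypothetical.
#
#     unique_visits = query.projected(["instrument", "visit"], unique=True)
#     for data_id in unique_visits.iter_data_ids():
#         ...
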

526 def with_record_columns(self, dimension_element: str | None = None, defer: bool | None = None) -> Query: 

527 """Return a modified `Query` with additional dimension record columns 

528 and/or caches. 

529 

530 Parameters 

531 ---------- 

532 dimension_element : `str`, optional 

533 Name of a single dimension element to add record columns for, or 

534 `None` (default) to add them for all elements in `dimensions`. 

535 defer : `bool`, optional 

536 If `False`, run the new query immediately. If `True`, do not. If 

537 `None` (default), the ``defer`` option passed when making ``self`` 

538 is used (this option is "sticky"). 

539 

540 Returns 

541 ------- 

542 query : `Query` 

543 New query with the requested record columns either in the relation 

544 or (when possible) available via record caching. 

545 

546 Notes 

547 ----- 

548 Adding dimension record columns is fundamentally different from adding 

549 new dimension key columns or dataset columns, because it is purely an 

550 addition of columns, not rows - we can always join in a dimension 

551 element table (if it has not already been included) on keys already 

552 present in the current relation, confident that there is exactly one 

553 row in the dimension element table for each row in the current 

554 relation. 

555 """ 

556 if self._has_record_columns is True or self._has_record_columns == dimension_element: 

557 return self 

558 record_caches = dict(self._record_caches) 

559 columns_required: set[ColumnTag] = set() 

560 for element_name in self.dimensions.elements if dimension_element is None else [dimension_element]: 

561 element = self.dimensions.universe[element_name] 

562 if element_name in record_caches: 

563 continue 

564 if (cache := self._backend.get_dimension_record_cache(element_name)) is not None: 

565 record_caches[element_name] = cache 

566 else: 

567 columns_required.update(element.RecordClass.fields.columns.keys()) 

568 # Modify the relation we have to remove any projections that dropped 

569 # columns we now want, as long the relation's behavior is otherwise 

570 # unchanged. 

571 columns_required -= self._relation.columns 

572 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

573 columns_required.difference_update(columns_found) 

574 if columns_required: 

575 relation = self._backend.make_dimension_relation( 

576 self._dimensions, 

577 columns_required, 

578 self._context, 

579 initial_relation=relation, 

580 # Don't permit joins to use any columns beyond those in the 

581 # original relation, as that would change what this operation 

582 # does. 

583 initial_join_max_columns=frozenset(self._relation.columns), 

584 governor_constraints=self._governor_constraints, 

585 ) 

586 return self._chain( 

587 relation, 

588 defer=defer, 

589 has_record_columns=( 

590 True if dimension_element is None else self.dimensions.universe[dimension_element] 

591 ), 

592 record_caches=record_caches, 

593 ) 

594 
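# --- Illustrative sketch (added): expanding a query so dimension records are
# available, either via joined columns or backend record caches. The
# "detector" element name is hypothetical.
#
#     expanded = query.with_record_columns()            # all elements in query.dimensions
#     detector_only = query.with_record_columns("detector")
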

595 def find_datasets( 

596 self, 

597 dataset_type: DatasetType, 

598 collections: Any, 

599 *, 

600 find_first: bool = True, 

601 columns: Set[str] = frozenset(("dataset_id", "run")), 

602 defer: bool | None = None, 

603 ) -> Query: 

604 """Return a modified `Query` that includes a search for datasets of the 

605 given type. 

606 

607 Parameters 

608 ---------- 

609 dataset_type : `DatasetType` 

610 Dataset type to search for. May not be a component. 

611 collections : `~typing.Any` 

612 Collection search path or pattern. Must be a single collection 

613 name or ordered sequence if ``find_first=True``. See 

614 :ref:`daf_butler_collection_expressions` for more information. 

615 find_first : `bool`, optional 

616 If `True` (default) search collections in order until the first 

617 match for each data ID is found. If `False`, return all matches in 

618 all collections. 

619 columns : `~collections.abc.Set` [ `str` ] 

620 Dataset columns to include in the new query. Options include 

621 

622 - ``dataset_id``: the unique identifier of the dataset. The type 

623 is implementation-dependent. Never nullable. Included by 

624 default. 

625 - ``ingest_date``: the date and time the dataset was added to the 

626 data repository. 

627 - ``run``: the foreign key column to the `~CollectionType.RUN` 

628 collection holding the dataset (not necessarily the collection 

629 name). The type is dependent on the collection manager 

630 implementation. Included by default. 

631 - ``collection``: the foreign key column to the collection in which 

632 the dataset was actually found by this search. The type is 

633 dependent on the collection manager implementation. This may 

634 differ from ``run`` if the dataset is present in a matching 

635 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION` 

636 collection, which means the same dataset may also appear multiple 

637 times in the query results. 

638 - ``timespan``: the validity range for datasets found in a 

639 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other 

640 collection types. 

641 

642 The default columns (``dataset_id`` and ``run``) are sufficient to 

643 enable `iter_dataset_refs`, which also takes care of translating 

644 the internal ``RUN`` collection key into its public name. 

645 

646 Setting this to an empty set while passing ``find_first=False`` 

647 will return a query that is constrained by dataset existence in 

648 some matching collection but does not actually return which 

649 datasets existed. 

650 defer : `bool`, optional 

651 If `False`, run the new query immediately. If `True`, do not. If 

652 `None` (default), the ``defer`` option passed when making ``self`` 

653 is used (this option is "sticky"). 

654 

655 Returns 

656 ------- 

657 query : `Query` 

658 New query with the requested dataset columns, constrained by the 

659 existence of datasets of this type in the given collection. 

660 

661 Raises 

662 ------ 

663 lsst.daf.relation.ColumnError 

664 Raised if a dataset search is already present in this query and 

665 this is a find-first search. 

666 """ 

667 if find_first and DatasetColumnTag.filter_from(self._relation.columns): 

668 raise ColumnError( 

669 "Cannot search for datasets with find_first=True " 

670 "on a query that already includes dataset columns." 

671 ) 

672 # 

673 # TODO: it'd be nice to do a QueryContext.restore_columns call here or 

674 # similar, to look for dataset-constraint joins already present in the 

675 # relation and expand them to include dataset-result columns as well, 

676 # instead of doing a possibly-redundant join here. But that would 

677 # require pushing relation usage down further into 

678 # DatasetStorageManager.make_relation, so that it doesn't need to be 

679 # given the columns, and then giving the relation system the ability to 

680 # simplify-away redundant joins when they only provide columns that 

681 # aren't ultimately used. The right time to look into that is probably 

682 # when investigating whether the base QueryBackend should be 

683 # responsible for producing an "abstract" relation tree of some sort, 

684 # with the subclasses only responsible for filling it in with payloads 

685 # (and possibly replacing some leaves with new sub-trees) when 

686 # "processed" (or in some other "prepare" step). 

687 # 

688 # This is a low priority for three reasons: 

689 # - there's some chance the database's query optimizer will simplify 

690 # away these redundant joins; 

691 # - at present, the main use of this code path is in QG generation, 

692 # where we materialize the initial data ID query into a temp table 

693 # and hence can't go back and "recover" those dataset columns anyway; 

694 # 

695 collections = CollectionWildcard.from_expression(collections) 

696 if find_first: 

697 collections.require_ordered() 

698 rejections: list[str] = [] 

699 collection_records = self._backend.resolve_dataset_collections( 

700 dataset_type, 

701 collections, 

702 governor_constraints=self._governor_constraints, 

703 allow_calibration_collections=True, 

704 rejections=rejections, 

705 ) 

706 # If the dataset type has dimensions not in the current query, or we 

707 # need a temporal join for a calibration collection, either restore 

708 # those columns or join them in. 

709 full_dimensions = dataset_type.dimensions.as_group().union(self._dimensions) 

710 relation = self._relation 

711 record_caches = self._record_caches 

712 base_columns_required: set[ColumnTag] = { 

713 DimensionKeyColumnTag(name) for name in full_dimensions.names 

714 } 

715 spatial_joins: list[tuple[str, str]] = [] 

716 if not (dataset_type.dimensions <= self._dimensions): 

717 if self._has_record_columns is True: 

718 # This query is for expanded data IDs, so if we add new 

719 # dimensions to the query we need to be able to get records for 

720 # the new dimensions. 

721 record_caches = dict(self._record_caches) 

722 for element_name in full_dimensions.elements: 

723 element = full_dimensions.universe[element_name] 

724 if element_name in record_caches: 

725 continue 

726 if (cache := self._backend.get_dimension_record_cache(element_name)) is not None: 

727 record_caches[element_name] = cache 

728 else: 

729 base_columns_required.update(element.RecordClass.fields.columns.keys()) 

730 # See if we need spatial joins between the current query and the 

731 # dataset type's dimensions. The logic here is for multiple 

732 # spatial joins in general, but in practice it'll be exceedingly 

733 # rare for there to be more than one. We start by figuring out 

734 # which spatial "families" (observations vs. skymaps, skypix 

735 # systems) are present on only one side and not the other. 

736 lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial 

737 rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial 

738 # Now we iterate over the Cartesian product of those, so e.g. 

739 # if the query has {tract, patch, visit} and the dataset type 

740 # has {htm7} dimensions, the iterations of this loop 

741 # correspond to: (skymap, htm), (observations, htm). 

742 for lhs_spatial_family, rhs_spatial_family in itertools.product( 

743 lhs_spatial_families, rhs_spatial_families 

744 ): 

745 # For each pair we add a join between the most-precise element 

746 # present in each family (e.g. patch beats tract). 

747 spatial_joins.append( 

748 ( 

749 lhs_spatial_family.choose( 

750 full_dimensions.elements.names, self.dimensions.universe 

751 ).name, 

752 rhs_spatial_family.choose( 

753 full_dimensions.elements.names, self.dimensions.universe 

754 ).name, 

755 ) 

756 ) 

757 # Set up any temporal join between the query dimensions and CALIBRATION 

758 # collection's validity ranges. 

759 temporal_join_on: set[ColumnTag] = set() 

760 if any(r.type is CollectionType.CALIBRATION for r in collection_records): 

761 for family in self._dimensions.temporal: 

762 endpoint = family.choose(self._dimensions.elements.names, self.dimensions.universe) 

763 temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan")) 

764 base_columns_required.update(temporal_join_on) 

765 # Note which of the many kinds of potentially-missing columns we have 

766 # and add the rest. 

767 base_columns_required.difference_update(relation.columns) 

768 if base_columns_required: 

769 relation = self._backend.make_dimension_relation( 

770 full_dimensions, 

771 base_columns_required, 

772 self._context, 

773 initial_relation=relation, 

774 # Don't permit joins to use any columns beyond those in the 

775 # original relation, as that would change what this 

776 # operation does. 

777 initial_join_max_columns=frozenset(self._relation.columns), 

778 governor_constraints=self._governor_constraints, 

779 spatial_joins=spatial_joins, 

780 ) 

781 # Finally we can join in the search for the dataset query. 

782 columns = set(columns) 

783 columns.add("dataset_id") 

784 if not collection_records: 

785 relation = relation.join( 

786 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context) 

787 ) 

788 elif find_first: 

789 relation = self._backend.make_dataset_search_relation( 

790 dataset_type, 

791 collection_records, 

792 columns, 

793 self._context, 

794 join_to=relation, 

795 temporal_join_on=temporal_join_on, 

796 ) 

797 else: 

798 relation = self._backend.make_dataset_query_relation( 

799 dataset_type, 

800 collection_records, 

801 columns, 

802 self._context, 

803 join_to=relation, 

804 temporal_join_on=temporal_join_on, 

805 ) 

806 return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer) 

807 
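# --- Illustrative sketch (added): joining a dataset search into an existing
# data-ID query and reading back refs. The dataset type object and collection
# name are hypothetical.
#
#     with_refs = query.find_datasets(raw_dataset_type, collections="some/run",
#                                     find_first=True)
#     for ref in with_refs.iter_dataset_refs(raw_dataset_type):
#         ...  # resolved DatasetRef for the first match in the search path
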

808 def sliced( 

809 self, 

810 start: int = 0, 

811 stop: int | None = None, 

812 defer: bool | None = None, 

813 ) -> Query: 

814 """Return a modified `Query` with that takes a slice of this one's 

815 rows. 

816 

817 Parameters 

818 ---------- 

819 start : `int`, optional 

820 First index to include, inclusive. 

821 stop : `int` or `None`, optional 

822 One past the last index to include (i.e. exclusive). 

823 defer : `bool`, optional 

824 If `False`, run the new query immediately. If `True`, do not. If 

825 `None` (default), the ``defer`` option passed when making ``self`` 

826 is used (this option is "sticky"). 

827 

828 Returns 

829 ------- 

830 query : `Query` 

831 New query with the requested slice. 

832 

833 Notes 

834 ----- 

835 This operation must be implemented in the iteration engine if there are 

836 postprocessing operations, which may be much less efficient than 

837 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..`` 

838 in SQL). 

839 

840 Since query row order is usually arbitrary, it usually makes sense to 

841 call `sorted` before calling `sliced` to make the results 

842 deterministic. This is not checked because there are some contexts 

843 where getting an arbitrary subset of the results of a given size 

844 still makes sense. 

845 """ 

846 return self._chain(self._relation[start:stop], defer) 

847 
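# --- Illustrative sketch (added): as the Notes above suggest, sort before
# slicing so the page of rows returned is deterministic. ``order_by_terms`` is
# an assumed iterable of SortTerm objects built elsewhere.
#
#     page = query.sorted(order_by_terms).sliced(start=0, stop=100)
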

848 def sorted( 

849 self, 

850 order_by: Iterable[SortTerm], 

851 defer: bool | None = None, 

852 ) -> Query: 

853 """Return a modified `Query` that sorts this one's rows. 

854 

855 Parameters 

856 ---------- 

857 order_by : `~collections.abc.Iterable` [ `SortTerm` ] 

858 Expressions to sort by. 

859 defer : `bool`, optional 

860 If `False`, run the new query immediately. If `True`, do not. If 

861 `None` (default), the ``defer`` option passed when making ``self`` 

862 is used (this option is "sticky"). 

863 

864 Returns 

865 ------- 

866 query : `Query` 

867 New query with the requested sorting. 

868 

869 Notes 

870 ----- 

871 The ``order_by`` expression can include references to dimension record 

872 columns that were not present in the original relation; this is 

873 similar to calling `with_record_columns` for those columns first (but 

874 in this case column requests cannot be satisfied by record caches). 

875 All other columns referenced must be present in the query already. 

876 """ 

877 op = Sort(tuple(order_by)) 

878 columns_required = set(op.columns_required) 

879 columns_required.difference_update(self._relation.columns) 

880 if columns_required: 

881 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

882 columns_required.difference_update(columns_found) 

883 if columns_required: 

884 try: 

885 relation = self._backend.make_dimension_relation( 

886 self._dimensions, 

887 columns_required, 

888 self._context, 

889 initial_relation=relation, 

890 # Don't permit joins to use any columns beyond those in 

891 # the original relation, as that would change what this 

892 # operation does. 

893 initial_join_max_columns=frozenset(self._relation.columns), 

894 governor_constraints=self._governor_constraints, 

895 ) 

896 except ColumnError as err: 

897 raise ColumnError( 

898 "Cannot sort by columns that were not included in the original query or " 

899 "fully resolved by its dimensions." 

900 ) from err 

901 else: 

902 relation = self._relation 

903 relation = op.apply(relation, preferred_engine=self._context.preferred_engine) 

904 return self._chain(relation, defer) 

905 

906 def count(self, *, exact: bool = True, discard: bool = False) -> int: 

907 """Count the number of rows in this query. 

908 

909 Parameters 

910 ---------- 

911 exact : `bool`, optional 

912 If `True` (default), return the exact number of rows. If `False`, 

913 returning an upper bound is permitted if it can be done much more 

914 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but 

915 ignoring client-side filtering that would otherwise take place. 

916 discard : `bool`, optional 

917 If `True`, compute the exact count even if it would require running 

918 the full query and then throwing away the result rows after 

919 counting them. If `False`, this is an error, as the user would 

920 usually be better off executing the query first to fetch its rows 

921 into a new query (or passing ``exact=False``). Ignored if 

922 ``exact=False``. 

923 

924 Returns 

925 ------- 

926 n_rows : `int` 

927 Number of rows in the query, or an upper bound. This includes 

928 duplicates, if there are any. 

929 

930 Raises 

931 ------ 

932 RuntimeError 

933 Raised if an exact count was requested and could not be obtained 

934 without fetching and discarding rows. 

935 """ 

936 if self._relation.min_rows == self._relation.max_rows: 

937 return self._relation.max_rows 

938 return self._context.count(self._relation, exact=exact, discard=discard) 

939 

940 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

941 """Check whether this query has any result rows at all. 

942 

943 Parameters 

944 ---------- 

945 execute : `bool`, optional 

946 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

947 determined prior to execution that the query would return no rows. 

948 exact : `bool`, optional 

949 If `True`, run the full query and perform post-query filtering if 

950 needed, until at least one result row is found. If `False`, the 

951 returned result does not account for post-query filtering, and 

952 hence may be `True` even when all result rows would be filtered 

953 out. 

954 

955 Returns 

956 ------- 

957 any_rows : `bool` 

958 Whether the query has any rows, or if it may have any rows if 

959 ``exact=False``. 

960 

961 Raises 

962 ------ 

963 RuntimeError 

964 Raised if an exact check was requested and could not be obtained 

965 without executing the query. 

966 """ 

967 if self._relation.min_rows > 0: 

968 return True 

969 if self._relation.max_rows == 0: 

970 return False 

971 if execute: 

972 return self._context.any(self._relation, execute=execute, exact=exact) 

973 elif not exact: 

974 return True 

975 raise RuntimeError("Cannot obtain exact results without executing the query.") 

976 

977 def explain_no_results(self, execute: bool = True) -> list[str]: 

978 """Return human-readable messages that may help explain why the query 

979 yields no results. 

980 

981 Parameters 

982 ---------- 

983 execute : `bool`, optional 

984 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``) 

985 of aspects of the query to more precisely determine where rows were 

986 filtered out. 

987 

988 Returns 

989 ------- 

990 messages : `~collections.abc.Iterable` [ `str` ] 

991 String messages that describe reasons the query might not yield any 

992 results. 

993 """ 

994 # First try without actually executing any queries. 

995 diagnostics = Diagnostics.run(self._relation) 

996 if diagnostics.is_doomed: 

997 return diagnostics.messages 

998 if execute: 

999 # Try again, running LIMIT 1 queries as we walk back down the tree 

1000 # to look for relations with no rows: 

1001 diagnostics = Diagnostics.run(self._relation, executor=self._context.any) 

1002 if diagnostics.is_doomed: 

1003 return diagnostics.messages 

1004 return [] 

1005 
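# --- Illustrative sketch (added): the usual emptiness-diagnosis pattern, using
# only methods defined above.
#
#     if not query.any(execute=True, exact=True):
#         for message in query.explain_no_results():
#             print(message)  # reasons rows were filtered out, if determinable
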

1006 def _copy( 

1007 self, 

1008 relation: Relation, 

1009 is_deferred: bool, 

1010 dimensions: DimensionGroup | None = None, 

1011 governor_constraints: Mapping[str, Set[str]] | None = None, 

1012 has_record_columns: bool | DimensionElement | None = None, 

1013 record_caches: Mapping[str, DimensionRecordSet] | None = None, 

1014 ) -> Query: 

1015 """Return a modified copy of this query with some attributes replaced. 

1016 

1017 See class docs for parameter documentation; the only difference here 

1018 is that the defaults are the values ``self`` was constructed with. 

1019 """ 

1020 return Query( 

1021 dimensions=self._dimensions if dimensions is None else dimensions, 

1022 backend=self._backend, 

1023 context=self._context, 

1024 relation=relation, 

1025 governor_constraints=( 

1026 governor_constraints if governor_constraints is not None else self._governor_constraints 

1027 ), 

1028 is_deferred=is_deferred, 

1029 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns, 

1030 record_caches=self._record_caches if record_caches is None else record_caches, 

1031 ) 

1032 

1033 def _chain( 

1034 self, 

1035 relation: Relation, 

1036 defer: bool | None, 

1037 dimensions: DimensionGroup | None = None, 

1038 governor_constraints: Mapping[str, Set[str]] | None = None, 

1039 has_record_columns: bool | DimensionElement | None = None, 

1040 record_caches: Mapping[str, DimensionRecordSet] | None = None, 

1041 ) -> Query: 

1042 """Return a modified query with a new relation while handling the 

1043 ubiquitous ``defer`` parameter's logic. 

1044 

1045 Parameters 

1046 ---------- 

1047 relation : `Relation` 

1048 Relation for the new query. 

1049 defer : `bool` 

1050 If `False`, run the new query immediately. If `True`, do not. If 

1051 `None` , the ``defer`` option passed when making ``self`` is used 

1052 (this option is "sticky"). 

1053 dimensions : `DimensionGroup`, optional 

1054 See class docs. 

1055 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

1056 `~collections.abc.Set` [ `str` ] ], optional 

1057 See class docs. 

1058 has_record_columns : `bool` or `DimensionElement`, optional 

1059 See class docs. 

1060 record_caches : `~collections.abc.Mapping` [ `str`, \ 

1061 `DimensionRecordSet` ], optional 

1062 See class docs. 

1063 

1064 Returns 

1065 ------- 

1066 chained : `Query` 

1067 Modified query, or ``self`` if no modifications were actually 

1068 requested. 

1069 """ 

1070 if defer is None: 

1071 defer = self._is_deferred 

1072 if ( 

1073 relation is self._relation 

1074 and dimensions is None 

1075 and defer == self._is_deferred 

1076 and record_caches is None 

1077 and has_record_columns is None 

1078 and governor_constraints is None 

1079 ): 

1080 return self 

1081 result = self._copy( 

1082 relation, 

1083 is_deferred=True, 

1084 governor_constraints=governor_constraints, 

1085 dimensions=dimensions, 

1086 has_record_columns=has_record_columns, 

1087 record_caches=record_caches, 

1088 ) 

1089 if not defer: 

1090 result = result.run() 

1091 return result