Coverage for python/lsst/daf/butler/registry/queries/_query.py: 15%

241 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = () 

30 

31import itertools 

32from collections.abc import Iterable, Iterator, Mapping, Sequence, Set 

33from contextlib import contextmanager 

34from typing import Any, cast, final 

35 

36from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm 

37 

38from ...core import ( 

39 DataCoordinate, 

40 DatasetColumnTag, 

41 DatasetRef, 

42 DatasetType, 

43 Dimension, 

44 DimensionElement, 

45 DimensionGraph, 

46 DimensionKeyColumnTag, 

47 DimensionRecord, 

48 DimensionRecordColumnTag, 

49) 

50from .._collectionType import CollectionType 

51from ..wildcards import CollectionWildcard 

52from ._query_backend import QueryBackend 

53from ._query_context import QueryContext 

54from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader 

55 

56 

57@final 

58class Query: 

59 """A general-purpose representation of a registry query. 

60 

61 Parameters 

62 ---------- 

63 dimensions : `DimensionGraph` 

64 The dimensions that span the query and are used to join its relations 

65 together. 

66 backend : `QueryBackend` 

67 Backend object used to create the query and new ones derived from it. 

68 context : `QueryContext` 

69 Context manager that holds relation engines and database connections 

70 for the query. 

71 relation : `Relation` 

72 The relation tree representation of the query as a series of operations 

73 on tables. 

74 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

75 `~collections.abc.Set` [ `str` ] ] 

76 Constraints on governor dimensions encoded in this query's relation. 

77 This is a mapping from governor dimension name to sets of values that 

78 dimension may take. 

79 is_deferred : `bool` 

80 If `True`, modifier methods that return a related `Query` object should 

81 not immediately execute the new query. 

82 has_record_columns : `bool` or `DimensionElement` 

83 Whether this query's relation already includes columns for all or some 

84 dimension element records: `True` means all elements in ``dimensions`` 

85 either have records present in ``record_caches`` or all columns present 

86 in ``relation``, while a specific `DimensionElement` means that element 

87 does. 

88 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \ 

89 `~collections.abc.Mapping` 

90 [ `DataCoordinate`, `DimensionRecord` ] ], optional 

91 Cached dimension record values, organized first by dimension element 

92 and then by data ID. 

93 

94 Notes 

95 ----- 

96 Iterating over a `Query` yields mappings from `ColumnTag` to the associated 

97 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and 

98 `iter_dimension_records` methods can be used to instead iterate over 

99 various butler primitives derived from these rows. 

100 

101 Iterating over a `Query` may or may not execute database queries again each 

102 time, depending on the state of its relation tree - see `Query.run` for 

103 details. 

104 

105 Query is immutable; all methods that might appear to modify it in place 

106 actually return a new object (though many attributes will be shared). 

107 

108 Query is currently (still) an internal-to-Registry object, with only the 

109 "QueryResults" classes that are backed by it directly exposed to users. It 

110 has been designed with the intent that it will eventually play a larger 

111 role, either as the main query result object in a redesigned query 

112 interface, or a "power user" result option that accompanies simpler 

113 replacements for the current "QueryResults" objects. 

114 """ 

115 

116 def __init__( 

117 self, 

118 dimensions: DimensionGraph, 

119 backend: QueryBackend[QueryContext], 

120 context: QueryContext, 

121 relation: Relation, 

122 governor_constraints: Mapping[str, Set[str]], 

123 is_deferred: bool, 

124 has_record_columns: bool | DimensionElement, 

125 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

126 ): 

127 self._dimensions = dimensions 

128 self._backend = backend 

129 self._context = context 

130 self._relation = relation 

131 self._governor_constraints = governor_constraints 

132 self._is_deferred = is_deferred 

133 self._has_record_columns = has_record_columns 

134 self._record_caches = record_caches if record_caches is not None else {} 

135 

136 @property 

137 def dimensions(self) -> DimensionGraph: 

138 """The dimensions that span the query and are used to join its 

139 relations together (`DimensionGraph`). 

140 """ 

141 return self._dimensions 

142 

143 @property 

144 def relation(self) -> Relation: 

145 """The relation tree representation of the query as a series of 

146 operations on tables (`Relation`). 

147 """ 

148 return self._relation 

149 

150 @property 

151 def has_record_columns(self) -> bool | DimensionElement: 

152 """Whether this query's relation already includes columns for all or 

153 some dimension element records (`bool` or `DimensionElement`). 

154 """ 

155 return self._has_record_columns 

156 

157 @property 

158 def backend(self) -> QueryBackend[QueryContext]: 

159 """Backend object used to create the query and new ones derived from it 

160 (`QueryBackend`). 

161 """ 

162 return self._backend 

163 

164 @contextmanager 

165 def open_context(self) -> Iterator[None]: 

166 """Return a context manager that ensures a database connection is 

167 established and temporary tables and cursors have a defined lifetime. 

168 

169 Returns 

170 ------- 

171 context : `contextlib.AbstractContextManager` 

172 Context manager with no return value. 

173 """ 

174 if self._context.is_open: 

175 yield 

176 else: 

177 with self._context: 

178 yield 

179 

180 def __str__(self) -> str: 

181 return str(self._relation) 

182 

183 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]: 

184 return iter(self._context.fetch_iterable(self._relation)) 

185 

186 def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]: 

187 """Return an iterator that converts result rows to data IDs. 

188 

189 Parameters 

190 ---------- 

191 dimensions : `DimensionGraph`, optional 

192 Dimensions of the data IDs to return. If not provided, 

193 ``self.dimensions`` is used. 

194 

195 Returns 

196 ------- 

197 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ] 

198 Iterator that yields data IDs. 

199 """ 

200 if dimensions is None: 

201 dimensions = self._dimensions 

202 reader = DataCoordinateReader.make( 

203 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

204 ) 

205 if not (reader.columns_required <= self.relation.columns): 

206 raise ColumnError( 

207 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

208 f"for data IDs with dimensions {dimensions}." 

209 ) 

210 return (reader.read(row) for row in self) 

211 
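# Illustrative sketch of consuming data IDs from a query; ``query`` here is a
# hypothetical, already-constructed Query instance (these objects are built by
# registry internals, not directly by users):
#
#     with query.open_context():
#         for data_id in query.iter_data_ids():
#             print(data_id)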

212 def iter_dataset_refs( 

213 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,) 

214 ) -> Iterator[DatasetRef]: 

215 """Return an iterator that converts result rows to dataset references. 

216 

217 Parameters 

218 ---------- 

219 dataset_type : `DatasetType` 

220 The parent dataset type to yield references for. 

221 components : `~collections.abc.Sequence` [ `None` or `str` ] 

222 Which component dataset types to construct refs for from each row 

223 representing a parent; `None` for the parent itself. 

224 

225 Returns 

226 ------- 

227 refs : `~collections.abc.Iterator` [ `DatasetRef` ] 

228 Iterator that yields (resolved) dataset references. 

229 """ 

230 reader = DatasetRefReader( 

231 dataset_type, 

232 translate_collection=self._backend.get_collection_name, 

233 records=self._has_record_columns is True, 

234 record_caches=self._record_caches, 

235 ) 

236 if not (reader.columns_required <= self.relation.columns): 

237 raise ColumnError( 

238 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

239 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

240 ) 

241 for row in self: 

242 parent_ref = reader.read(row) 

243 for component in components: 

244 if component is None: 

245 yield parent_ref 

246 else: 

247 yield parent_ref.makeComponentRef(component) 

248 
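# Illustrative sketch of iterating dataset references; ``query`` is assumed to
# already include dataset columns for ``dataset_type`` (e.g. via
# `find_datasets`), and "wcs" is just a hypothetical component name:
#
#     with query.open_context():
#         for ref in query.iter_dataset_refs(dataset_type, components=(None, "wcs")):
#             print(ref)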

249 def iter_data_ids_and_dataset_refs( 

250 self, dataset_type: DatasetType, dimensions: DimensionGraph | None = None 

251 ) -> Iterator[tuple[DataCoordinate, DatasetRef]]: 

252 """Iterate over pairs of data IDs and dataset refs. 

253 

254 This permits the data ID dimensions to differ from the dataset 

255 dimensions. 

256 

257 Parameters 

258 ---------- 

259 dataset_type : `DatasetType` 

260 The parent dataset type to yield references for. 

261 dimensions : `DimensionGraph`, optional 

262 Dimensions of the data IDs to return. If not provided, 

263 ``self.dimensions`` is used. 

264 

265 Returns 

266 ------- 

267 pairs : `~collections.abc.Iterator` [ `tuple` [ `DataCoordinate`, 

268 `DatasetRef` ] ] 

269 An iterator over (data ID, dataset reference) pairs. 

270 """ 

271 if dimensions is None: 

272 dimensions = self._dimensions 

273 data_id_reader = DataCoordinateReader.make( 

274 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

275 ) 

276 dataset_reader = DatasetRefReader( 

277 dataset_type, 

278 translate_collection=self._backend.get_collection_name, 

279 records=self._has_record_columns is True, 

280 record_caches=self._record_caches, 

281 ) 

282 if not (data_id_reader.columns_required <= self.relation.columns): 

283 raise ColumnError( 

284 f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} " 

285 f"for data IDs with dimensions {dimensions}." 

286 ) 

287 if not (dataset_reader.columns_required <= self.relation.columns): 

288 raise ColumnError( 

289 f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} " 

290 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

291 ) 

292 for row in self: 

293 yield (data_id_reader.read(row), dataset_reader.read(row)) 

294 

295 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]: 

296 """Return an iterator that converts result rows to dimension records. 

297 

298 Parameters 

299 ---------- 

300 element : `DimensionElement`, optional 

301 Dimension element whose records will be returned. If not provided, 

302 `has_record_columns` must be a `DimensionElement` instance. 

303 

304 Returns 

305 ------- 

306 records : `~collections.abc.Iterator` [ `DimensionRecord` ] 

307 Iterator that yields dimension records. 

308 """ 

309 if element is None: 

310 match self._has_record_columns: 

311 case True | False: 

312 raise ValueError("No default dimension element in query; 'element' must be given.") 

313 case only_element_with_records: 

314 element = only_element_with_records 

315 if (cache := self._record_caches.get(element)) is not None: 

316 return (cache[data_id] for data_id in self.iter_data_ids(element.graph)) 

317 else: 

318 reader = DimensionRecordReader(element) 

319 if not (reader.columns_required <= self.relation.columns): 

320 raise ColumnError( 

321 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

322 f"for records of element {element.name}." 

323 ) 

324 return (reader.read(row) for row in self) 

325 
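# Illustrative sketch of fetching dimension records for a single element;
# ``element`` is a hypothetical `DimensionElement` assumed to be part of
# ``query.dimensions``:
#
#     records_query = query.with_record_columns(element, defer=True)
#     with records_query.open_context():
#         records = list(records_query.iter_dimension_records(element))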

326 def run(self) -> Query: 

327 """Execute the query and hold its results in memory. 

328 

329 Returns 

330 ------- 

331 executed : `Query` 

332 New query that holds the query results. 

333 

334 Notes 

335 ----- 

336 Iterating over the results of a query that has been `run` will always 

337 iterate over an existing container, while iterating over a query that 

338 has not been run will result in executing at least some of the query 

339 each time. 

340 

341 Running a query also sets its `is_deferred` flag to `False`, which will 

342 cause new queries constructed by its methods to be run immediately, 

343 unless ``defer=True`` is passed to the factory method. After a query 

344 has been run, factory methods will also tend to prefer to apply new 

345 operations (e.g. `with_only_columns`, `sliced`, `sorted`) via Python 

346 code acting on the existing container rather than going back to SQL, 

347 which can be less efficient overall than applying operations to a 

348 deferred query and executing them all only at the end. 

349 

350 Running a query is represented in terms of relations by adding a 

351 `~lsst.daf.relation.Materialization` marker relation in the iteration 

352 engine and then processing the relation tree; this attaches the 

353 container of rows to that new relation to short-circuit any future 

354 processing of the tree and lock changes to the tree upstream of it. 

355 This is very different from the SQL-engine 

356 `~lsst.daf.relation.Materialization` added to the tree by the 

357 `materialize` method from a user perspective, though it has a similar 

358 representation in the relation tree. 

359 """ 

360 relation = ( 

361 # Make a new relation that definitely ends in the iteration engine 

362 # (this does nothing if it already does). 

363 self.relation.transferred_to(self._context.iteration_engine) 

364 # Make the new relation save its rows to an in-memory Python 

365 # collection in relation.payload when processed. 

366 .materialized(name_prefix="run") 

367 ) 

368 # Actually process the relation, simplifying out trivial relations, 

369 # executing any SQL queries, and saving results to relation.payload. 

370 # We discard the simplified relation that's returned, because we want 

371 # the new query to have any extra diagnostic information contained in 

372 # the original. 

373 self._context.process(relation) 

374 return self._copy(relation, False) 

375 
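# Illustrative sketch of the deferred-vs-run pattern described above:
# operations on a deferred query accumulate in the relation tree and are only
# executed when `run` is called (``query`` is a hypothetical Query instance):
#
#     deferred = query.projected(unique=True, defer=True).sliced(0, 100, defer=True)
#     with deferred.open_context():
#         executed = deferred.run()                    # executes the query once
#         data_ids = list(executed.iter_data_ids())    # iterates in-memory rows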

376 def materialized(self, defer_postprocessing: bool = True) -> Query: 

377 """Materialize the results of this query in its context's preferred 

378 engine. 

379 

380 Usually this means inserting the results into a temporary table in a 

381 database. 

382 

383 Parameters 

384 ---------- 

385 defer_postprocessing : `bool`, optional 

386 If `True`, do not execute operations that occur in the context's 

387 `QueryContext.iteration_engine` up front; instead insert and 

388 execute a materialization upstream of them (e.g. via a SQL 

389 ``INSERT INTO ... SELECT`` statement, with no fetching to the 

390 client) and execute the postprocessing operations when iterating 

391 over the query results. If `False`, and iteration-engine 

392 postprocessing operations exist, run the full query, execute them 

393 now, and upload the results. 

394 If the relation is already in the preferred engine, this option 

395 is ignored and the materialization will not involve fetching rows 

396 to the iteration engine at all. If the relation has already been 

397 materialized in the iteration engine (i.e. via `run`), then this 

398 option is again ignored and an upload of the existing rows will 

399 be performed. 

400 

401 Returns 

402 ------- 

403 materialized : `Query` 

404 Modified query with the same row-and-column content and a 

405 materialization in ``self.context.preferred_engine``. 

406 """ 

407 if defer_postprocessing or self.relation.engine == self._context.preferred_engine: 

408 relation, stripped = self._context.strip_postprocessing(self._relation) 

409 if relation.engine == self._context.preferred_engine: 

410 # We got all the way to the engine we want to materialize in. 

411 # Apply that operation to the tree, process it (which actually 

412 # creates a temporary table and populates it), and then reapply 

413 # the stripped operations. 

414 relation = relation.materialized() 

415 self._context.process(relation) 

416 for operation in stripped: 

417 relation = operation.apply( 

418 relation, transfer=True, preferred_engine=self._context.iteration_engine 

419 ) 

420 return self._copy(relation, True) 

421 # Either defer_postprocessing=False, or attempting to strip off unary 

422 # operations until we got to the preferred engine didn't work, because 

423 # this tree doesn't actually involve the preferred engine. So we just 

424 # transfer to the preferred engine first, and then materialize, 

425 # process, and return. 

426 relation = self._relation.transferred_to(self._context.preferred_engine).materialized() 

427 self._context.process(relation) 

428 return self._copy(relation, True) 

429 
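# Illustrative sketch: materializing into the preferred engine (typically a
# temporary table) before joining in a dataset search, so the follow-up work
# runs against the materialized rows (``query``, ``dataset_type``, and
# ``collections`` are hypothetical inputs):
#
#     with query.open_context():
#         base = query.materialized()
#         with_refs = base.find_datasets(dataset_type, collections, defer=True)
#         refs = list(with_refs.iter_dataset_refs(dataset_type))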

430 def projected( 

431 self, 

432 dimensions: Iterable[Dimension | str] | None = None, 

433 unique: bool = True, 

434 columns: Iterable[ColumnTag] | None = None, 

435 defer: bool | None = None, 

436 drop_postprocessing: bool = False, 

437 keep_record_columns: bool = True, 

438 ) -> Query: 

439 """Return a modified `Query` with a subset of this one's columns. 

440 

441 Parameters 

442 ---------- 

443 dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ], 

444 optional 

445 Dimensions to include in the new query. Will be expanded to 

446 include all required and implied dependencies. Must be a subset of 

447 ``self.dimensions``. If not provided, ``self.dimensions`` is used. 

448 unique : `bool`, optional 

449 If `True` (default) deduplicate rows after dropping columns. 

450 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional 

451 Additional dataset or dimension record columns to include in the 

452 query. Dimension key columns added here are ignored unless they 

453 extend beyond the key columns implied by the ``dimensions`` 

454 argument, in which case an error is raised. 

455 defer : `bool`, optional 

456 If `False`, run the new query immediately. If `True`, do not. If 

457 `None` (default), the ``defer`` option passed when making ``self`` 

458 is used (this option is "sticky"). 

459 drop_postprocessing : `bool`, optional 

460 Drop any iteration-engine operations that depend on columns that 

461 are being removed (e.g. region-overlap tests when region columns 

462 are being dropped), making it more likely that projection and 

463 deduplication could be performed in the preferred engine, where 

464 they may be more efficient. 

465 keep_record_columns : `bool`, optional 

466 If `True` (default) and this query `has_record_columns`, implicitly 

467 add any of those to ``columns`` whose dimension element is in the 

468 given ``dimensions``. 

469 

470 Returns 

471 ------- 

472 query : `Query` 

473 New query with the requested columns only, optionally deduplicated. 

474 

475 Notes 

476 ----- 

477 Dataset columns are dropped from the new query unless passed via the 

478 ``columns`` argument. All other columns are by default preserved. 

479 

480 Raises 

481 ------ 

482 lsst.daf.relation.ColumnError 

483 Raised if the columns to include in the new query are not all 

484 present in the current query. 

485 """ 

486 if dimensions is None: 

487 dimensions = set(self._dimensions) 

488 else: 

489 dimensions = set(dimensions) 

490 if columns is not None: 

491 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns)) 

492 dimensions = self._dimensions.universe.extract(dimensions) 

493 if columns is None: 

494 columns = set() 

495 else: 

496 columns = set(columns) 

497 columns.update(DimensionKeyColumnTag.generate(dimensions.names)) 

498 if keep_record_columns: 

499 if self._has_record_columns is True: 

500 for element in dimensions.elements: 

501 if element not in self._record_caches: 

502 columns.update(element.RecordClass.fields.columns) 

503 elif self._has_record_columns in dimensions.elements: 

504 element = cast(DimensionElement, self._has_record_columns) 

505 columns.update(element.RecordClass.fields.columns) 

506 if drop_postprocessing: 

507 relation = self._context.drop_invalidated_postprocessing(self._relation, columns) 

508 # Dropping postprocessing Calculations could cause other columns 

509 # we had otherwise intended to keep to be dropped as well. 

510 columns &= relation.columns 

511 else: 

512 relation = self._relation 

513 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine) 

514 if unique: 

515 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine) 

516 return self._chain(relation, defer, dimensions=dimensions) 

517 
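# Illustrative sketch of narrowing a query to a dimension subset with
# deduplication; "tract" and "patch" are assumed example dimension names and
# ``query`` is a hypothetical Query instance:
#
#     narrowed = query.projected(dimensions=["tract", "patch"], unique=True, defer=True)
#     with narrowed.open_context():
#         data_ids = list(narrowed.iter_data_ids())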

518 def with_record_columns( 

519 self, dimension_element: DimensionElement | None = None, defer: bool | None = None 

520 ) -> Query: 

521 """Return a modified `Query` with additional dimension record columns 

522 and/or caches. 

523 

524 Parameters 

525 ---------- 

526 dimension_element : `DimensionElement`, optional 

527 Single element to add record columns for, or `None` (default) to add 

528 them for all elements in `dimensions`. 

529 defer : `bool`, optional 

530 If `False`, run the new query immediately. If `True`, do not. If 

531 `None` (default), the ``defer`` option passed when making ``self`` 

532 is used (this option is "sticky"). 

533 

534 Returns 

535 ------- 

536 query : `Query` 

537 New query with the requested record columns either in the relation 

538 or (when possible) available via record caching. 

539 

540 Notes 

541 ----- 

542 Adding dimension record columns is fundamentally different from adding 

543 new dimension key columns or dataset columns, because it is purely an 

544 addition of columns, not rows - we can always join in a dimension 

545 element table (if it has not already been included) on keys already 

546 present in the current relation, confident that there is exactly one 

547 row in the dimension element table for each row in the current 

548 relation. 

549 """ 

550 if self._has_record_columns is True or self._has_record_columns == dimension_element: 

551 return self 

552 record_caches = dict(self._record_caches) 

553 columns_required: set[ColumnTag] = set() 

554 for element in self.dimensions.elements if dimension_element is None else [dimension_element]: 

555 if element in record_caches: 

556 continue 

557 if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None: 

558 record_caches[element] = cache 

559 else: 

560 columns_required.update(element.RecordClass.fields.columns.keys()) 

561 # Modify the relation we have to remove any projections that dropped 

562 # columns we now want, as long as the relation's behavior is otherwise 

563 # unchanged. 

564 columns_required -= self._relation.columns 

565 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

566 columns_required.difference_update(columns_found) 

567 if columns_required: 

568 relation = self._backend.make_dimension_relation( 

569 self._dimensions, 

570 columns_required, 

571 self._context, 

572 initial_relation=relation, 

573 # Don't permit joins to use any columns beyond those in the 

574 # original relation, as that would change what this operation 

575 # does. 

576 initial_join_max_columns=frozenset(self._relation.columns), 

577 governor_constraints=self._governor_constraints, 

578 ) 

579 return self._chain( 

580 relation, 

581 defer=defer, 

582 has_record_columns=True if dimension_element is None else dimension_element, 

583 record_caches=record_caches, 

584 ) 

585 
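# Illustrative sketch of requesting expanded data IDs: after
# `with_record_columns`, `iter_data_ids` can attach dimension records to each
# data ID (``query`` is hypothetical):
#
#     expanded = query.with_record_columns(defer=True)
#     with expanded.open_context():
#         for data_id in expanded.iter_data_ids():
#             ...  # data ID rows now carry record columns or cached records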

586 def find_datasets( 

587 self, 

588 dataset_type: DatasetType, 

589 collections: Any, 

590 *, 

591 find_first: bool = True, 

592 columns: Set[str] = frozenset(("dataset_id", "run")), 

593 defer: bool | None = None, 

594 ) -> Query: 

595 """Return a modified `Query` that includes a search for datasets of the 

596 given type. 

597 

598 Parameters 

599 ---------- 

600 dataset_type : `DatasetType` 

601 Dataset type to search for. May not be a component. 

602 collections 

603 Collection search path or pattern. Must be a single collection 

604 name or ordered sequence if ``find_first=True``. See 

605 :ref:`daf_butler_collection_expressions` for more information. 

606 find_first : `bool`, optional 

607 If `True` (default) search collections in order until the first 

608 match for each data ID is found. If `False`, return all matches in 

609 all collections. 

610 columns : `~collections.abc.Set` [ `str` ] 

611 Dataset columns to include in the new query. Options include 

612 

613 - ``dataset_id``: the unique identifier of the dataset. The type 

614 is implementation-dependent. Never nullable. Included by 

615 default. 

616 

617 - ``ingest_date``: the date and time the dataset was added to the 

618 data repository. 

619 

620 - ``run``: the foreign key column to the `~CollectionType.RUN` 

621 collection holding the dataset (not necessarily the collection 

622 name). The type is dependent on the collection manager 

623 implementation. Included by default. 

624 

625 - ``collection``: the foreign key column to the collection in 

626 which the dataset was actually found in this search. The type is 

627 dependent on the collection manager implementation. This may 

628 differ from ``run`` if the dataset is present in a matching 

629 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION` 

630 collection, which means the same dataset may also appear multiple 

631 times in the query results. 

632 

633 - ``timespan``: the validity range for datasets found in a 

634 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other 

635 collection types. 

636 

637 The default columns (``dataset_id`` and ``run``) are sufficient to 

638 enable `iter_dataset_refs`, which also takes care of translating 

639 the internal ``RUN`` collection key into its public name. 

640 

641 Setting this to an empty set while passing ``find_first=False`` 

642 will return a query that is constrained by dataset existence in 

643 some matching collection but does not actually return which 

644 datasets existed. 

645 defer : `bool`, optional 

646 If `False`, run the new query immediately. If `True`, do not. If 

647 `None` (default), the ``defer`` option passed when making ``self`` 

648 is used (this option is "sticky"). 

649 

650 Returns 

651 ------- 

652 query : `Query` 

653 New query with the requested dataset columns, constrained by the 

654 existence of datasets of this type in the given collection. 

655 

656 Raises 

657 ------ 

658 lsst.daf.relation.ColumnError 

659 Raised if a dataset search is already present in this query and 

660 this is a find-first search. 

661 """ 

662 if find_first and DatasetColumnTag.filter_from(self._relation.columns): 

663 raise ColumnError( 

664 "Cannot search for datasets with find_first=True " 

665 "on a query that already includes dataset columns." 

666 ) 

667 # 

668 # TODO: it'd be nice to do a QueryContext.restore_columns call here or 

669 # similar, to look for dataset-constraint joins already present in the 

670 # relation and expand them to include dataset-result columns as well, 

671 # instead of doing a possibly-redundant join here. But that would 

672 # require pushing relation usage down further into 

673 # DatasetStorageManager.make_relation, so that it doesn't need to be 

674 # given the columns, and then giving the relation system the ability to 

675 # simplify-away redundant joins when they only provide columns that 

676 # aren't ultimately used. The right time to look into that is probably 

677 # when investigating whether the base QueryBackend should be 

678 # responsible for producing an "abstract" relation tree of some sort, 

679 # with the subclasses only responsible for filling it in with payloads 

680 # (and possibly replacing some leaves with new sub-trees) when 

681 # "processed" (or in some other "prepare" step). 

682 # 

683 # This is a low priority for two reasons: 

684 # - there's some chance the database's query optimizer will simplify 

685 # away these redundant joins; 

686 # - at present, the main use of this code path is in QG generation, 

687 # where we materialize the initial data ID query into a temp table 

688 # and hence can't go back and "recover" those dataset columns anyway; 

689 # 

690 collections = CollectionWildcard.from_expression(collections) 

691 if find_first: 

692 collections.require_ordered() 

693 rejections: list[str] = [] 

694 collection_records = self._backend.resolve_dataset_collections( 

695 dataset_type, 

696 collections, 

697 governor_constraints=self._governor_constraints, 

698 allow_calibration_collections=True, 

699 rejections=rejections, 

700 ) 

701 # If the dataset type has dimensions not in the current query, or we 

702 # need a temporal join for a calibration collection, either restore 

703 # those columns or join them in. 

704 full_dimensions = dataset_type.dimensions.union(self._dimensions) 

705 relation = self._relation 

706 record_caches = self._record_caches 

707 base_columns_required: set[ColumnTag] = { 

708 DimensionKeyColumnTag(name) for name in full_dimensions.names 

709 } 

710 spatial_joins: list[tuple[str, str]] = [] 

711 if not (dataset_type.dimensions <= self._dimensions): 

712 if self._has_record_columns is True: 

713 # This query is for expanded data IDs, so if we add new 

714 # dimensions to the query we need to be able to get records for 

715 # the new dimensions. 

716 record_caches = dict(self._record_caches) 

717 for element in full_dimensions.elements: 

718 if element in record_caches: 

719 continue 

720 if ( 

721 cache := self._backend.get_dimension_record_cache(element.name, self._context) 

722 ) is not None: 

723 record_caches[element] = cache 

724 else: 

725 base_columns_required.update(element.RecordClass.fields.columns.keys()) 

726 # See if we need spatial joins between the current query and the 

727 # dataset type's dimensions. The logic here is for multiple 

728 # spatial joins in general, but in practice it'll be exceedingly 

729 # rare for there to be more than one. We start by figuring out 

730 # which spatial "families" (observations vs. skymaps, skypix 

731 # systems) are present on only one side and not the other. 

732 lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial 

733 rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial 

734 # Now we iterate over the Cartesian product of those, so e.g. 

735 # if the query has {tract, patch, visit} and the dataset type 

736 # has {htm7} dimensions, the iterations of this loop 

737 # correspond to: (skymap, htm), (observations, htm). 

738 for lhs_spatial_family, rhs_spatial_family in itertools.product( 

739 lhs_spatial_families, rhs_spatial_families 

740 ): 

741 # For each pair we add a join between the most-precise element 

742 # present in each family (e.g. patch beats tract). 

743 spatial_joins.append( 

744 ( 

745 lhs_spatial_family.choose(full_dimensions.elements).name, 

746 rhs_spatial_family.choose(full_dimensions.elements).name, 

747 ) 

748 ) 

749 # Set up any temporal join between the query dimensions and CALIBRATION 

750 # collection's validity ranges. 

751 temporal_join_on: set[ColumnTag] = set() 

752 if any(r.type is CollectionType.CALIBRATION for r in collection_records): 

753 for family in self._dimensions.temporal: 

754 endpoint = family.choose(self._dimensions.elements) 

755 temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan")) 

756 base_columns_required.update(temporal_join_on) 

757 # Note which of the many kinds of potentially-missing columns we have 

758 # and add the rest. 

759 base_columns_required.difference_update(relation.columns) 

760 if base_columns_required: 

761 relation = self._backend.make_dimension_relation( 

762 full_dimensions, 

763 base_columns_required, 

764 self._context, 

765 initial_relation=relation, 

766 # Don't permit joins to use any columns beyond those in the 

767 # original relation, as that would change what this 

768 # operation does. 

769 initial_join_max_columns=frozenset(self._relation.columns), 

770 governor_constraints=self._governor_constraints, 

771 spatial_joins=spatial_joins, 

772 ) 

773 # Finally we can join in the search for the dataset query. 

774 columns = set(columns) 

775 columns.add("dataset_id") 

776 if not collection_records: 

777 relation = relation.join( 

778 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context) 

779 ) 

780 elif find_first: 

781 relation = self._backend.make_dataset_search_relation( 

782 dataset_type, 

783 collection_records, 

784 columns, 

785 self._context, 

786 join_to=relation, 

787 temporal_join_on=temporal_join_on, 

788 ) 

789 else: 

790 relation = self._backend.make_dataset_query_relation( 

791 dataset_type, 

792 collection_records, 

793 columns, 

794 self._context, 

795 join_to=relation, 

796 temporal_join_on=temporal_join_on, 

797 ) 

798 return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer) 

799 
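# Illustrative sketch of a find-first dataset search followed by iteration
# over the resulting references; ``dataset_type`` and ``collections`` are
# hypothetical inputs resolved elsewhere:
#
#     with_datasets = query.find_datasets(dataset_type, collections, find_first=True, defer=True)
#     with with_datasets.open_context():
#         refs = list(with_datasets.iter_dataset_refs(dataset_type))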

800 def sliced( 

801 self, 

802 start: int = 0, 

803 stop: int | None = None, 

804 defer: bool | None = None, 

805 ) -> Query: 

806 """Return a modified `Query` with that takes a slice of this one's 

807 rows. 

808 

809 Parameters 

810 ---------- 

811 start : `int`, optional 

812 First index to include, inclusive. 

813 stop : `int` or `None`, optional 

814 One past the last index to include (i.e. exclusive). 

815 defer : `bool`, optional 

816 If `False`, run the new query immediately. If `True`, do not. If 

817 `None` (default), the ``defer`` option passed when making ``self`` 

818 is used (this option is "sticky"). 

819 

820 Returns 

821 ------- 

822 query : `Query` 

823 New query with the requested slice. 

824 

825 Notes 

826 ----- 

827 This operation must be implemented in the iteration engine if there are 

828 postprocessing operations, which may be much less efficient than 

829 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..`` 

830 in SQL). 

831 

832 Since query row order is usually arbitrary, it usually makes sense to 

833 call `sorted` before calling `sliced` to make the results 

834 deterministic. This is not checked because there are some contexts 

835 where getting an arbitrary subset of the results of a given size 

836 still makes sense. 

837 """ 

838 return self._chain(self._relation[start:stop], defer) 

839 

840 def sorted( 

841 self, 

842 order_by: Iterable[SortTerm], 

843 defer: bool | None = None, 

844 ) -> Query: 

845 """Return a modified `Query` that sorts this one's rows. 

846 

847 Parameters 

848 ---------- 

849 order_by : `~collections.abc.Iterable` [ `SortTerm` ] 

850 Expressions to sort by. 

851 defer : `bool`, optional 

852 If `False`, run the new query immediately. If `True`, do not. If 

853 `None` (default), the ``defer`` option passed when making ``self`` 

854 is used (this option is "sticky"). 

855 

856 Returns 

857 ------- 

858 query : `Query` 

859 New query with the requested sorting. 

860 

861 Notes 

862 ----- 

863 The ``order_by`` expression can include references to dimension record 

864 columns that were not present in the original relation; this is 

865 similar to calling `with_record_columns` for those columns first (but 

866 in this case column requests cannot be satisfied by record caches). 

867 All other columns referenced must be present in the query already. 

868 """ 

869 op = Sort(tuple(order_by)) 

870 columns_required = set(op.columns_required) 

871 columns_required.difference_update(self._relation.columns) 

872 if columns_required: 

873 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

874 columns_required.difference_update(columns_found) 

875 if columns_required: 

876 try: 

877 relation = self._backend.make_dimension_relation( 

878 self._dimensions, 

879 columns_required, 

880 self._context, 

881 initial_relation=relation, 

882 # Don't permit joins to use any columns beyond those in 

883 # the original relation, as that would change what this 

884 # operation does. 

885 initial_join_max_columns=frozenset(self._relation.columns), 

886 governor_constraints=self._governor_constraints, 

887 ) 

888 except ColumnError as err: 

889 raise ColumnError( 

890 "Cannot sort by columns that were not included in the original query or " 

891 "fully resolved by its dimensions." 

892 ) from err 

893 else: 

894 relation = self._relation 

895 relation = op.apply(relation, preferred_engine=self._context.preferred_engine) 

896 return self._chain(relation, defer) 

897 
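# Illustrative sketch of combining `sorted` and `sliced` to get a
# deterministic page of results; ``order_by_terms`` stands for an iterable of
# `SortTerm` objects constructed elsewhere:
#
#     page = query.sorted(order_by_terms, defer=True).sliced(0, 50, defer=True)
#     with page.open_context():
#         rows = list(page)  # mappings from ColumnTag to value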

898 def count(self, *, exact: bool = True, discard: bool = False) -> int: 

899 """Count the number of rows in this query. 

900 

901 Parameters 

902 ---------- 

903 exact : `bool`, optional 

904 If `True` (default), return the exact number of rows. If `False`, 

905 returning an upper bound is permitted if it can be done much more 

906 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but 

907 ignoring client-side filtering that would otherwise take place. 

908 discard : `bool`, optional 

909 If `True`, compute the exact count even if it would require running 

910 the full query and then throwing away the result rows after 

911 counting them. If `False`, this is an error, as the user would 

912 usually be better off executing the query first to fetch its rows 

913 into a new query (or passing ``exact=False``). Ignored if 

914 ``exact=False``. 

915 

916 Returns 

917 ------- 

918 n_rows : `int` 

919 Number of rows in the query, or an upper bound. This includes 

920 duplicates, if there are any. 

921 

922 Raises 

923 ------ 

924 RuntimeError 

925 Raised if an exact count was requested and could not be obtained 

926 without fetching and discarding rows. 

927 """ 

928 if self._relation.min_rows == self._relation.max_rows: 

929 return self._relation.max_rows 

930 return self._context.count(self._relation, exact=exact, discard=discard) 

931 

932 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

933 """Check whether this query has any result rows at all. 

934 

935 Parameters 

936 ---------- 

937 execute : `bool`, optional 

938 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

939 determined prior to execution that the query would return no rows. 

940 exact : `bool`, optional 

941 If `True`, run the full query and perform post-query filtering if 

942 needed, until at least one result row is found. If `False`, the 

943 returned result does not account for post-query filtering, and 

944 hence may be `True` even when all result rows would be filtered 

945 out. 

946 

947 Returns 

948 ------- 

949 any_rows : `bool` 

950 Whether the query has any rows, or whether it may have any rows if 

951 ``exact=False``. 

952 

953 Raises 

954 ------ 

955 RuntimeError 

956 Raised if an exact check was requested and could not be obtained 

957 without executing the query. 

958 """ 

959 if self._relation.min_rows > 0: 

960 return True 

961 if self._relation.max_rows == 0: 

962 return False 

963 if execute: 

964 return self._context.any(self._relation, execute=execute, exact=exact) 

965 elif not exact: 

966 return True 

967 raise TypeError("Cannot obtain exact results without executing the query.") 

968 

969 def explain_no_results(self, execute: bool = True) -> list[str]: 

970 """Return human-readable messages that may help explain why the query 

971 yields no results. 

972 

973 Parameters 

974 ---------- 

975 execute : `bool`, optional 

976 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``) 

977 of aspects of the query to more precisely determine where rows were 

978 filtered out. 

979 

980 Returns 

981 ------- 

982 messages : `~collections.abc.Iterable` [ `str` ] 

983 String messages that describe reasons the query might not yield any 

984 results. 

985 """ 

986 # First try without actually executing any queries. 

987 diagnostics = Diagnostics.run(self._relation) 

988 if diagnostics.is_doomed: 

989 return diagnostics.messages 

990 if execute: 

991 # Try again, running LIMIT 1 queries as we walk back down the tree 

992 # to look for relations with no rows: 

993 diagnostics = Diagnostics.run(self._relation, executor=self._context.any) 

994 if diagnostics.is_doomed: 

995 return diagnostics.messages 

996 return [] 

997 
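# Illustrative sketch of the existence-check and diagnostics pattern
# (``query`` is a hypothetical Query instance):
#
#     with query.open_context():
#         if not query.any(exact=False):
#             for message in query.explain_no_results():
#                 print(message)
#         else:
#             n_rows = query.count(exact=False)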

998 def _copy( 

999 self, 

1000 relation: Relation, 

1001 is_deferred: bool, 

1002 dimensions: DimensionGraph | None = None, 

1003 governor_constraints: Mapping[str, Set[str]] | None = None, 

1004 has_record_columns: bool | DimensionElement | None = None, 

1005 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

1006 ) -> Query: 

1007 """Return a modified copy of this query with some attributes replaced. 

1008 

1009 See class docs for parameter documentation; the only difference here 

1010 is that the defaults are the values ``self`` was constructed with. 

1011 """ 

1012 return Query( 

1013 dimensions=self._dimensions if dimensions is None else dimensions, 

1014 backend=self._backend, 

1015 context=self._context, 

1016 relation=relation, 

1017 governor_constraints=( 

1018 governor_constraints if governor_constraints is not None else self._governor_constraints 

1019 ), 

1020 is_deferred=is_deferred, 

1021 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns, 

1022 record_caches=self._record_caches if record_caches is None else record_caches, 

1023 ) 

1024 

1025 def _chain( 

1026 self, 

1027 relation: Relation, 

1028 defer: bool | None, 

1029 dimensions: DimensionGraph | None = None, 

1030 governor_constraints: Mapping[str, Set[str]] | None = None, 

1031 has_record_columns: bool | DimensionElement | None = None, 

1032 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

1033 ) -> Query: 

1034 """Return a modified query with a new relation while handling the 

1035 ubiquitous ``defer`` parameter's logic. 

1036 

1037 Parameters 

1038 ---------- 

1039 relation : `Relation` 

1040 Relation for the new query. 

1041 defer : `bool` 

1042 If `False`, run the new query immediately. If `True`, do not. If 

1043 `None`, the ``defer`` option passed when making ``self`` is used 

1044 (this option is "sticky"). 

1045 dimensions : `DimensionGraph`, optional 

1046 See class docs. 

1047 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

1048 `~collections.abc.Set` [ `str` ] ], optional 

1049 See class docs. 

1050 has_record_columns : `bool` or `DimensionElement`, optional 

1051 See class docs. 

1052 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \ 

1053 `~collections.abc.Mapping` \ 

1054 [ `DataCoordinate`, `DimensionRecord` ] ], optional 

1055 See class docs. 

1056 

1057 Returns 

1058 ------- 

1059 chained : `Query` 

1060 Modified query, or ``self`` if no modifications were actually 

1061 requested. 

1062 """ 

1063 if defer is None: 

1064 defer = self._is_deferred 

1065 if ( 

1066 relation is self._relation 

1067 and dimensions is None 

1068 and defer == self._is_deferred 

1069 and record_caches is None 

1070 and has_record_columns is None 

1071 and governor_constraints is None 

1072 ): 

1073 return self 

1074 result = self._copy( 

1075 relation, 

1076 is_deferred=True, 

1077 governor_constraints=governor_constraints, 

1078 dimensions=dimensions, 

1079 has_record_columns=has_record_columns, 

1080 record_caches=record_caches, 

1081 ) 

1082 if not defer: 

1083 result = result.run() 

1084 return result