Coverage for python/lsst/daf/butler/registry/queries/_query.py: 15%

250 statements  

coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = () 

30 

31import itertools 

32from collections.abc import Iterable, Iterator, Mapping, Sequence, Set 

33from contextlib import contextmanager 

34from typing import Any, cast, final 

35 

36from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm 

37 

38from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag 

39from ..._dataset_ref import DatasetRef 

40from ..._dataset_type import DatasetType 

41from ...dimensions import DataCoordinate, DimensionElement, DimensionGroup, DimensionRecord 

42from .._collection_type import CollectionType 

43from ..wildcards import CollectionWildcard 

44from ._query_backend import QueryBackend 

45from ._query_context import QueryContext 

46from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader 

47 

48 

49@final 

50class Query: 

51 """A general-purpose representation of a registry query. 

52 

53 Parameters 

54 ---------- 

55 dimensions : `DimensionGroup` 

56 The dimensions that span the query and are used to join its relations 

57 together. 

58 backend : `QueryBackend` 

59 Backend object used to create the query and new ones derived from it. 

60 context : `QueryContext` 

61 Context manager that holds relation engines and database connections 

62 for the query. 

63 relation : `Relation` 

64 The relation tree representation of the query as a series of operations 

65 on tables. 

66 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

67 `~collections.abc.Set` [ `str` ] ] 

68 Constraints on governor dimensions encoded in this query's relation. 

69 This is a mapping from governor dimension name to sets of values that 

70 dimension may take. 

71 is_deferred : `bool` 

72 If `True`, modifier methods that return a related `Query` object should 

73 not immediately execute the new query. 

74 has_record_columns : `bool` or `DimensionElement` 

75 Whether this query's relation already includes columns for all or some 

76 dimension element records: `True` means all elements in ``dimensions`` 

77 either have records present in ``record_caches`` or have all of their 

78 columns present in ``relation``, while a specific `DimensionElement` 

79 means that only that element does. 

80 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \ 

81 `~collections.abc.Mapping` 

82 [ `DataCoordinate`, `DimensionRecord` ] ], optional 

83 Cached dimension record values, organized first by dimension element 

84 and then by data ID. 

85 

86 Notes 

87 ----- 

88 Iterating over a `Query` yields mappings from `ColumnTag` to the associated 

89 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and 

90 `iter_dimension_records` methods can be used to instead iterate over 

91 various butler primitives derived from these rows. 

92 

93 Iterating over a `Query` may or may not execute database queries again each 

94 time, depending on the state of its relation tree - see `Query.run` for 

95 details. 

96 

97 Query is immutable; all methods that might appear to modify it in place 

98 actually return a new object (though many attributes will be shared). 

99 

100 Query is currently (still) an internal-to-Registry object, with only the 

101 "QueryResults" classes that are backed by it directly exposed to users. It 

102 has been designed with the intent that it will eventually play a larger 

103 role, either as the main query result object in a redesigned query 

104 interface, or a "power user" result option that accompanies simpler 

105 replacements for the current "QueryResults" objects. 

106 """ 

107 
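# The Notes above describe iteration behavior; below is a minimal, hypothetical
# sketch of how a fully-constructed Query is typically consumed (the ``query``
# name is an assumption; in practice instances come from registry internals):
#
#     with query.open_context():
#         for row in query:                # each row is a Mapping[ColumnTag, Any]
#             ...
#         for data_id in query.iter_data_ids():
#             ...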

108 def __init__( 

109 self, 

110 dimensions: DimensionGroup, 

111 backend: QueryBackend[QueryContext], 

112 context: QueryContext, 

113 relation: Relation, 

114 governor_constraints: Mapping[str, Set[str]], 

115 is_deferred: bool, 

116 has_record_columns: bool | DimensionElement, 

117 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

118 ): 

119 self._dimensions = dimensions 

120 self._backend = backend 

121 self._context = context 

122 self._relation = relation 

123 self._governor_constraints = governor_constraints 

124 self._is_deferred = is_deferred 

125 self._has_record_columns = has_record_columns 

126 self._record_caches = record_caches if record_caches is not None else {} 

127 

128 @property 

129 def dimensions(self) -> DimensionGroup: 

130 """The dimensions that span the query and are used to join its 

131 relations together (`DimensionGroup`). 

132 """ 

133 return self._dimensions 

134 

135 @property 

136 def relation(self) -> Relation: 

137 """The relation tree representation of the query as a series of 

138 operations on tables (`Relation`). 

139 """ 

140 return self._relation 

141 

142 @property 

143 def has_record_columns(self) -> bool | DimensionElement: 

144 """Whether this query's relation already includes columns for all or 

145 some dimension element records (`bool` or `DimensionElement`). 

146 """ 

147 return self._has_record_columns 

148 

149 @property 

150 def backend(self) -> QueryBackend[QueryContext]: 

151 """Backend object used to create the query and new ones derived from it 

152 (`QueryBackend`). 

153 """ 

154 return self._backend 

155 

156 @contextmanager 

157 def open_context(self) -> Iterator[None]: 

158 """Return a context manager that ensures a database connection is 

159 established and temporary tables and cursors have a defined lifetime. 

160 

161 Returns 

162 ------- 

163 context : `contextlib.AbstractContextManager` 

164 Context manager with no return value. 

165 """ 

166 if self._context.is_open: 

167 yield 

168 else: 

169 with self._context: 

170 yield 

171 

172 def __str__(self) -> str: 

173 return str(self._relation) 

174 

175 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]: 

176 return iter(self._context.fetch_iterable(self._relation)) 

177 

178 def iter_data_ids(self, dimensions: DimensionGroup | None = None) -> Iterator[DataCoordinate]: 

179 """Return an iterator that converts result rows to data IDs. 

180 

181 Parameters 

182 ---------- 

183 dimensions : `DimensionGroup`, optional 

184 Dimensions of the data IDs to return. If not provided, 

185 ``self.dimensions`` is used. 

186 

187 Returns 

188 ------- 

189 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ] 

190 Iterator that yields data IDs. 

191 """ 

192 if dimensions is None: 

193 dimensions = self._dimensions 

194 reader = DataCoordinateReader.make( 

195 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

196 ) 

197 if not (reader.columns_required <= self.relation.columns): 

198 raise ColumnError( 

199 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

200 f"for data IDs with dimensions {dimensions}." 

201 ) 

202 return (reader.read(row) for row in self) 

203 
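# Hypothetical usage sketch for `iter_data_ids` (assumes ``query`` is an
# existing Query whose dimensions include "detector"; the requested dimensions
# must be a subset of ``query.dimensions``):
#
#     detector_dims = query.dimensions.universe.conform(["detector"])
#     with query.open_context():
#         detector_data_ids = set(query.iter_data_ids(detector_dims))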

204 def iter_dataset_refs( 

205 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,) 

206 ) -> Iterator[DatasetRef]: 

207 """Return an iterator that converts result rows to dataset references. 

208 

209 Parameters 

210 ---------- 

211 dataset_type : `DatasetType` 

212 The parent dataset type to yield references for. 

213 components : `~collections.abc.Sequence` [ `None` or `str` ] 

214 Which component dataset types to construct refs for from each row 

215 representing a parent; `None` for the parent itself. 

216 

217 Returns 

218 ------- 

219 refs : `~collections.abc.Iterator` [ `DatasetRef` ] 

220 Iterator that yields (resolved) dataset references. 

221 """ 

222 reader = DatasetRefReader( 

223 dataset_type, 

224 translate_collection=self._backend.get_collection_name, 

225 records=self._has_record_columns is True, 

226 record_caches=self._record_caches, 

227 ) 

228 if not (reader.columns_required <= self.relation.columns): 

229 raise ColumnError( 

230 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

231 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

232 ) 

233 for row in self: 

234 parent_ref = reader.read(row) 

235 for component in components: 

236 if component is None: 

237 yield parent_ref 

238 else: 

239 yield parent_ref.makeComponentRef(component) 

240 
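# Hypothetical sketch for `iter_dataset_refs` (assumes ``query`` already
# includes the default dataset columns for ``my_dataset_type``, e.g. because it
# was built with `find_datasets`; both names here are placeholders):
#
#     with query.open_context():
#         refs = list(query.iter_dataset_refs(my_dataset_type))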

241 def iter_data_ids_and_dataset_refs( 

242 self, dataset_type: DatasetType, dimensions: DimensionGroup | None = None 

243 ) -> Iterator[tuple[DataCoordinate, DatasetRef]]: 

244 """Iterate over pairs of data IDs and dataset refs. 

245 

246 This permits the data ID dimensions to differ from the dataset 

247 dimensions. 

248 

249 Parameters 

250 ---------- 

251 dataset_type : `DatasetType` 

252 The parent dataset type to yield references for. 

253 dimensions : `DimensionGroup`, optional 

254 Dimensions of the data IDs to return. If not provided, 

255 ``self.dimensions`` is used. 

256 

257 Returns 

258 ------- 

259 pairs : `~collections.abc.Iterator` [ `tuple` [ `DataCoordinate`, 

260 `DatasetRef` ] ] 

261 An iterator over (data ID, dataset reference) pairs. 

262 """ 

263 if dimensions is None: 

264 dimensions = self._dimensions 

265 data_id_reader = DataCoordinateReader.make( 

266 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches 

267 ) 

268 dataset_reader = DatasetRefReader( 

269 dataset_type, 

270 translate_collection=self._backend.get_collection_name, 

271 records=self._has_record_columns is True, 

272 record_caches=self._record_caches, 

273 ) 

274 if not (data_id_reader.columns_required <= self.relation.columns): 

275 raise ColumnError( 

276 f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} " 

277 f"for data IDs with dimensions {dimensions}." 

278 ) 

279 if not (dataset_reader.columns_required <= self.relation.columns): 

280 raise ColumnError( 

281 f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} " 

282 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}." 

283 ) 

284 for row in self: 

285 yield (data_id_reader.read(row), dataset_reader.read(row)) 

286 

287 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]: 

288 """Return an iterator that converts result rows to dimension records. 

289 

290 Parameters 

291 ---------- 

292 element : `DimensionElement`, optional 

293 Dimension element whose records will be returned. If not provided, 

294 `has_record_columns` must be a `DimensionElement` instance. 

295 

296 Returns 

297 ------- 

298 records : `~collections.abc.Iterator` [ `DimensionRecord` ] 

299 Iterator that yields dimension records. 

300 """ 

301 if element is None: 

302 match self._has_record_columns: 

303 case True | False: 

304 raise ValueError("No default dimension element in query; 'element' must be given.") 

305 case only_element_with_records: 

306 element = only_element_with_records 

307 if (cache := self._record_caches.get(element)) is not None: 

308 return (cache[data_id] for data_id in self.iter_data_ids(element.minimal_group)) 

309 else: 

310 reader = DimensionRecordReader(element) 

311 if not (reader.columns_required <= self.relation.columns): 

312 raise ColumnError( 

313 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} " 

314 f"for records of element {element.name}." 

315 ) 

316 return (reader.read(row) for row in self) 

317 
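# Hypothetical sketch for `iter_dimension_records` (assumes ``query`` carries
# record columns or caches for the "visit" element, e.g. via
# `with_record_columns("visit")`):
#
#     visit_element = query.dimensions.universe["visit"]
#     with query.open_context():
#         visit_records = list(query.iter_dimension_records(visit_element))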

318 def run(self) -> Query: 

319 """Execute the query and hold its results in memory. 

320 

321 Returns 

322 ------- 

323 executed : `Query` 

324 New query that holds the query results. 

325 

326 Notes 

327 ----- 

328 Iterating over the results of a query that has been `run` will always 

329 iterate over an existing container, while iterating over a query that 

330 has not been run will result in executing at least some of the query 

331 each time. 

332 

333 Running a query also sets its `is_deferred` flag to `False`, which will 

334 cause new queries constructed by its methods to be run immediately, 

335 unless ``defer=True`` is passed to the factory method. After a query 

336 has been run, factory methods will also tend to prefer to apply new 

337 operations (e.g. `with_only_columns`, `sliced`, `sorted`) via Python 

338 code acting on the existing container rather than going back to SQL, 

339 which can be less efficient overall than applying operations to a 

340 deferred query and executing them all only at the end. 

341 

342 Running a query is represented in terms of relations by adding a 

343 `~lsst.daf.relation.Materialization` marker relation in the iteration 

344 engine and then processing the relation tree; this attaches the 

345 container of rows to that new relation to short-circuit any future 

346 processing of the tree and lock changes to the tree upstream of it. 

347 This is very different from the SQL-engine 

348 `~lsst.daf.relation.Materialization` added to the tree by the 

349 `materialize` method from a user perspective, though it has a similar 

350 representation in the relation tree. 

351 """ 

352 relation = ( 

353 # Make a new relation that definitely ends in the iteration engine 

354 # (this does nothing if it already does). 

355 self.relation.transferred_to(self._context.iteration_engine) 

356 # Make the new relation save its rows to an in-memory Python 

357 # collection in relation.payload when processed. 

358 .materialized(name_prefix="run") 

359 ) 

360 # Actually process the relation, simplifying out trivial relations, 

361 # executing any SQL queries, and saving results to relation.payload. 

362 # We discard the simplified relation that's returned, because we want 

363 # the new query to have any extra diagnostic information contained in 

364 # the original. 

365 self._context.process(relation) 

366 return self._copy(relation, False) 

367 
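# Hypothetical sketch contrasting a deferred Query with one that has been run
# (assumes ``query`` is a deferred Query):
#
#     executed = query.run()               # executes now; rows held in memory
#     first = list(executed)               # iterates the stored container
#     second = list(executed)              # same rows again, no new SQL executed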

368 def materialized(self, defer_postprocessing: bool = True) -> Query: 

369 """Materialize the results of this query in its context's preferred 

370 engine. 

371 

372 Usually this means inserting the results into a temporary table in a 

373 database. 

374 

375 Parameters 

376 ---------- 

377 defer_postprocessing : `bool`, optional 

378 If `True`, do not execute operations that occur in the context's 

379 `QueryContext.iteration_engine` up front; instead insert and 

380 execute a materialization upstream of them (e.g. via a SQL 

381 ``INSERT INTO ... SELECT`` statement, with no fetching to the 

382 client) and execute the postprocessing operations when iterating 

383 over the query results. If `False`, and iteration-engine 

384 postprocessing operations exist, run the full query, execute them 

385 now, and upload the results. 

386 If the relation is already in the preferred engine, this option 

387 is ignored and the materialization will not involve fetching rows 

388 to the iteration engine at all. If the relation has already been 

389 materialized in the iteration engine (i.e. via `run`), then this 

390 option is again ignored and an upload of the existing rows will 

391 be performed. 

392 

393 Returns 

394 ------- 

395 materialized : `Query` 

396 Modified query with the same row-and-column content with a 

397 materialization in ``self.context.preferred_engine``. 

398 """ 

399 if defer_postprocessing or self.relation.engine == self._context.preferred_engine: 

400 relation, stripped = self._context.strip_postprocessing(self._relation) 

401 if relation.engine == self._context.preferred_engine: 

402 # We got all the way to the engine we want to materialize in. 

403 # Apply that operation to the tree, process it (which actually 

404 # creates a temporary table and populates it), and then reapply 

405 # the stripped operations. 

406 relation = relation.materialized() 

407 self._context.process(relation) 

408 for operation in stripped: 

409 relation = operation.apply( 

410 relation, transfer=True, preferred_engine=self._context.iteration_engine 

411 ) 

412 return self._copy(relation, True) 

413 # Either defer_postprocessing=False, or attempting to strip off unary 

414 # operations until we got to the preferred engine didn't work, because 

415 # this tree doesn't actually involve the preferred engine. So we just 

416 # transfer to the preferred engine first, and then materialize, 

417 # process, and return. 

418 relation = self._relation.transferred_to(self._context.preferred_engine).materialized() 

419 self._context.process(relation) 

420 return self._copy(relation, True) 

421 
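# Hypothetical sketch for `materialized` (assumes ``query`` is deferred, spans
# the skymap dimensions used below, and its preferred engine is a SQL database,
# so results land in a temporary table):
#
#     with query.open_context():
#         temp = query.materialized()
#         narrowed = temp.projected(["tract", "patch"], defer=True)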

422 def projected( 

423 self, 

424 dimensions: DimensionGroup | Iterable[str] | None = None, 

425 unique: bool = True, 

426 columns: Iterable[ColumnTag] | None = None, 

427 defer: bool | None = None, 

428 drop_postprocessing: bool = False, 

429 keep_record_columns: bool = True, 

430 ) -> Query: 

431 """Return a modified `Query` with a subset of this one's columns. 

432 

433 Parameters 

434 ---------- 

435 dimensions : `DimensionGroup` or `~collections.abc.Iterable` [ `str` ], \ 

436 optional 

437 Dimensions to include in the new query. Will be expanded to 

438 include all required and implied dependencies. Must be a subset of 

439 ``self.dimensions``. If not provided, ``self.dimensions`` is used. 

440 unique : `bool`, optional 

441 If `True` (default) deduplicate rows after dropping columns. 

442 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional 

443 Additional dataset or dimension record columns to include in the 

444 query. Dimension key columns added here are ignored unless they 

445 extend beyond the key columns implied by the ``dimensions`` 

446 argument (which is an error). 

447 defer : `bool`, optional 

448 If `False`, run the new query immediately. If `True`, do not. If 

449 `None` (default), the ``defer`` option passed when making ``self`` 

450 is used (this option is "sticky"). 

451 drop_postprocessing : `bool`, optional 

452 Drop any iteration-engine operations that depend on columns that 

453 are being removed (e.g. region-overlap tests when region columns 

454 are being dropped), making it more likely that projection and 

455 deduplication could be performed in the preferred engine, where 

456 they may be more efficient. 

457 keep_record_columns : `bool`, optional 

458 If `True` (default) and this query `has_record_columns`, implicitly 

459 add any of those to ``columns`` whose dimension element is in the 

460 given ``dimensions``. 

461 

462 Returns 

463 ------- 

464 query : `Query` 

465 New query with the requested columns only, optionally deduplicated. 

466 

467 Notes 

468 ----- 

469 Dataset columns are dropped from the new query unless passed via the 

470 ``columns`` argument. All other columns are by default preserved. 

471 

472 Raises 

473 ------ 

474 lsst.daf.relation.ColumnError 

475 Raised if the columns to include in the new query are not all 

476 present in the current query. 

477 """ 

478 match dimensions: 

479 case None: 

480 dimensions = set(self._dimensions.names) 

481 case DimensionGroup(): 

482 dimensions = set(dimensions.names) 

483 case iterable: 

484 dimensions = set(iterable) 

485 if columns is not None: 

486 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns)) 

487 dimensions = self._dimensions.universe.conform(dimensions) 

488 if columns is None: 

489 columns = set() 

490 else: 

491 columns = set(columns) 

492 columns.update(DimensionKeyColumnTag.generate(dimensions.names)) 

493 if keep_record_columns: 

494 if self._has_record_columns is True: 

495 for element_name in dimensions.elements: 

496 if element_name not in self._record_caches: 

497 columns.update(self.dimensions.universe[element_name].RecordClass.fields.columns) 

498 elif self._has_record_columns in dimensions.elements: 

499 element = cast(DimensionElement, self._has_record_columns) 

500 columns.update(element.RecordClass.fields.columns) 

501 if drop_postprocessing: 

502 relation = self._context.drop_invalidated_postprocessing(self._relation, columns) 

503 # Dropping postprocessing Calculations could cause other columns 

504 # we had otherwise intended to keep to be dropped as well. 

505 columns &= relation.columns 

506 else: 

507 relation = self._relation 

508 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine) 

509 if unique: 

510 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine) 

511 return self._chain(relation, defer, dimensions=dimensions) 

512 
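# Hypothetical sketch for `projected` (assumes ``query`` spans at least the
# "visit" and "detector" dimensions):
#
#     visits_only = query.projected(["visit"], unique=True, defer=True)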

513 def with_record_columns(self, dimension_element: str | None = None, defer: bool | None = None) -> Query: 

514 """Return a modified `Query` with additional dimension record columns 

515 and/or caches. 

516 

517 Parameters 

518 ---------- 

519 dimension_element : `str`, optional 

520 Name of a single dimension element to add record columns for, or 

521 `None` (the default) to add them for all elements in `dimensions`. 

522 defer : `bool`, optional 

523 If `False`, run the new query immediately. If `True`, do not. If 

524 `None` (default), the ``defer`` option passed when making ``self`` 

525 is used (this option is "sticky"). 

526 

527 Returns 

528 ------- 

529 query : `Query` 

530 New query with the requested record columns either in the relation 

531 or (when possible) available via record caching. 

532 

533 Notes 

534 ----- 

535 Adding dimension record columns is fundamentally different from adding 

536 new dimension key columns or dataset columns, because it is purely an 

537 addition of columns, not rows - we can always join in a dimension 

538 element table (if it has not already been included) on keys already 

539 present in the current relation, confident that there is exactly one 

540 row in the dimension element table for each row in the current 

541 relation. 

542 """ 

543 if self._has_record_columns is True or self._has_record_columns == dimension_element: 

544 return self 

545 record_caches = dict(self._record_caches) 

546 columns_required: set[ColumnTag] = set() 

547 for element_name in self.dimensions.elements if dimension_element is None else [dimension_element]: 

548 element = self.dimensions.universe[element_name] 

549 if element_name in record_caches: 

550 continue 

551 if (cache := self._backend.get_dimension_record_cache(element_name, self._context)) is not None: 

552 record_caches[element] = cache 

553 else: 

554 columns_required.update(element.RecordClass.fields.columns.keys()) 

555 # Modify the relation we have to remove any projections that dropped 

556 # columns we now want, as long as the relation's behavior is otherwise 

557 # unchanged. 

558 columns_required -= self._relation.columns 

559 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

560 columns_required.difference_update(columns_found) 

561 if columns_required: 

562 relation = self._backend.make_dimension_relation( 

563 self._dimensions, 

564 columns_required, 

565 self._context, 

566 initial_relation=relation, 

567 # Don't permit joins to use any columns beyond those in the 

568 # original relation, as that would change what this operation 

569 # does. 

570 initial_join_max_columns=frozenset(self._relation.columns), 

571 governor_constraints=self._governor_constraints, 

572 ) 

573 return self._chain( 

574 relation, 

575 defer=defer, 

576 has_record_columns=( 

577 True if dimension_element is None else self.dimensions.universe[dimension_element] 

578 ), 

579 record_caches=record_caches, 

580 ) 

581 
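# Hypothetical sketch for `with_record_columns` (assumes ``query`` is an
# existing Query; the result can then feed `iter_data_ids` with expanded,
# record-carrying data IDs):
#
#     expanded = query.with_record_columns(defer=True)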

582 def find_datasets( 

583 self, 

584 dataset_type: DatasetType, 

585 collections: Any, 

586 *, 

587 find_first: bool = True, 

588 columns: Set[str] = frozenset(("dataset_id", "run")), 

589 defer: bool | None = None, 

590 ) -> Query: 

591 """Return a modified `Query` that includes a search for datasets of the 

592 given type. 

593 

594 Parameters 

595 ---------- 

596 dataset_type : `DatasetType` 

597 Dataset type to search for. May not be a component. 

598 collections 

599 Collection search path or pattern. Must be a single collection 

600 name or ordered sequence if ``find_first=True``. See 

601 :ref:`daf_butler_collection_expressions` for more information. 

602 find_first : `bool`, optional 

603 If `True` (default) search collections in order until the first 

604 match for each data ID is found. If `False`, return all matches in 

605 all collections. 

606 columns : `~collections.abc.Set` [ `str` ] 

607 Dataset columns to include in the new query. Options include 

608 

609 - ``dataset_id``: the unique identifier of the dataset. The type 

610 is implementation-dependent. Never nullable. Included by 

611 default. 

612 

613 - ``ingest_date``: the date and time the dataset was added to the 

614 data repository. 

615 

616 - ``run``: the foreign key column to the `~CollectionType.RUN` 

617 collection holding the dataset (not necessarily the collection 

618 name). The type is dependent on the collection manager 

619 implementation. Included by default. 

620 

621 - ``collection``: the foreign key column to the collection in which 

622 the dataset was actually found in this search. The type is 

623 dependent on the collection manager implementation. This may 

624 differ from ``run`` if the dataset is present in a matching 

625 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION` 

626 collection, which means the same dataset may also appear multiple 

627 times in the query results. 

628 

629 - ``timespan``: the validity range for datasets found in a 

630 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other 

631 collection types. 

632 

633 The default columns (``dataset_id`` and ``run``) are sufficient to 

634 enable `iter_dataset_refs`, which also takes care of translating 

635 the internal ``RUN`` collection key into its public name. 

636 

637 Setting this to an empty set while passing ``find_first=False`` 

638 will return a query that is constrained by dataset existence in 

639 some matching collection but does not actually return which 

640 datasets existed. 

641 defer : `bool`, optional 

642 If `False`, run the new query immediately. If `True`, do not. If 

643 `None` (default), the ``defer`` option passed when making ``self`` 

644 is used (this option is "sticky"). 

645 

646 Returns 

647 ------- 

648 query : `Query` 

649 New query with the requested dataset columns, constrained by the 

650 existence of datasets of this type in the given collection. 

651 

652 Raises 

653 ------ 

654 lsst.daf.relation.ColumnError 

655 Raised if a dataset search is already present in this query and 

656 this is a find-first search. 

657 """ 

658 if find_first and DatasetColumnTag.filter_from(self._relation.columns): 

659 raise ColumnError( 

660 "Cannot search for datasets with find_first=True " 

661 "on a query that already includes dataset columns." 

662 ) 

663 # 

664 # TODO: it'd be nice to do a QueryContext.restore_columns call here or 

665 # similar, to look for dataset-constraint joins already present in the 

666 # relation and expand them to include dataset-result columns as well, 

667 # instead of doing a possibly-redundant join here. But that would 

668 # require pushing relation usage down further into 

669 # DatasetStorageManager.make_relation, so that it doesn't need to be 

670 # given the columns, and then giving the relation system the ability to 

671 # simplify-away redundant joins when they only provide columns that 

672 # aren't ultimately used. The right time to look into that is probably 

673 # when investigating whether the base QueryBackend should be 

674 # responsible for producing an "abstract" relation tree of some sort, 

675 # with the subclasses only responsible for filling it in with payloads 

676 # (and possibly replacing some leaves with new sub-trees) when it is 

677 # "processed" (or in some other "prepare" step). 

678 # 

679 # This is a low priority for two reasons: 

680 # - there's some chance the database's query optimizer will simplify 

681 # away these redundant joins; 

682 # - at present, the main use of this code path is in QG generation, 

683 # where we materialize the initial data ID query into a temp table 

684 # and hence can't go back and "recover" those dataset columns anyway; 

685 # 

686 collections = CollectionWildcard.from_expression(collections) 

687 if find_first: 

688 collections.require_ordered() 

689 rejections: list[str] = [] 

690 collection_records = self._backend.resolve_dataset_collections( 

691 dataset_type, 

692 collections, 

693 governor_constraints=self._governor_constraints, 

694 allow_calibration_collections=True, 

695 rejections=rejections, 

696 ) 

697 # If the dataset type has dimensions not in the current query, or we 

698 # need a temporal join for a calibration collection, either restore 

699 # those columns or join them in. 

700 full_dimensions = dataset_type.dimensions.as_group().union(self._dimensions) 

701 relation = self._relation 

702 record_caches = self._record_caches 

703 base_columns_required: set[ColumnTag] = { 

704 DimensionKeyColumnTag(name) for name in full_dimensions.names 

705 } 

706 spatial_joins: list[tuple[str, str]] = [] 

707 if not (dataset_type.dimensions <= self._dimensions): 

708 if self._has_record_columns is True: 

709 # This query is for expanded data IDs, so if we add new 

710 # dimensions to the query we need to be able to get records for 

711 # the new dimensions. 

712 record_caches = dict(self._record_caches) 

713 for element_name in full_dimensions.elements: 

714 element = full_dimensions.universe[element_name] 

715 if element in record_caches: 

716 continue 

717 if ( 

718 cache := self._backend.get_dimension_record_cache(element_name, self._context) 

719 ) is not None: 

720 record_caches[element] = cache 

721 else: 

722 base_columns_required.update(element.RecordClass.fields.columns.keys()) 

723 # See if we need spatial joins between the current query and the 

724 # dataset type's dimensions. The logic here is for multiple 

725 # spatial joins in general, but in practice it'll be exceedingly 

726 # rare for there to be more than one. We start by figuring out 

727 # which spatial "families" (observations vs. skymaps, skypix 

728 # systems) are present on only one side and not the other. 

729 lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial 

730 rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial 

731 # Now we iterate over the Cartesian product of those, so e.g. 

732 # if the query has {tract, patch, visit} and the dataset type 

733 # has {htm7} dimensions, the iterations of this loop 

734 # correspond to: (skymap, htm), (observations, htm). 

735 for lhs_spatial_family, rhs_spatial_family in itertools.product( 

736 lhs_spatial_families, rhs_spatial_families 

737 ): 

738 # For each pair we add a join between the most-precise element 

739 # present in each family (e.g. patch beats tract). 

740 spatial_joins.append( 

741 ( 

742 lhs_spatial_family.choose( 

743 full_dimensions.elements.names, self.dimensions.universe 

744 ).name, 

745 rhs_spatial_family.choose( 

746 full_dimensions.elements.names, self.dimensions.universe 

747 ).name, 

748 ) 

749 ) 

750 # Set up any temporal join between the query dimensions and CALIBRATION 

751 # collection's validity ranges. 

752 temporal_join_on: set[ColumnTag] = set() 

753 if any(r.type is CollectionType.CALIBRATION for r in collection_records): 

754 for family in self._dimensions.temporal: 

755 endpoint = family.choose(self._dimensions.elements.names, self.dimensions.universe) 

756 temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan")) 

757 base_columns_required.update(temporal_join_on) 

758 # Note which of the many kinds of potentially-missing columns we have 

759 # and add the rest. 

760 base_columns_required.difference_update(relation.columns) 

761 if base_columns_required: 

762 relation = self._backend.make_dimension_relation( 

763 full_dimensions, 

764 base_columns_required, 

765 self._context, 

766 initial_relation=relation, 

767 # Don't permit joins to use any columns beyond those in the 

768 # original relation, as that would change what this 

769 # operation does. 

770 initial_join_max_columns=frozenset(self._relation.columns), 

771 governor_constraints=self._governor_constraints, 

772 spatial_joins=spatial_joins, 

773 ) 

774 # Finally we can join in the search for the dataset query. 

775 columns = set(columns) 

776 columns.add("dataset_id") 

777 if not collection_records: 

778 relation = relation.join( 

779 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context) 

780 ) 

781 elif find_first: 

782 relation = self._backend.make_dataset_search_relation( 

783 dataset_type, 

784 collection_records, 

785 columns, 

786 self._context, 

787 join_to=relation, 

788 temporal_join_on=temporal_join_on, 

789 ) 

790 else: 

791 relation = self._backend.make_dataset_query_relation( 

792 dataset_type, 

793 collection_records, 

794 columns, 

795 self._context, 

796 join_to=relation, 

797 temporal_join_on=temporal_join_on, 

798 ) 

799 return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer) 

800 
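# Hypothetical sketch for `find_datasets` (``raw_type`` and the collection name
# are placeholders; the collection expression must be ordered because
# ``find_first`` defaults to `True`):
#
#     with_raw = query.find_datasets(raw_type, ["my/run/collection"], defer=True)
#     with with_raw.open_context():
#         raw_refs = list(with_raw.iter_dataset_refs(raw_type))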

801 def sliced( 

802 self, 

803 start: int = 0, 

804 stop: int | None = None, 

805 defer: bool | None = None, 

806 ) -> Query: 

807 """Return a modified `Query` that takes a slice of this one's 

808 rows. 

809 

810 Parameters 

811 ---------- 

812 start : `int`, optional 

813 First index to include, inclusive. 

814 stop : `int` or `None`, optional 

815 One past the last index to include (i.e. exclusive). 

816 defer : `bool`, optional 

817 If `False`, run the new query immediately. If `True`, do not. If 

818 `None` (default), the ``defer`` option passed when making ``self`` 

819 is used (this option is "sticky"). 

820 

821 Returns 

822 ------- 

823 query : `Query` 

824 New query with the requested slice. 

825 

826 Notes 

827 ----- 

828 This operation must be implemented in the iteration engine if there are 

829 postprocessing operations, which may be much less efficient than 

830 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..`` 

831 in SQL). 

832 

833 Since query row order is usually arbitrary, it usually makes sense to 

834 call `sorted` before calling `sliced` to make the results 

835 deterministic. This is not checked because there are some contexts 

836 where getting an arbitrary subset of the results of a given size 

837 still makes sense. 

838 """ 

839 return self._chain(self._relation[start:stop], defer) 

840 

841 def sorted( 

842 self, 

843 order_by: Iterable[SortTerm], 

844 defer: bool | None = None, 

845 ) -> Query: 

846 """Return a modified `Query` that sorts this one's rows. 

847 

848 Parameters 

849 ---------- 

850 order_by : `~collections.abc.Iterable` [ `SortTerm` ] 

851 Expressions to sort by. 

852 defer : `bool`, optional 

853 If `False`, run the new query immediately. If `True`, do not. If 

854 `None` (default), the ``defer`` option passed when making ``self`` 

855 is used (this option is "sticky"). 

856 

857 Returns 

858 ------- 

859 query : `Query` 

860 New query with the requested sorting. 

861 

862 Notes 

863 ----- 

864 The ``order_by`` expression can include references to dimension record 

865 columns that were not present in the original relation; this is 

866 similar to calling `with_record_columns` for those columns first (but 

867 in this case column requests cannot be satisfied by record caches). 

868 All other columns referenced must be present in the query already. 

869 """ 

870 op = Sort(tuple(order_by)) 

871 columns_required = set(op.columns_required) 

872 columns_required.difference_update(self._relation.columns) 

873 if columns_required: 

874 relation, columns_found = self._context.restore_columns(self._relation, columns_required) 

875 columns_required.difference_update(columns_found) 

876 if columns_required: 

877 try: 

878 relation = self._backend.make_dimension_relation( 

879 self._dimensions, 

880 columns_required, 

881 self._context, 

882 initial_relation=relation, 

883 # Don't permit joins to use any columns beyond those in 

884 # the original relation, as that would change what this 

885 # operation does. 

886 initial_join_max_columns=frozenset(self._relation.columns), 

887 governor_constraints=self._governor_constraints, 

888 ) 

889 except ColumnError as err: 

890 raise ColumnError( 

891 "Cannot sort by columns that were not included in the original query or " 

892 "fully resolved by its dimensions." 

893 ) from err 

894 else: 

895 relation = self._relation 

896 relation = op.apply(relation, preferred_engine=self._context.preferred_engine) 

897 return self._chain(relation, defer) 

898 
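# Hypothetical sketch combining `sorted` and `sliced`, following the note in
# `sliced` that sorting first makes a slice deterministic (``order_by_terms``
# is a placeholder for an iterable of `SortTerm` objects built elsewhere):
#
#     page = query.sorted(order_by_terms, defer=True).sliced(0, 100, defer=True).run()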

899 def count(self, *, exact: bool = True, discard: bool = False) -> int: 

900 """Count the number of rows in this query. 

901 

902 Parameters 

903 ---------- 

904 exact : `bool`, optional 

905 If `True` (default), return the exact number of rows. If `False`, 

906 returning an upper bound is permitted if it can be done much more 

907 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but 

908 ignoring client-side filtering that would otherwise take place. 

909 discard : `bool`, optional 

910 If `True`, compute the exact count even if it would require running 

911 the full query and then throwing away the result rows after 

912 counting them. If `False`, this is an error, as the user would 

913 usually be better off executing the query first to fetch its rows 

914 into a new query (or passing ``exact=False``). Ignored if 

915 ``exact=False``. 

916 

917 Returns 

918 ------- 

919 n_rows : `int` 

920 Number of rows in the query, or an upper bound. This includes 

921 duplicates, if there are any. 

922 

923 Raises 

924 ------ 

925 RuntimeError 

926 Raised if an exact count was requested and could not be obtained 

927 without fetching and discarding rows. 

928 """ 

929 if self._relation.min_rows == self._relation.max_rows: 

930 return self._relation.max_rows 

931 return self._context.count(self._relation, exact=exact, discard=discard) 

932 

933 def any(self, *, execute: bool = True, exact: bool = True) -> bool: 

934 """Check whether this query has any result rows at all. 

935 

936 Parameters 

937 ---------- 

938 execute : `bool`, optional 

939 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

940 determined prior to execution that the query would return no rows. 

941 exact : `bool`, optional 

942 If `True`, run the full query and perform post-query filtering if 

943 needed, until at least one result row is found. If `False`, the 

944 returned result does not account for post-query filtering, and 

945 hence may be `True` even when all result rows would be filtered 

946 out. 

947 

948 Returns 

949 ------- 

950 any_rows : `bool` 

951 Whether the query has any rows, or if it may have any rows if 

952 ``exact=False``. 

953 

954 Raises 

955 ------ 

956 TypeError 

957 Raised if an exact check was requested and could not be obtained 

958 without executing the query. 

959 """ 

960 if self._relation.min_rows > 0: 

961 return True 

962 if self._relation.max_rows == 0: 

963 return False 

964 if execute: 

965 return self._context.any(self._relation, execute=execute, exact=exact) 

966 elif not exact: 

967 return True 

968 raise TypeError("Cannot obtain exact results without executing the query.") 

969 

970 def explain_no_results(self, execute: bool = True) -> list[str]: 

971 """Return human-readable messages that may help explain why the query 

972 yields no results. 

973 

974 Parameters 

975 ---------- 

976 execute : `bool`, optional 

977 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``) 

978 of aspects of the query to more precisely determine where rows were 

979 filtered out. 

980 

981 Returns 

982 ------- 

983 messages : `~collections.abc.Iterable` [ `str` ] 

984 String messages that describe reasons the query might not yield any 

985 results. 

986 """ 

987 # First try without actually executing any queries. 

988 diagnostics = Diagnostics.run(self._relation) 

989 if diagnostics.is_doomed: 

990 return diagnostics.messages 

991 if execute: 

992 # Try again, running LIMIT 1 queries as we walk back down the tree 

993 # to look for relations with no rows: 

994 diagnostics = Diagnostics.run(self._relation, executor=self._context.any) 

995 if diagnostics.is_doomed: 

996 return diagnostics.messages 

997 return [] 

998 
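# Hypothetical sketch for the summary methods `any`, `count`, and
# `explain_no_results` (assumes ``query`` is an existing Query):
#
#     with query.open_context():
#         if query.any(execute=True, exact=False):
#             print(query.count(exact=False))
#         else:
#             for message in query.explain_no_results():
#                 print(message)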

999 def _copy( 

1000 self, 

1001 relation: Relation, 

1002 is_deferred: bool, 

1003 dimensions: DimensionGroup | None = None, 

1004 governor_constraints: Mapping[str, Set[str]] | None = None, 

1005 has_record_columns: bool | DimensionElement | None = None, 

1006 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

1007 ) -> Query: 

1008 """Return a modified copy of this query with some attributes replaced. 

1009 

1010 See class docs for parameter documentation; the only difference here 

1011 is that the defaults are the values ``self`` was constructed with. 

1012 """ 

1013 return Query( 

1014 dimensions=self._dimensions if dimensions is None else dimensions, 

1015 backend=self._backend, 

1016 context=self._context, 

1017 relation=relation, 

1018 governor_constraints=( 

1019 governor_constraints if governor_constraints is not None else self._governor_constraints 

1020 ), 

1021 is_deferred=is_deferred, 

1022 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns, 

1023 record_caches=self._record_caches if record_caches is None else record_caches, 

1024 ) 

1025 

1026 def _chain( 

1027 self, 

1028 relation: Relation, 

1029 defer: bool | None, 

1030 dimensions: DimensionGroup | None = None, 

1031 governor_constraints: Mapping[str, Set[str]] | None = None, 

1032 has_record_columns: bool | DimensionElement | None = None, 

1033 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None, 

1034 ) -> Query: 

1035 """Return a modified query with a new relation while handling the 

1036 ubiquitous ``defer`` parameter's logic. 

1037 

1038 Parameters 

1039 ---------- 

1040 relation : `Relation` 

1041 Relation for the new query. 

1042 defer : `bool` 

1043 If `False`, run the new query immediately. If `True`, do not. If 

1044 `None` , the ``defer`` option passed when making ``self`` is used 

1045 (this option is "sticky"). 

1046 dimensions : `DimensionGroup`, optional 

1047 See class docs. 

1048 governor_constraints : `~collections.abc.Mapping` [ `str`, \ 

1049 `~collections.abc.Set` [ `str` ] ], optional 

1050 See class docs. 

1051 has_record_columns : `bool` or `DimensionElement`, optional 

1052 See class docs. 

1053 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \ 

1054 `~collections.abc.Mapping` \ 

1055 [ `DataCoordinate`, `DimensionRecord` ] ], optional 

1056 See class docs. 

1057 

1058 Returns 

1059 ------- 

1060 chained : `Query` 

1061 Modified query, or ``self`` if no modifications were actually 

1062 requested. 

1063 """ 

1064 if defer is None: 

1065 defer = self._is_deferred 

1066 if ( 

1067 relation is self._relation 

1068 and dimensions is None 

1069 and defer == self._is_deferred 

1070 and record_caches is None 

1071 and has_record_columns is None 

1072 and governor_constraints is None 

1073 ): 

1074 return self 

1075 result = self._copy( 

1076 relation, 

1077 is_deferred=True, 

1078 governor_constraints=governor_constraints, 

1079 dimensions=dimensions, 

1080 has_record_columns=has_record_columns, 

1081 record_caches=record_caches, 

1082 ) 

1083 if not defer: 

1084 result = result.run() 

1085 return result