Coverage for python/lsst/daf/butler/registry/queries/_query.py: 15% of 241 statements
coverage.py v7.3.0, created at 2023-09-02 09:34 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ()

import itertools
from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from contextlib import contextmanager
from typing import Any, cast, final

from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
    DimensionRecordColumnTag,
)
from .._collectionType import CollectionType
from ..wildcards import CollectionWildcard
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader


@final
class Query:
    """A general-purpose representation of a registry query.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        The dimensions that span the query and are used to join its relations
        together.
    backend : `QueryBackend`
        Backend object used to create the query and new ones derived from it.
    context : `QueryContext`
        Context manager that holds relation engines and database connections
        for the query.
    relation : `Relation`
        The relation tree representation of the query as a series of operations
        on tables.
    governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ]
        Constraints on governor dimensions encoded in this query's relation.
        This is a mapping from governor dimension name to sets of values that
        dimension may take.
    is_deferred : `bool`
        If `True`, modifier methods that return a related `Query` object should
        not immediately execute the new query.
    has_record_columns : `bool` or `DimensionElement`
        Whether this query's relation already includes columns for all or some
        dimension element records: `True` means all elements in ``dimensions``
        either have records present in ``record_caches`` or all columns present
        in ``relation``, while a specific `DimensionElement` means that element
        does.
    record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
            `~collections.abc.Mapping` \
            [ `DataCoordinate`, `DimensionRecord` ] ], optional
        Cached dimension record values, organized first by dimension element
        and then by data ID.

    Notes
    -----
    Iterating over a `Query` yields mappings from `ColumnTag` to the associated
    value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
    `iter_dimension_records` methods can be used to instead iterate over
    various butler primitives derived from these rows.

    Iterating over a `Query` may or may not execute database queries again each
    time, depending on the state of its relation tree - see `Query.run` for
    details.

    Query is immutable; all methods that might appear to modify it in place
    actually return a new object (though many attributes will be shared).

    Query is currently (still) an internal-to-Registry object, with only the
    "QueryResults" classes that are backed by it directly exposed to users. It
    has been designed with the intent that it will eventually play a larger
    role, either as the main query result object in a redesigned query
    interface, or a "power user" result option that accompanies simpler
    replacements for the current "QueryResults" objects.
    """

    def __init__(
        self,
        dimensions: DimensionGraph,
        backend: QueryBackend[QueryContext],
        context: QueryContext,
        relation: Relation,
        governor_constraints: Mapping[str, Set[str]],
        is_deferred: bool,
        has_record_columns: bool | DimensionElement,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ):
        self._dimensions = dimensions
        self._backend = backend
        self._context = context
        self._relation = relation
        self._governor_constraints = governor_constraints
        self._is_deferred = is_deferred
        self._has_record_columns = has_record_columns
        self._record_caches = record_caches if record_caches is not None else {}

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions that span the query and are used to join its
        relations together (`DimensionGraph`).
        """
        return self._dimensions

    @property
    def relation(self) -> Relation:
        """The relation tree representation of the query as a series of
        operations on tables (`Relation`).
        """
        return self._relation

    @property
    def has_record_columns(self) -> bool | DimensionElement:
        """Whether this query's relation already includes columns for all or
        some dimension element records (`bool` or `DimensionElement`).
        """
        return self._has_record_columns

    @property
    def backend(self) -> QueryBackend[QueryContext]:
        """Backend object used to create the query and new ones derived from it
        (`QueryBackend`).
        """
        return self._backend

    @contextmanager
    def open_context(self) -> Iterator[None]:
        """Return a context manager that ensures a database connection is
        established and temporary tables and cursors have a defined lifetime.

        Returns
        -------
        context : `contextlib.AbstractContextManager`
            Context manager with no return value.
        """
        if self._context.is_open:
            yield
        else:
            with self._context:
                yield
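
    # A minimal sketch of the nesting behavior of `open_context` (illustrative
    # only; ``query`` is an assumed, pre-existing Query instance):
    #
    #     with query.open_context():
    #         # The database connection and any temporary tables stay alive
    #         # for the duration of this block, even across several
    #         # iterations of the same query.
    #         first_pass = list(query.iter_data_ids())
    #         second_pass = list(query.iter_data_ids())
    #
    # If the underlying QueryContext is already open, entering the context
    # manager again is a no-op, so nested calls are safe.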

    def __str__(self) -> str:
        return str(self._relation)

    def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
        return iter(self._context.fetch_iterable(self._relation))

    def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
        """Return an iterator that converts result rows to data IDs.

        Parameters
        ----------
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
            Iterator that yields data IDs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        return (reader.read(row) for row in self)
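
    # Sketch of requesting data IDs for a dimension subset (illustrative;
    # ``query`` and the "visit" dimension name are assumed):
    #
    #     subset = query.dimensions.universe.extract(["visit"])
    #     for data_id in query.iter_data_ids(subset):
    #         ...  # DataCoordinate carrying only the "visit" key
    #
    # A ColumnError is raised if the requested dimensions need columns that
    # the underlying relation does not provide.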

    def iter_dataset_refs(
        self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
    ) -> Iterator[DatasetRef]:
        """Return an iterator that converts result rows to dataset references.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        components : `~collections.abc.Sequence` [ `None` or `str` ]
            Which component dataset types to construct refs for from each row
            representing a parent; `None` for the parent itself.

        Returns
        -------
        refs : `~collections.abc.Iterator` [ `DatasetRef` ]
            Iterator that yields (resolved) dataset references.
        """
        reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            parent_ref = reader.read(row)
            for component in components:
                if component is None:
                    yield parent_ref
                else:
                    yield parent_ref.makeComponentRef(component)
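
    # Sketch of yielding both a parent ref and a component ref from each row
    # (illustrative; ``query``, ``calexp_type``, and the "wcs" component name
    # are assumed, and the query must already include the dataset's columns,
    # e.g. via `find_datasets`):
    #
    #     for ref in query.iter_dataset_refs(calexp_type, components=(None, "wcs")):
    #         ...  # alternates between the parent ref and its "wcs" component ref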

    def iter_data_ids_and_dataset_refs(
        self, dataset_type: DatasetType, dimensions: DimensionGraph | None = None
    ) -> Iterator[tuple[DataCoordinate, DatasetRef]]:
        """Iterate over pairs of data IDs and dataset refs.

        This permits the data ID dimensions to differ from the dataset
        dimensions.

        Parameters
        ----------
        dataset_type : `DatasetType`
            The parent dataset type to yield references for.
        dimensions : `DimensionGraph`, optional
            Dimensions of the data IDs to return. If not provided,
            ``self.dimensions`` is used.

        Returns
        -------
        pairs : `~collections.abc.Iterator` [ `tuple` [ `DataCoordinate`,
            `DatasetRef` ] ]
            An iterator over (data ID, dataset reference) pairs.
        """
        if dimensions is None:
            dimensions = self._dimensions
        data_id_reader = DataCoordinateReader.make(
            dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
        )
        dataset_reader = DatasetRefReader(
            dataset_type,
            translate_collection=self._backend.get_collection_name,
            records=self._has_record_columns is True,
            record_caches=self._record_caches,
        )
        if not (data_id_reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(data_id_reader.columns_required - self.relation.columns)} "
                f"for data IDs with dimensions {dimensions}."
            )
        if not (dataset_reader.columns_required <= self.relation.columns):
            raise ColumnError(
                f"Missing column(s) {set(dataset_reader.columns_required - self.relation.columns)} "
                f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
            )
        for row in self:
            yield (data_id_reader.read(row), dataset_reader.read(row))

    def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
        """Return an iterator that converts result rows to dimension records.

        Parameters
        ----------
        element : `DimensionElement`, optional
            Dimension element whose records will be returned. If not provided,
            `has_record_columns` must be a `DimensionElement` instance.

        Returns
        -------
        records : `~collections.abc.Iterator` [ `DimensionRecord` ]
            Iterator that yields dimension records.
        """
        if element is None:
            match self._has_record_columns:
                case True | False:
                    raise ValueError("No default dimension element in query; 'element' must be given.")
                case only_element_with_records:
                    element = only_element_with_records
        if (cache := self._record_caches.get(element)) is not None:
            return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
        else:
            reader = DimensionRecordReader(element)
            if not (reader.columns_required <= self.relation.columns):
                raise ColumnError(
                    f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
                    f"for records of element {element.name}."
                )
            return (reader.read(row) for row in self)
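
    # Sketch of the two paths through `iter_dimension_records` (illustrative;
    # ``query`` and ``detector_element`` are assumed, with the query built to
    # carry record columns, e.g. via `with_record_columns`):
    #
    #     for record in query.iter_dimension_records(detector_element):
    #         ...  # DimensionRecord instances, served from a cache when one
    #              # is available, otherwise read from the relation's columns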

    def run(self) -> Query:
        """Execute the query and hold its results in memory.

        Returns
        -------
        executed : `Query`
            New query that holds the query results.

        Notes
        -----
        Iterating over the results of a query that has been `run` will always
        iterate over an existing container, while iterating over a query that
        has not been run will result in executing at least some of the query
        each time.

        Running a query also sets its `is_deferred` flag to `False`, which will
        cause new queries constructed by its methods to be run immediately,
        unless ``defer=True`` is passed to the factory method. After a query
        has been run, factory methods will also tend to prefer to apply new
        operations (e.g. `projected`, `sliced`, `sorted`) via Python code
        acting on the existing container rather than going back to SQL, which
        can be less efficient overall than applying operations to a deferred
        query and executing them all only at the end.

        Running a query is represented in terms of relations by adding a
        `~lsst.daf.relation.Materialization` marker relation in the iteration
        engine and then processing the relation tree; this attaches the
        container of rows to that new relation to short-circuit any future
        processing of the tree and lock changes to the tree upstream of it.
        This is very different from the SQL-engine
        `~lsst.daf.relation.Materialization` added to the tree by the
        `materialized` method from a user perspective, though it has a similar
        representation in the relation tree.
        """
        relation = (
            # Make a new relation that definitely ends in the iteration engine
            # (this does nothing if it already does).
            self.relation.transferred_to(self._context.iteration_engine)
            # Make the new relation save its rows to an in-memory Python
            # collection in relation.payload when processed.
            .materialized(name_prefix="run")
        )
        # Actually process the relation, simplifying out trivial relations,
        # executing any SQL queries, and saving results to relation.payload.
        # We discard the simplified relation that's returned, because we want
        # the new query to have any extra diagnostic information contained in
        # the original.
        self._context.process(relation)
        return self._copy(relation, False)
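
    # Sketch of deferred execution vs. `run` (illustrative; ``query`` is an
    # assumed deferred Query and "visit" is a hypothetical dimension name):
    #
    #     narrowed = query.projected(["visit"])   # still deferred, no SQL yet
    #     fetched = narrowed.run()                # executes and holds rows in memory
    #     rows_a = list(fetched)                  # iterates the in-memory container
    #     rows_b = list(fetched)                  # no additional database work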

    def materialized(self, defer_postprocessing: bool = True) -> Query:
        """Materialize the results of this query in its context's preferred
        engine.

        Usually this means inserting the results into a temporary table in a
        database.

        Parameters
        ----------
        defer_postprocessing : `bool`, optional
            If `True`, do not execute operations that occur in the context's
            `QueryContext.iteration_engine` up front; instead insert and
            execute a materialization upstream of them (e.g. via a SQL
            ``INSERT INTO ... SELECT`` statement, with no fetching to the
            client) and execute the postprocessing operations when iterating
            over the query results. If `False`, and iteration-engine
            postprocessing operations exist, run the full query, execute them
            now, and upload the results.
            If the relation is already in the preferred engine, this option
            is ignored and the materialization will not involve fetching rows
            to the iteration engine at all. If the relation has already been
            materialized in the iteration engine (i.e. via `run`), then this
            option is again ignored and an upload of the existing rows will
            be performed.

        Returns
        -------
        materialized : `Query`
            Modified query with the same row-and-column content and a
            materialization in ``self.context.preferred_engine``.
        """
        if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
            relation, stripped = self._context.strip_postprocessing(self._relation)
            if relation.engine == self._context.preferred_engine:
                # We got all the way to the engine we want to materialize in.
                # Apply that operation to the tree, process it (which actually
                # creates a temporary table and populates it), and then reapply
                # the stripped operations.
                relation = relation.materialized()
                self._context.process(relation)
                for operation in stripped:
                    relation = operation.apply(
                        relation, transfer=True, preferred_engine=self._context.iteration_engine
                    )
                return self._copy(relation, True)
        # Either defer_postprocessing=False, or attempting to strip off unary
        # operations until we got to the preferred engine didn't work, because
        # this tree doesn't actually involve the preferred engine. So we just
        # transfer to the preferred engine first, and then materialize,
        # process, and return.
        relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
        self._context.process(relation)
        return self._copy(relation, True)
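
    # Sketch of the intended pattern for reusing an expensive query several
    # times (illustrative; ``query``, ``dataset_type``, and ``collections``
    # are assumed):
    #
    #     with query.open_context():
    #         temp = query.materialized()       # rows land in a temporary table
    #         refs = list(temp.find_datasets(dataset_type, collections).iter_dataset_refs(dataset_type))
    #         ids = list(temp.iter_data_ids())  # reuses the temporary table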

    def projected(
        self,
        dimensions: Iterable[Dimension | str] | None = None,
        unique: bool = True,
        columns: Iterable[ColumnTag] | None = None,
        defer: bool | None = None,
        drop_postprocessing: bool = False,
        keep_record_columns: bool = True,
    ) -> Query:
        """Return a modified `Query` with a subset of this one's columns.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ], \
                optional
            Dimensions to include in the new query. Will be expanded to
            include all required and implied dependencies. Must be a subset of
            ``self.dimensions``. If not provided, ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (default) deduplicate rows after dropping columns.
        columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
            Additional dataset or dimension record columns to include in the
            query. Dimension key columns added here are ignored unless they
            extend beyond the key columns implied by the ``dimensions``
            argument (which is an error).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").
        drop_postprocessing : `bool`, optional
            Drop any iteration-engine operations that depend on columns that
            are being removed (e.g. region-overlap tests when region columns
            are being dropped), making it more likely that projection and
            deduplication could be performed in the preferred engine, where
            they may be more efficient.
        keep_record_columns : `bool`, optional
            If `True` (default) and this query `has_record_columns`, implicitly
            add any of those to ``columns`` whose dimension element is in the
            given ``dimensions``.

        Returns
        -------
        query : `Query`
            New query with the requested columns only, optionally deduplicated.

        Notes
        -----
        Dataset columns are dropped from the new query unless passed via the
        ``columns`` argument. All other columns are by default preserved.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if the columns to include in the new query are not all
            present in the current query.
        """
        if dimensions is None:
            dimensions = set(self._dimensions)
        else:
            dimensions = set(dimensions)
            if columns is not None:
                dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
        dimensions = self._dimensions.universe.extract(dimensions)
        if columns is None:
            columns = set()
        else:
            columns = set(columns)
        columns.update(DimensionKeyColumnTag.generate(dimensions.names))
        if keep_record_columns:
            if self._has_record_columns is True:
                for element in dimensions.elements:
                    if element not in self._record_caches:
                        columns.update(element.RecordClass.fields.columns)
            elif self._has_record_columns in dimensions.elements:
                element = cast(DimensionElement, self._has_record_columns)
                columns.update(element.RecordClass.fields.columns)
        if drop_postprocessing:
            relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
            # Dropping postprocessing Calculations could cause other columns
            # we had otherwise intended to keep to be dropped as well.
            columns &= relation.columns
        else:
            relation = self._relation
        relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
        if unique:
            relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer, dimensions=dimensions)
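
    # Sketch of narrowing a query to fewer dimensions (illustrative; ``query``
    # is an assumed Query spanning, say, {visit, detector, tract, patch}):
    #
    #     per_visit = query.projected(["visit"], unique=True)
    #     # per_visit now has only the visit key columns (plus any implied
    #     # dependencies), with duplicate rows removed.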

    def with_record_columns(
        self, dimension_element: DimensionElement | None = None, defer: bool | None = None
    ) -> Query:
        """Return a modified `Query` with additional dimension record columns
        and/or caches.

        Parameters
        ----------
        dimension_element : `DimensionElement`, optional
            Single element to add record columns for, or `None` (default) to
            add them for all elements in `dimensions`.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested record columns either in the relation
            or (when possible) available via record caching.

        Notes
        -----
        Adding dimension record columns is fundamentally different from adding
        new dimension key columns or dataset columns, because it is purely an
        addition of columns, not rows - we can always join in a dimension
        element table (if it has not already been included) on keys already
        present in the current relation, confident that there is exactly one
        row in the dimension element table for each row in the current
        relation.
        """
        if self._has_record_columns is True or self._has_record_columns == dimension_element:
            return self
        record_caches = dict(self._record_caches)
        columns_required: set[ColumnTag] = set()
        for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
            if element in record_caches:
                continue
            if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
                record_caches[element] = cache
            else:
                columns_required.update(element.RecordClass.fields.columns.keys())
        # Modify the relation we have to remove any projections that dropped
        # columns we now want, as long as the relation's behavior is otherwise
        # unchanged.
        columns_required -= self._relation.columns
        relation, columns_found = self._context.restore_columns(self._relation, columns_required)
        columns_required.difference_update(columns_found)
        if columns_required:
            relation = self._backend.make_dimension_relation(
                self._dimensions,
                columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this operation
                # does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
            )
        return self._chain(
            relation,
            defer=defer,
            has_record_columns=True if dimension_element is None else dimension_element,
            record_caches=record_caches,
        )
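
    # Sketch of producing "expanded" data IDs that carry full records
    # (illustrative; ``query`` is an assumed Query and "detector" is a
    # hypothetical element name):
    #
    #     expanded = query.with_record_columns()
    #     for data_id in expanded.iter_data_ids():
    #         data_id.records["detector"]  # record available, from cache or columns
    #
    # Elements whose records are in a backend cache never add columns to the
    # relation; only uncached elements trigger a join.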

    def find_datasets(
        self,
        dataset_type: DatasetType,
        collections: Any,
        *,
        find_first: bool = True,
        columns: Set[str] = frozenset(("dataset_id", "run")),
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that includes a search for datasets of the
        given type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to search for. May not be a component.
        collections
            Collection search path or pattern. Must be a single collection
            name or an ordered sequence if ``find_first=True``. See
            :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default) search collections in order until the first
            match for each data ID is found. If `False`, return all matches in
            all collections.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include in the new query. Options include

            - ``dataset_id``: the unique identifier of the dataset. The type
              is implementation-dependent. Never nullable. Included by
              default.

            - ``ingest_date``: the date and time the dataset was added to the
              data repository.

            - ``run``: the foreign key column to the `~CollectionType.RUN`
              collection holding the dataset (not necessarily the collection
              name). The type is dependent on the collection manager
              implementation. Included by default.

            - ``collection``: the foreign key column to the collection in
              which the dataset was actually found by this search. The type
              is dependent on the collection manager implementation. This may
              differ from ``run`` if the dataset is present in a matching
              `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
              collection, which means the same dataset may also appear multiple
              times in the query results.

            - ``timespan``: the validity range for datasets found in a
              `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
              collection types.

            The default columns (``dataset_id`` and ``run``) are sufficient to
            enable `iter_dataset_refs`, which also takes care of translating
            the internal ``RUN`` collection key into its public name.

            Setting this to an empty set while passing ``find_first=False``
            will return a query that is constrained by dataset existence in
            some matching collection but does not actually return which
            datasets existed.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested dataset columns, constrained by the
            existence of datasets of this type in the given collection.

        Raises
        ------
        lsst.daf.relation.ColumnError
            Raised if a dataset search is already present in this query and
            this is a find-first search.
        """
        if find_first and DatasetColumnTag.filter_from(self._relation.columns):
            raise ColumnError(
                "Cannot search for datasets with find_first=True "
                "on a query that already includes dataset columns."
            )
        #
        # TODO: it'd be nice to do a QueryContext.restore_columns call here or
        # similar, to look for dataset-constraint joins already present in the
        # relation and expand them to include dataset-result columns as well,
        # instead of doing a possibly-redundant join here. But that would
        # require pushing relation usage down further into
        # DatasetStorageManager.make_relation, so that it doesn't need to be
        # given the columns, and then giving the relation system the ability to
        # simplify-away redundant joins when they only provide columns that
        # aren't ultimately used. The right time to look into that is probably
        # when investigating whether the base QueryBackend should be
        # responsible for producing an "abstract" relation tree of some sort,
        # with the subclasses only responsible for filling it in with payloads
        # (and possibly replacing some leaves with new sub-trees) when
        # "processed" (or in some other "prepare" step).
        #
        # This is a low priority for two reasons:
        # - there's some chance the database's query optimizer will simplify
        #   away these redundant joins;
        # - at present, the main use of this code path is in QG generation,
        #   where we materialize the initial data ID query into a temp table
        #   and hence can't go back and "recover" those dataset columns anyway;
        #
        collections = CollectionWildcard.from_expression(collections)
        if find_first:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            dataset_type,
            collections,
            governor_constraints=self._governor_constraints,
            allow_calibration_collections=True,
            rejections=rejections,
        )
        # If the dataset type has dimensions not in the current query, or we
        # need a temporal join for a calibration collection, either restore
        # those columns or join them in.
        full_dimensions = dataset_type.dimensions.union(self._dimensions)
        relation = self._relation
        record_caches = self._record_caches
        base_columns_required: set[ColumnTag] = {
            DimensionKeyColumnTag(name) for name in full_dimensions.names
        }
        spatial_joins: list[tuple[str, str]] = []
        if not (dataset_type.dimensions <= self._dimensions):
            if self._has_record_columns is True:
                # This query is for expanded data IDs, so if we add new
                # dimensions to the query we need to be able to get records for
                # the new dimensions.
                record_caches = dict(self._record_caches)
                for element in full_dimensions.elements:
                    if element in record_caches:
                        continue
                    if (
                        cache := self._backend.get_dimension_record_cache(element.name, self._context)
                    ) is not None:
                        record_caches[element] = cache
                    else:
                        base_columns_required.update(element.RecordClass.fields.columns.keys())
            # See if we need spatial joins between the current query and the
            # dataset type's dimensions. The logic here is for multiple
            # spatial joins in general, but in practice it'll be exceedingly
            # rare for there to be more than one. We start by figuring out
            # which spatial "families" (observations vs. skymaps, skypix
            # systems) are present on only one side and not the other.
            lhs_spatial_families = self._dimensions.spatial - dataset_type.dimensions.spatial
            rhs_spatial_families = dataset_type.dimensions.spatial - self._dimensions.spatial
            # Now we iterate over the Cartesian product of those, so e.g.
            # if the query has {tract, patch, visit} and the dataset type
            # has {htm7} dimensions, the iterations of this loop
            # correspond to: (skymap, htm), (observations, htm).
            for lhs_spatial_family, rhs_spatial_family in itertools.product(
                lhs_spatial_families, rhs_spatial_families
            ):
                # For each pair we add a join between the most-precise element
                # present in each family (e.g. patch beats tract).
                spatial_joins.append(
                    (
                        lhs_spatial_family.choose(full_dimensions.elements).name,
                        rhs_spatial_family.choose(full_dimensions.elements).name,
                    )
                )
        # Set up any temporal join between the query dimensions and CALIBRATION
        # collection's validity ranges.
        temporal_join_on: set[ColumnTag] = set()
        if any(r.type is CollectionType.CALIBRATION for r in collection_records):
            for family in self._dimensions.temporal:
                endpoint = family.choose(self._dimensions.elements)
                temporal_join_on.add(DimensionRecordColumnTag(endpoint.name, "timespan"))
            base_columns_required.update(temporal_join_on)
        # Note which of the many kinds of potentially-missing columns we have
        # and add the rest.
        base_columns_required.difference_update(relation.columns)
        if base_columns_required:
            relation = self._backend.make_dimension_relation(
                full_dimensions,
                base_columns_required,
                self._context,
                initial_relation=relation,
                # Don't permit joins to use any columns beyond those in the
                # original relation, as that would change what this
                # operation does.
                initial_join_max_columns=frozenset(self._relation.columns),
                governor_constraints=self._governor_constraints,
                spatial_joins=spatial_joins,
            )
        # Finally we can join in the search for the dataset query.
        columns = set(columns)
        columns.add("dataset_id")
        if not collection_records:
            relation = relation.join(
                self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
            )
        elif find_first:
            relation = self._backend.make_dataset_search_relation(
                dataset_type,
                collection_records,
                columns,
                self._context,
                join_to=relation,
                temporal_join_on=temporal_join_on,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                dataset_type,
                collection_records,
                columns,
                self._context,
                join_to=relation,
                temporal_join_on=temporal_join_on,
            )
        return self._chain(relation, dimensions=full_dimensions, record_caches=record_caches, defer=defer)
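
    # Sketch of a find-first dataset search layered onto an existing data ID
    # query (illustrative; ``query``, ``flat_type``, and the collection names
    # are assumed):
    #
    #     with_flats = query.find_datasets(flat_type, ["calib/run1", "calib/run2"])
    #     for ref in with_flats.iter_dataset_refs(flat_type):
    #         ...  # one ref per data ID, from the first collection with a match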

    def sliced(
        self,
        start: int = 0,
        stop: int | None = None,
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that takes a slice of this one's rows.

        Parameters
        ----------
        start : `int`, optional
            First index to include, inclusive.
        stop : `int` or `None`, optional
            One past the last index to include (i.e. exclusive).
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested slice.

        Notes
        -----
        This operation must be implemented in the iteration engine if there are
        postprocessing operations, which may be much less efficient than
        performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
        in SQL).

        Since query row order is usually arbitrary, it usually makes sense to
        call `sorted` before calling `sliced` to make the results
        deterministic. This is not checked because there are some contexts
        where getting an arbitrary subset of the results of a given size
        still makes sense.
        """
        return self._chain(self._relation[start:stop], defer)

    def sorted(
        self,
        order_by: Iterable[SortTerm],
        defer: bool | None = None,
    ) -> Query:
        """Return a modified `Query` that sorts this one's rows.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `SortTerm` ]
            Expressions to sort by.
        defer : `bool`, optional
            If `False`, run the new query immediately. If `True`, do not. If
            `None` (default), the ``defer`` option passed when making ``self``
            is used (this option is "sticky").

        Returns
        -------
        query : `Query`
            New query with the requested sorting.

        Notes
        -----
        The ``order_by`` expression can include references to dimension record
        columns that were not present in the original relation; this is
        similar to calling `with_record_columns` for those columns first (but
        in this case column requests cannot be satisfied by record caches).
        All other columns referenced must be present in the query already.
        """
        op = Sort(tuple(order_by))
        columns_required = set(op.columns_required)
        columns_required.difference_update(self._relation.columns)
        if columns_required:
            relation, columns_found = self._context.restore_columns(self._relation, columns_required)
            columns_required.difference_update(columns_found)
            if columns_required:
                try:
                    relation = self._backend.make_dimension_relation(
                        self._dimensions,
                        columns_required,
                        self._context,
                        initial_relation=relation,
                        # Don't permit joins to use any columns beyond those in
                        # the original relation, as that would change what this
                        # operation does.
                        initial_join_max_columns=frozenset(self._relation.columns),
                        governor_constraints=self._governor_constraints,
                    )
                except ColumnError as err:
                    raise ColumnError(
                        "Cannot sort by columns that were not included in the original query or "
                        "fully resolved by its dimensions."
                    ) from err
        else:
            relation = self._relation
        relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
        return self._chain(relation, defer)
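
    # Sketch of deterministic paging by combining `sorted` and `sliced`
    # (illustrative; ``query`` is an assumed Query and construction of the
    # SortTerm objects is elided):
    #
    #     terms = [...]  # Iterable[SortTerm]
    #     page = query.sorted(terms).sliced(0, 100)
    #     first_hundred = list(page.iter_data_ids())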

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows in this query.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), return the exact number of rows. If `False`,
            returning an upper bound is permitted if it can be done much more
            efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
            ignoring client-side filtering that would otherwise take place.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        n_rows : `int`
            Number of rows in the query, or an upper bound. This includes
            duplicates, if there are any.

        Raises
        ------
        RuntimeError
            Raised if an exact count was requested and could not be obtained
            without fetching and discarding rows.
        """
        if self._relation.min_rows == self._relation.max_rows:
            return self._relation.max_rows
        return self._context.count(self._relation, exact=exact, discard=discard)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Check whether this query has any result rows at all.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any_rows : `bool`
            Whether the query has any rows, or if it may have any rows if
            ``exact=False``.

        Raises
        ------
        TypeError
            Raised if an exact check was requested and could not be obtained
            without executing the query.
        """
        if self._relation.min_rows > 0:
            return True
        if self._relation.max_rows == 0:
            return False
        if execute:
            return self._context.any(self._relation, execute=execute, exact=exact)
        elif not exact:
            return True
        raise TypeError("Cannot obtain exact results without executing the query.")
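
    # Sketch of cheap existence checks before fetching rows (illustrative;
    # ``query`` is an assumed Query):
    #
    #     if not query.any(execute=True, exact=False):
    #         print("; ".join(query.explain_no_results()))
    #     else:
    #         n = query.count(exact=False)  # upper bound, no client-side filtering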

    def explain_no_results(self, execute: bool = True) -> list[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the query to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        # First try without actually executing any queries.
        diagnostics = Diagnostics.run(self._relation)
        if diagnostics.is_doomed:
            return diagnostics.messages
        if execute:
            # Try again, running LIMIT 1 queries as we walk back down the tree
            # to look for relations with no rows:
            diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
            if diagnostics.is_doomed:
                return diagnostics.messages
        return []

    def _copy(
        self,
        relation: Relation,
        is_deferred: bool,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified copy of this query with some attributes replaced.

        See class docs for parameter documentation; the only difference here
        is that the defaults are the values ``self`` was constructed with.
        """
        return Query(
            dimensions=self._dimensions if dimensions is None else dimensions,
            backend=self._backend,
            context=self._context,
            relation=relation,
            governor_constraints=(
                governor_constraints if governor_constraints is not None else self._governor_constraints
            ),
            is_deferred=is_deferred,
            has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
            record_caches=self._record_caches if record_caches is None else record_caches,
        )

    def _chain(
        self,
        relation: Relation,
        defer: bool | None,
        dimensions: DimensionGraph | None = None,
        governor_constraints: Mapping[str, Set[str]] | None = None,
        has_record_columns: bool | DimensionElement | None = None,
        record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
    ) -> Query:
        """Return a modified query with a new relation while handling the
        ubiquitous ``defer`` parameter's logic.

        Parameters
        ----------
        relation : `Relation`
            Relation for the new query.
        defer : `bool`
            If `False`, run the new query immediately. If `True`, do not. If
            `None`, the ``defer`` option passed when making ``self`` is used
            (this option is "sticky").
        dimensions : `DimensionGraph`, optional
            See class docs.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ], optional
            See class docs.
        has_record_columns : `bool` or `DimensionElement`, optional
            See class docs.
        record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
                `~collections.abc.Mapping` \
                [ `DataCoordinate`, `DimensionRecord` ] ], optional
            See class docs.

        Returns
        -------
        chained : `Query`
            Modified query, or ``self`` if no modifications were actually
            requested.
        """
        if defer is None:
            defer = self._is_deferred
        if (
            relation is self._relation
            and dimensions is None
            and defer == self._is_deferred
            and record_caches is None
            and has_record_columns is None
            and governor_constraints is None
        ):
            return self
        result = self._copy(
            relation,
            is_deferred=True,
            governor_constraints=governor_constraints,
            dimensions=dimensions,
            has_record_columns=has_record_columns,
            record_caches=record_caches,
        )
        if not defer:
            result = result.run()
        return result