Coverage for python/lsst/daf/butler/registry/queries/_query.py: 24%


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25import dataclasses 

26import enum 

27import itertools 

28from abc import ABC, abstractmethod 

29from contextlib import contextmanager 

30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple 

31 

32import sqlalchemy 

33from lsst.sphgeom import Region 

34 

35from ...core import ( 

36 DataCoordinate, 

37 DatasetRef, 

38 DatasetType, 

39 Dimension, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 SimpleQuery, 

45 SpatialRegionDatabaseRepresentation, 

46 addDimensionForeignKey, 

47 ddl, 

48) 

49from ..interfaces import Database 

50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

51 

52 if TYPE_CHECKING: 

53 from ._builder import QueryBuilder 

54 

55 

56@dataclasses.dataclass(frozen=True) 

57class OrderByColumn: 

58 """Information about single column in ORDER BY clause.""" 

59 

60 column: sqlalchemy.sql.ColumnElement 

61 """Name of the column or `None` for primary key (`str` or `None`)""" 

62 

63 ordering: bool 

64 """True for ascending order, False for descending (`bool`).""" 

65 

66 add_to_select: bool 

67 """True if columns is a non-key column and needs to be added to select 

68 columns explicitly (`bool`).""" 

69 

70 field_spec: Optional[ddl.FieldSpec] 

71 """Field specification for a column in materialized table (`ddl.FieldSpec`) 

72 """ 

73 

74 dimension: Optional[Dimension] 

75 """Not-None if column corresponds to a dimension (`Dimension` or `None`)""" 

76 

77 @property 

78 def column_order(self) -> sqlalchemy.sql.ColumnElement: 

79 """Column element for use in ORDER BY clause 

80 (`sqlalchemy.sql.ColumnElement`) 

81 """ 

82 return self.column.asc() if self.ordering else self.column.desc() 

83 

84 def materialized(self, table: sqlalchemy.schema.Table) -> OrderByColumn: 

85 """Re-purpose ordering column definition for a materialized table. 

86 

87 Parameters 

88 ---------- 

89 table : `sqlalchemy.schema.Table` 

90 Materialized table; it should already have all of the columns in the 

91 SELECT clause. 

92 

93 Returns 

94 ------- 

95 column : `OrderByColumn` 

96 Column definition to use with ORDER BY in materialized table. 

97 """ 

98 return OrderByColumn( 

99 column=table.columns[self.dimension.name if self.dimension else self.column.name], 

100 ordering=self.ordering, 

101 add_to_select=False, 

102 field_spec=None, 

103 dimension=self.dimension, 

104 ) 

105 

106 
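# A minimal usage sketch for `OrderByColumn`, assuming a hypothetical
# SQLAlchemy table ``visit_table`` that exists only for illustration:
#
#     order_col = OrderByColumn(
#         column=visit_table.columns["visit"].label("visit"),
#         ordering=True,          # ascending
#         add_to_select=True,     # non-key column, so add it to the SELECT list
#         field_spec=None,
#         dimension=None,
#     )
#     order_clause = order_col.column_order  # equivalent to column.asc()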

107class Query(ABC): 

108 """An abstract base class for queries that return some combination of 

109 `DatasetRef` and `DataCoordinate` objects. 

110 

111 Parameters 

112 ---------- 

113 graph : `DimensionGraph` 

114 Object describing the dimensions included in the query. 

115 whereRegion : `lsst.sphgeom.Region`, optional 

116 Region that all region columns in all returned rows must overlap. 

117 managers : `RegistryManagers` 

118 A struct containing the registry manager instances used by the query 

119 system. 

120 doomed_by : `Iterable` [ `str` ], optional 

121 A list of messages (appropriate for e.g. logging or exceptions) that 

122 explain why the query is known to return no results even before it is 

123 executed. Queries with a non-empty list will never be executed. 

124 

125 Notes 

126 ----- 

127 The `Query` hierarchy abstracts over the database/SQL representation of a 

128 particular set of data IDs or datasets. It is expected to be used as a 

129 backend for other objects that provide more natural interfaces for one or 

130 both of these, not as part of a public interface to query results. 

131 """ 

132 

133 def __init__( 

134 self, 

135 *, 

136 graph: DimensionGraph, 

137 whereRegion: Optional[Region], 

138 managers: RegistryManagers, 

139 doomed_by: Iterable[str] = (), 

140 ): 

141 self.graph = graph 

142 self.whereRegion = whereRegion 

143 self.managers = managers 

144 self._doomed_by = tuple(doomed_by) 

145 self._filtered_by_join: Optional[int] = None 

146 self._filtered_by_where: Optional[int] = None 

147 

148 @abstractmethod 

149 def isUnique(self) -> bool: 

150 """Return `True` if this query's rows are guaranteed to be unique, and 

151 `False` otherwise. 

152 

153 If this query has dataset results (`datasetType` is not `None`), 

154 uniqueness applies to the `DatasetRef` instances returned by 

155 `extractDatasetRef` from the result of `rows`. If it does not have 

156 dataset results, uniqueness applies to the `DataCoordinate` instances 

157 returned by `extractDataId`. 

158 """ 

159 raise NotImplementedError() 

160 

161 @abstractmethod 

162 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

163 """Return the query column that contains the primary key value for 

164 the dimension with the given name. 

165 

166 Parameters 

167 ---------- 

168 name : `str` 

169 Name of the dimension. 

170 

171 Returns 

172 ------- 

173 column : `sqlalchemy.sql.ColumnElement` 

174 SQLAlchemy object representing a column in the query. 

175 

176 Notes 

177 ----- 

178 This method is intended primarily as a hook for subclasses to implement 

179 and the ABC to call in order to provide higher-level functionality; 

180 code that uses `Query` objects (but does not implement one) should 

181 usually not have to call this method. 

182 """ 

183 raise NotImplementedError() 

184 

185 @property 

186 @abstractmethod 

187 def spatial(self) -> Iterator[DimensionElement]: 

188 """An iterator over the dimension element columns used in post-query 

189 filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]). 

190 

191 Notes 

192 ----- 

193 This property is intended primarily as a hook for subclasses to 

194 implement and the ABC to call in order to provide higher-level 

195 functionality; code that uses `Query` objects (but does not implement 

196 one) should usually not have to access this property. 

197 """ 

198 raise NotImplementedError() 

199 

200 @abstractmethod 

201 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

202 """Return a region column for one of the dimension elements iterated 

203 over by `spatial`. 

204 

205 Parameters 

206 ---------- 

207 name : `str` 

208 Name of the element. 

209 

210 Returns 

211 ------- 

212 column : `sqlalchemy.sql.ColumnElement` 

213 SQLAlchemy object representing a result column in the query. 

214 

215 Notes 

216 ----- 

217 This method is intended primarily as a hook for subclasses to implement 

218 and the ABC to call in order to provide higher-level functionality; 

219 code that uses `Query` objects (but does not implement one) should 

220 usually not have to call this method. 

221 """ 

222 raise NotImplementedError() 

223 

224 @property 

225 def datasetType(self) -> Optional[DatasetType]: 

226 """The `DatasetType` of datasets returned by this query, or `None` 

227 if there are no dataset results (`DatasetType` or `None`). 

228 """ 

229 cols = self.getDatasetColumns() 

230 if cols is None: 

231 return None 

232 return cols.datasetType 

233 

234 def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int: 

235 """Count the number of rows this query would return. 

236 

237 Parameters 

238 ---------- 

239 db : `Database` 

240 Object managing the database connection. 

241 region : `sphgeom.Region`, optional 

242 A region that any result-row regions must overlap in order to be 

243 yielded. If not provided, this will be ``self.whereRegion``, if 

244 that exists. 

245 exact : `bool`, optional 

246 If `True`, run the full query and perform post-query filtering if 

247 needed to account for that filtering in the count. If `False`, the 

248 result may be an upper bound. 

249 

250 Returns 

251 ------- 

252 count : `int` 

253 The number of rows the query would return, or an upper bound if 

254 ``exact=False``. 

255 

256 Notes 

257 ----- 

258 This counts the number of rows returned, not the number of unique rows 

259 returned, so even with ``exact=True`` it may provide only an upper 

260 bound on the number of *deduplicated* result rows. 

261 """ 

262 if self._doomed_by: 

263 return 0 

264 sql = self.sql 

265 if sql is None: 

266 return 1 

267 if exact and self.spatial: 

268 filtered_count = 0 

269 for _ in self.rows(db, region=region): 

270 filtered_count += 1 

271 return filtered_count 

272 else: 

273 return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar() 

274 

275 def any( 

276 self, 

277 db: Database, 

278 *, 

279 region: Optional[Region] = None, 

280 execute: bool = True, 

281 exact: bool = True, 

282 ) -> bool: 

283 """Test whether this query returns any results. 

284 

285 Parameters 

286 ---------- 

287 db : `Database` 

288 Object managing the database connection. 

289 region : `sphgeom.Region`, optional 

290 A region that any result-row regions must overlap in order to be 

291 yielded. If not provided, this will be ``self.whereRegion``, if 

292 that exists. 

293 execute : `bool`, optional 

294 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

295 determined prior to execution that the query would return no rows. 

296 exact : `bool`, optional 

297 If `True`, run the full query and perform post-query filtering if 

298 needed, until at least one result row is found. If `False`, the 

299 returned result does not account for post-query filtering, and 

300 hence may be `True` even when all result rows would be filtered 

301 out. 

302 

303 Returns 

304 ------- 

305 any : `bool` 

306 `True` if the query would (or might, depending on arguments) yield 

307 result rows. `False` if it definitely would not. 

308 """ 

309 if self._doomed_by: 

310 return False 

311 sql = self.sql 

312 if sql is None: 

313 return True 

314 if exact and not execute: 

315 raise TypeError("Cannot obtain exact results without executing the query.") 

316 if exact and self.spatial: 

317 for _ in self.rows(db, region=region): 

318 return True 

319 return False 

320 elif execute: 

321 return db.query(sql.limit(1)).one_or_none() is not None 

322 else: 

323 return True 

324 
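# A minimal usage sketch for `count` and `any`, assuming ``db`` (a `Database`)
# and ``query`` (a concrete `Query`) already exist:
#
#     if query.any(db, execute=True, exact=False):
#         upper_bound = query.count(db, exact=False)
#         exact_total = query.count(db, exact=True)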

325 def explain_no_results( 

326 self, 

327 db: Database, 

328 *, 

329 region: Optional[Region] = None, 

330 followup: bool = True, 

331 ) -> Iterator[str]: 

332 """Return human-readable messages that may help explain why the query 

333 yields no results. 

334 

335 Parameters 

336 ---------- 

337 db : `Database` 

338 Object managing the database connection. 

339 region : `sphgeom.Region`, optional 

340 A region that any result-row regions must overlap in order to be 

341 yielded. If not provided, this will be ``self.whereRegion``, if 

342 that exists. 

343 followup : `bool`, optional 

344 If `True` (default) perform inexpensive follow-up queries if no 

345 diagnostics are available from query generation alone. 

346 

347 Returns 

348 ------- 

349 messages : `Iterator` [ `str` ] 

350 String messages that describe reasons the query might not yield any 

351 results. 

352 

353 Notes 

354 ----- 

355 Messages related to post-query filtering are only available if `rows`, 

356 `any`, or `count` was already called with the same region (with 

357 ``exact=True`` for the latter two). 

358 """ 

359 from ._builder import QueryBuilder 

360 

361 if self._doomed_by: 

362 yield from self._doomed_by 

363 return 

364 if self._filtered_by_where: 

365 yield ( 

366 f"{self._filtered_by_where} result rows were filtered out because " 

367 "one or more region did not overlap the WHERE-clause region." 

368 ) 

369 if self._filtered_by_join: 

370 yield ( 

371 f"{self._filtered_by_join} result rows were filtered out because " 

372 "one or more regions did not overlap." 

373 ) 

374 if (not followup) or self._filtered_by_join or self._filtered_by_where: 

375 return 

376 # Query didn't return results even before client-side filtering, and 

377 # caller says we can do follow-up queries to determine why. 

378 # Start by seeing if there are _any_ dimension records for each element 

379 # involved. 

380 for element in self.graph.elements: 

381 summary = QuerySummary(element.graph) 

382 builder = QueryBuilder(summary, self.managers) 

383 followup_query = builder.finish() 

384 if not followup_query.any(db, exact=False): 

385 yield f"No dimension records for element '{element.name}' found." 

386 yield from followup_query.explain_no_results(db, region=region, followup=False) 

387 return 

388 
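# A minimal usage sketch for `explain_no_results`, assuming ``db`` and
# ``query`` already exist and the query returned nothing:
#
#     for message in query.explain_no_results(db):
#         print(message)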

389 @abstractmethod 

390 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

391 """Return the columns for the datasets returned by this query. 

392 

393 Returns 

394 ------- 

395 columns : `DatasetQueryColumns` or `None` 

396 Struct containing SQLAlchemy representations of the result columns 

397 for a dataset. 

398 

399 Notes 

400 ----- 

401 This method is intended primarily as a hook for subclasses to implement 

402 and the ABC to call in order to provide higher-level functionality; 

403 code that uses `Query` objects (but does not implement one) should 

404 usually not have to call this method. 

405 """ 

406 raise NotImplementedError() 

407 

408 @property 

409 @abstractmethod 

410 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

411 """A SQLAlchemy object representing the full query 

412 (`sqlalchemy.sql.FromClause` or `None`). 

413 

414 This is `None` in the special case where the query has no columns, and 

415 only one logical row. 

416 """ 

417 raise NotImplementedError() 

418 

419 def rows( 

420 self, db: Database, *, region: Optional[Region] = None 

421 ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]: 

422 """Execute the query and yield result rows, applying `predicate`. 

423 

424 Parameters 

425 ---------- 

426 db : `Database` 

427 Object managing the database connection. 

428 region : `sphgeom.Region`, optional 

429 A region that any result-row regions must overlap in order to be 

430 yielded. If not provided, this will be ``self.whereRegion``, if 

431 that exists. 

432 

433 Yields 

434 ------ 

435 row : `sqlalchemy.engine.RowProxy` or `None` 

436 Result row from the query. `None` may be yielded exactly once instead 

437 of any real rows to indicate an empty query (see `EmptyQuery`). 

438 """ 

439 if self._doomed_by: 

440 return 

441 whereRegion = region if region is not None else self.whereRegion 

442 self._filtered_by_where = 0 

443 self._filtered_by_join = 0 

444 for row in db.query(self.sql): 

445 rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial] 

446 if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions): 

447 self._filtered_by_where += 1 

448 continue 

449 if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)): 

450 self._filtered_by_join += 1 

451 continue 

452 yield row 

453 
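# A minimal usage sketch for `rows`, `extractDataId`, and `extractDatasetRef`,
# assuming ``db`` and ``query`` already exist:
#
#     for row in query.rows(db):
#         data_id = query.extractDataId(row)
#         if query.datasetType is not None:
#             ref = query.extractDatasetRef(row, dataId=data_id)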

454 def extractDimensionsTuple( 

455 self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension] 

456 ) -> tuple: 

457 """Extract a tuple of data ID values from a result row. 

458 

459 Parameters 

460 ---------- 

461 row : `sqlalchemy.engine.RowProxy` or `None` 

462 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

463 the row from an `EmptyQuery`. 

464 dimensions : `Iterable` [ `Dimension` ] 

465 The dimensions to include in the returned tuple, in order. 

466 

467 Returns 

468 ------- 

469 values : `tuple` 

470 A tuple of dimension primary key values. 

471 """ 

472 if row is None: 

473 assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions." 

474 return () 

475 return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions) 

476 

477 def extractDataId( 

478 self, 

479 row: Optional[sqlalchemy.engine.RowProxy], 

480 *, 

481 graph: Optional[DimensionGraph] = None, 

482 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

483 ) -> DataCoordinate: 

484 """Extract a data ID from a result row. 

485 

486 Parameters 

487 ---------- 

488 row : `sqlalchemy.engine.RowProxy` or `None` 

489 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

490 the row from an `EmptyQuery`. 

491 graph : `DimensionGraph`, optional 

492 The dimensions the returned data ID should identify. If not 

493 provided, this will be all dimensions in `QuerySummary.requested`. 

494 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

495 Nested mapping containing records to attach to the returned 

496 `DataCoordinate`, for which `~DataCoordinate.hasRecords` will 

497 return `True`. If provided, outer keys must include all dimension 

498 element names in ``graph``, and inner keys should be tuples of 

499 dimension primary key values in the same order as 

500 ``element.graph.required``. If not provided, 

501 `DataCoordinate.hasRecords` will return `False` on the returned 

502 object. 

503 

504 Returns 

505 ------- 

506 dataId : `DataCoordinate` 

507 A data ID that identifies all required and implied dimensions. If 

508 ``records is not None``, this will have 

509 `~DataCoordinate.hasRecords()` return `True`. 

510 """ 

511 if graph is None: 

512 graph = self.graph 

513 if not graph: 

514 return DataCoordinate.makeEmpty(self.graph.universe) 

515 dataId = DataCoordinate.fromFullValues( 

516 graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied)) 

517 ) 

518 if records is not None: 

519 recordsForRow = {} 

520 for element in graph.elements: 

521 key = tuple(dataId.subset(element.graph).values()) 

522 recordsForRow[element.name] = records[element.name].get(key) 

523 return dataId.expanded(recordsForRow) 

524 else: 

525 return dataId 

526 

527 def extractDatasetRef( 

528 self, 

529 row: sqlalchemy.engine.RowProxy, 

530 dataId: Optional[DataCoordinate] = None, 

531 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

532 ) -> DatasetRef: 

533 """Extract a `DatasetRef` from a result row. 

534 

535 Parameters 

536 ---------- 

537 row : `sqlalchemy.engine.RowProxy` 

538 A result row from a SQLAlchemy SELECT query. 

539 dataId : `DataCoordinate` 

540 Data ID to attach to the `DatasetRef`. A minimal (i.e. base class) 

541 `DataCoordinate` is constructed from ``row`` if `None`. 

542 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

543 Records to use to return an `ExpandedDataCoordinate`. If provided, 

544 outer keys must include all dimension element names in ``graph``, 

545 and inner keys should be tuples of dimension primary key values 

546 in the same order as ``element.graph.required``. 

547 

548 Returns 

549 ------- 

550 ref : `DatasetRef` 

551 Reference to the dataset; guaranteed to have `DatasetRef.id` not 

552 `None`. 

553 """ 

554 datasetColumns = self.getDatasetColumns() 

555 assert datasetColumns is not None 

556 if dataId is None: 

557 dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records) 

558 runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]] 

559 return DatasetRef( 

560 datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name 

561 ) 

562 

563 def _makeSubsetQueryColumns( 

564 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

565 ) -> Tuple[DimensionGraph, Optional[QueryColumns]]: 

566 """Helper method for subclass implementations of `subset`. 

567 

568 Parameters 

569 ---------- 

570 graph : `DimensionGraph`, optional 

571 Dimensions to include in the new `Query` being constructed. 

572 ``subset`` implementations should generally just forward their 

573 own ``graph`` argument here. 

574 datasets : `bool`, optional 

575 Whether the new `Query` should include dataset results. Defaults 

576 to `True`, but is ignored if ``self`` does not include dataset 

577 results. 

578 unique : `bool`, optional 

579 Whether the new `Query` should guarantee unique results (this may 

580 come with a performance penalty). 

581 

582 Returns 

583 ------- 

584 graph : `DimensionGraph` 

585 The dimensions of the new `Query`. This is exactly the same as 

586 the argument of the same name, with ``self.graph`` used if that 

587 argument is `None`. 

588 columns : `QueryColumns` or `None` 

589 A struct containing the SQLAlchemy column objects to use in the 

590 new query, constructed by delegating to other (mostly abstract) 

591 methods on ``self``. If `None`, `subset` may return ``self``. 

592 """ 

593 if graph is None: 

594 graph = self.graph 

595 if ( 

596 graph == self.graph 

597 and (self.getDatasetColumns() is None or datasets) 

598 and (self.isUnique() or not unique) 

599 ): 

600 return graph, None 

601 columns = QueryColumns() 

602 for dimension in graph.dimensions: 

603 col = self.getDimensionColumn(dimension.name) 

604 columns.keys[dimension] = [col] 

605 if not unique: 

606 for element in self.spatial: 

607 col = self.getRegionColumn(element.name) 

608 columns.regions[element] = col 

609 if datasets and self.getDatasetColumns() is not None: 

610 columns.datasets = self.getDatasetColumns() 

611 return graph, columns 

612 

613 @abstractmethod 

614 def materialize(self, db: Database) -> ContextManager[Query]: 

615 """Execute this query and insert its results into a temporary table. 

616 

617 Parameters 

618 ---------- 

619 db : `Database` 

620 Database engine to execute the query against. 

621 

622 Returns 

623 ------- 

624 context : `typing.ContextManager` [ `MaterializedQuery` ] 

625 A context manager that ensures the temporary table is created and 

626 populated in ``__enter__`` (returning a `MaterializedQuery` object 

627 backed by that table), and dropped in ``__exit__``. If ``self`` 

628 is already a `MaterializedQuery`, ``__enter__`` may just return 

629 ``self`` and ``__exit__`` may do nothing (reflecting the fact that 

630 an outer context manager should already take care of everything 

631 else). 

632 """ 

633 raise NotImplementedError() 

634 
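# A minimal usage sketch for `materialize`, assuming ``db`` and ``query``
# already exist; the temporary table only lives for the duration of the block:
#
#     with query.materialize(db) as materialized:
#         for row in materialized.rows(db):
#             ...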

635 @abstractmethod 

636 def subset( 

637 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

638 ) -> Query: 

639 """Return a new `Query` whose columns and/or rows are (mostly) subset 

640 of this one's. 

641 

642 Parameters 

643 ---------- 

644 graph : `DimensionGraph`, optional 

645 Dimensions to include in the new `Query` being constructed. 

646 If `None` (default), ``self.graph`` is used. 

647 datasets : `bool`, optional 

648 Whether the new `Query` should include dataset results. Defaults 

649 to `True`, but is ignored if ``self`` does not include dataset 

650 results. 

651 unique : `bool`, optional 

652 Whether the new `Query` should guarantee unique results (this may 

653 come with a performance penalty). 

654 

655 Returns 

656 ------- 

657 query : `Query` 

658 A query object corresponding to the given inputs. May be ``self`` 

659 if no changes were requested. 

660 

661 Notes 

662 ----- 

663 The way spatial overlaps are handled at present makes it impossible to 

664 fully guarantee in general that the new query's rows are a subset of 

665 this one's while also returning unique rows. That's because the 

666 database is only capable of performing approximate, conservative 

667 overlaps via the common skypix system; we defer actual region overlap 

668 operations to per-result-row Python logic. But including the region 

669 columns necessary to do that postprocessing in the query makes it 

670 impossible to do a SELECT DISTINCT on the user-visible dimensions of 

671 the query. For example, consider starting with a query with dimensions 

672 (instrument, skymap, visit, tract). That involves a spatial join 

673 between visit and tract, and we include the region columns from both 

674 tables in the results in order to only actually yield result rows 

675 (see `predicate` and `rows`) where the regions in those two columns 

676 overlap. If the user then wants to subset to just (skymap, tract) with 

677 unique results, we have two unpalatable options: 

678 

679 - we can do a SELECT DISTINCT with just the skymap and tract columns 

680 in the SELECT clause, dropping all detailed overlap information and 

681 including some tracts that did not actually overlap any of the 

682 visits in the original query (but were regarded as _possibly_ 

683 overlapping via the coarser, common-skypix relationships); 

684 

685 - we can include the tract and visit region columns in the query, and 

686 continue to filter out the non-overlapping pairs, but completely 

687 disregard the user's request for unique tracts. 

688 

689 This interface specifies that implementations must do the former, as 

690 that's what makes things efficient in our most important use case 

691 (``QuantumGraph`` generation in ``pipe_base``). We may be able to 

692 improve this situation in the future by putting exact overlap 

693 information in the database, either by using built-in (but 

694 engine-specific) spatial database functionality or (more likely) 

695 switching to a scheme in which pairwise dimension spatial relationships 

696 are explicitly precomputed (for e.g. combinations of instruments and 

697 skymaps). 

698 """ 

699 raise NotImplementedError() 

700 
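# A minimal usage sketch for `subset`, assuming ``tract_graph`` is a
# hypothetical `DimensionGraph` that is a subset of ``query.graph``:
#
#     unique_tracts = query.subset(graph=tract_graph, datasets=False, unique=True)
#     for row in unique_tracts.rows(db):
#         data_id = unique_tracts.extractDataId(row)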

701 @abstractmethod 

702 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

703 """Return a `QueryBuilder` that can be used to construct a new `Query` 

704 that is joined to (and hence constrained by) this one. 

705 

706 Parameters 

707 ---------- 

708 summary : `QuerySummary`, optional 

709 A `QuerySummary` instance that specifies the dimensions and any 

710 additional constraints to include in the new query being 

711 constructed, or `None` to use the dimensions of ``self`` with no 

712 additional constraints. 

713 """ 

714 raise NotImplementedError() 

715 
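# A minimal usage sketch for `makeBuilder`, assuming ``query`` already exists;
# `QueryBuilder.finish` (used above in `explain_no_results`) produces the new
# joined `Query`:
#
#     builder = query.makeBuilder()
#     joined_query = builder.finish()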

716 graph: DimensionGraph 

717 """The dimensions identified by this query and included in any data IDs 

718 created from its result rows (`DimensionGraph`). 

719 """ 

720 

721 whereRegion: Optional[Region] 

722 """A spatial region that all regions in all rows returned by this query 

723 must overlap (`lsst.sphgeom.Region` or `None`). 

724 """ 

725 

726 managers: RegistryManagers 

727 """A struct containing `Registry` helper object (`RegistryManagers`). 

728 """ 

729 

730 

731class DirectQueryUniqueness(enum.Enum): 

732 """An enum representing the ways in which a query can have unique rows (or 

733 not). 

734 """ 

735 

736 NOT_UNIQUE = enum.auto() 

737 """The query is not expected to have unique rows. 

738 """ 

739 

740 NATURALLY_UNIQUE = enum.auto() 

741 """The construction of the query guarantees that it will have unique 

742 result rows, even without SELECT DISTINCT or a GROUP BY clause. 

743 """ 

744 

745 NEEDS_DISTINCT = enum.auto() 

746 """The query is expected to yield unique result rows, and needs to use 

747 SELECT DISTINCT or an equivalent GROUP BY clause to achieve this. 

748 """ 

749 

750 

751class DirectQuery(Query): 

752 """A `Query` implementation that represents a direct SELECT query that 

753 usually joins many tables. 

754 

755 `DirectQuery` objects should generally only be constructed by 

756 `QueryBuilder` or the methods of other `Query` objects. 

757 

758 Parameters 

759 ---------- 

760 simpleQuery : `SimpleQuery` 

761 Struct representing the actual SELECT, FROM, and WHERE clauses. 

762 columns : `QueryColumns` 

763 Columns that are referenced in the query in any clause. 

764 uniqueness : `DirectQueryUniqueness` 

765 Enum value indicating whether the query should yield unique result 

766 rows, and if so whether that needs to be explicitly requested of the 

767 database. 

768 graph : `DimensionGraph` 

769 Object describing the dimensions included in the query. 

770 whereRegion : `lsst.sphgeom.Region`, optional 

771 Region that all region columns in all returned rows must overlap. 

772 managers : `RegistryManagers` 

773 Struct containing the `Registry` manager helper objects, to be 

774 forwarded to the `Query` constructor. 

775 doomed_by : `Iterable` [ `str` ], optional 

776 A list of messages (appropriate for e.g. logging or exceptions) that 

777 explain why the query is known to return no results even before it is 

778 executed. Queries with a non-empty list will never be executed. 

779 """ 

780 

781 def __init__( 

782 self, 

783 *, 

784 simpleQuery: SimpleQuery, 

785 columns: QueryColumns, 

786 uniqueness: DirectQueryUniqueness, 

787 graph: DimensionGraph, 

788 whereRegion: Optional[Region], 

789 managers: RegistryManagers, 

790 order_by_columns: Iterable[OrderByColumn] = (), 

791 limit: Optional[Tuple[int, Optional[int]]] = None, 

792 doomed_by: Iterable[str] = (), 

793 ): 

794 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by) 

795 assert not simpleQuery.columns, "Columns should always be set on a copy in .sql" 

796 assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns." 

797 self._simpleQuery = simpleQuery 

798 self._columns = columns 

799 self._uniqueness = uniqueness 

800 self._order_by_columns = order_by_columns 

801 self._limit = limit 

802 self._datasetQueryColumns: Optional[DatasetQueryColumns] = None 

803 self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

804 self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

805 

806 def isUnique(self) -> bool: 

807 # Docstring inherited from Query. 

808 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE 

809 

810 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

811 # Docstring inherited from Query. 

812 column = self._dimensionColumns.get(name) 

813 if column is None: 

814 column = self._columns.getKeyColumn(name).label(name) 

815 self._dimensionColumns[name] = column 

816 return column 

817 

818 @property 

819 def spatial(self) -> Iterator[DimensionElement]: 

820 # Docstring inherited from Query. 

821 return iter(self._columns.regions) 

822 

823 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

824 # Docstring inherited from Query. 

825 column = self._regionColumns.get(name) 

826 if column is None: 

827 column = self._columns.regions[name].column.label(f"{name}_region") 

828 self._regionColumns[name] = column 

829 return column 

830 

831 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

832 # Docstring inherited from Query. 

833 if self._datasetQueryColumns is None: 

834 base = self._columns.datasets 

835 if base is None: 

836 return None 

837 ingestDate = base.ingestDate 

838 if ingestDate is not None: 

839 ingestDate = ingestDate.label("ingest_date") 

840 self._datasetQueryColumns = DatasetQueryColumns( 

841 datasetType=base.datasetType, 

842 id=base.id.label("dataset_id"), 

843 runKey=base.runKey.label(self.managers.collections.getRunForeignKeyName()), 

844 ingestDate=ingestDate, 

845 ) 

846 return self._datasetQueryColumns 

847 

848 @property 

849 def sql(self) -> sqlalchemy.sql.FromClause: 

850 # Docstring inherited from Query. 

851 simpleQuery = self._simpleQuery.copy() 

852 for dimension in self.graph: 

853 simpleQuery.columns.append(self.getDimensionColumn(dimension.name)) 

854 for element in self.spatial: 

855 simpleQuery.columns.append(self.getRegionColumn(element.name)) 

856 datasetColumns = self.getDatasetColumns() 

857 if datasetColumns is not None: 

858 simpleQuery.columns.extend(datasetColumns) 

859 

860 if self._order_by_columns: 

861 # Add any non-key ORDER BY columns to the SELECT list, then apply ORDER BY. 

862 select_columns = [column.column for column in self._order_by_columns if column.add_to_select] 

863 simpleQuery.columns.extend(select_columns) 

864 sql = simpleQuery.combine() 

865 order_by_columns = [column.column_order for column in self._order_by_columns] 

866 sql = sql.order_by(*order_by_columns) 

867 else: 

868 sql = simpleQuery.combine() 

869 

870 if self._limit: 

871 sql = sql.limit(self._limit[0]) 

872 if self._limit[1] is not None: 

873 sql = sql.offset(self._limit[1]) 

874 

875 if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT: 

876 return sql.distinct() 

877 else: 

878 return sql 

879 

880 def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec: 

881 """Helper method for subclass implementations of `materialize`. 

882 

883 Parameters 

884 ---------- 

885 constraints : `bool`, optional 

886 If `True` (`False` is default), define a specification that 

887 includes actual foreign key constraints for logical foreign keys. 

888 Some database engines do not permit temporary tables to reference 

889 normal tables, so this should be `False` when generating a spec 

890 for a temporary table unless the database engine is known to 

891 support them. 

892 

893 Returns 

894 ------- 

895 spec : `ddl.TableSpec` 

896 Specification for a table that could hold this query's result rows. 

897 """ 

898 unique = self.isUnique() 

899 spec = ddl.TableSpec(fields=()) 

900 for dimension in self.graph: 

901 addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints) 

902 for element in self.spatial: 

903 spec.fields.update( 

904 SpatialRegionDatabaseRepresentation.makeFieldSpecs( 

905 nullable=True, 

906 name=f"{element.name}_region", 

907 ) 

908 ) 

909 datasetColumns = self.getDatasetColumns() 

910 if datasetColumns is not None: 

911 self.managers.datasets.addDatasetForeignKey(spec, primaryKey=unique, constraint=constraints) 

912 self.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints) 

913 

914 # May need a few extra columns for ORDER BY. 

915 spec.fields.update( 

916 column.field_spec for column in self._order_by_columns if column.field_spec is not None 

917 ) 

918 

919 return spec 

920 

921 @contextmanager 

922 def materialize(self, db: Database) -> Iterator[Query]: 

923 # Docstring inherited from Query. 

924 spec = self._makeTableSpec() 

925 with db.session() as session: 

926 table = session.makeTemporaryTable(spec) 

927 if not self._doomed_by: 

928 db.insert(table, select=self.sql, names=spec.fields.names) 

929 order_by_columns = [column.materialized(table) for column in self._order_by_columns] 

930 yield MaterializedQuery( 

931 table=table, 

932 spatial=self.spatial, 

933 datasetType=self.datasetType, 

934 isUnique=self.isUnique(), 

935 graph=self.graph, 

936 whereRegion=self.whereRegion, 

937 managers=self.managers, 

938 doomed_by=self._doomed_by, 

939 order_by_columns=order_by_columns, 

940 ) 

941 session.dropTemporaryTable(table) 

942 

943 def subset( 

944 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

945 ) -> Query: 

946 # Docstring inherited from Query. 

947 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

948 if columns is None: 

949 return self 

950 if columns.isEmpty(): 

951 return EmptyQuery(self.graph.universe, self.managers) 

952 return DirectQuery( 

953 simpleQuery=self._simpleQuery.copy(), 

954 columns=columns, 

955 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

956 graph=graph, 

957 whereRegion=self.whereRegion if not unique else None, 

958 managers=self.managers, 

959 doomed_by=self._doomed_by, 

960 ) 

961 

962 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

963 # Docstring inherited from Query. 

964 from ._builder import QueryBuilder 

965 

966 if summary is None: 

967 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

968 if not summary.requested.issubset(self.graph): 

969 raise NotImplementedError( 

970 f"Query.makeBuilder does not yet support augmenting dimensions " 

971 f"({summary.requested.dimensions}) beyond those originally included in the query " 

972 f"({self.graph.dimensions})." 

973 ) 

974 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

975 builder.joinTable( 

976 self.sql.alias(), dimensions=self.graph.dimensions, datasets=self.getDatasetColumns() 

977 ) 

978 return builder 

979 

980 

981class MaterializedQuery(Query): 

982 """A `Query` implementation that represents query results saved in a 

983 temporary table. 

984 

985 `MaterializedQuery` instances should not be constructed directly; use 

986 `Query.materialize()` instead. 

987 

988 Parameters 

989 ---------- 

990 table : `sqlalchemy.schema.Table` 

991 SQLAlchemy object representing the temporary table. 

992 spatial : `Iterable` [ `DimensionElement` ] 

993 Spatial dimension elements whose regions must overlap for each valid 

994 result row (which may reject some rows that are in the table). 

995 datasetType : `DatasetType` 

996 The `DatasetType` of datasets returned by this query, or `None` 

997 if there are no dataset results 

998 isUnique : `bool` 

999 If `True`, the table's rows are unique, and there is no need to 

1000 add ``SELECT DISTINCT`` to guarantee this in results. 

1001 graph : `DimensionGraph` 

1002 Dimensions included in the columns of this table. 

1003 whereRegion : `Region` or `None` 

1004 A spatial region all result-row regions must overlap to be valid (which 

1005 may reject some rows that are in the table). 

1006 managers : `RegistryManagers` 

1007 A struct containing `Registry` manager helper objects, forwarded to 

1008 the `Query` constructor. 

1009 doomed_by : `Iterable` [ `str` ], optional 

1010 A list of messages (appropriate for e.g. logging or exceptions) that 

1011 explain why the query is known to return no results even before it is 

1012 executed. Queries with a non-empty list will never be executed. 

1013 order_by_columns : `Iterable` [ `OrderByColumn` ], optional 

1014 Ordering column definitions to use in the ORDER BY clause, typically 

1015 produced by `OrderByColumn.materialized` for the materialized table. 

1016 """ 

1017 

1018 def __init__( 

1019 self, 

1020 *, 

1021 table: sqlalchemy.schema.Table, 

1022 spatial: Iterable[DimensionElement], 

1023 datasetType: Optional[DatasetType], 

1024 isUnique: bool, 

1025 graph: DimensionGraph, 

1026 whereRegion: Optional[Region], 

1027 managers: RegistryManagers, 

1028 doomed_by: Iterable[str] = (), 

1029 order_by_columns: Iterable[OrderByColumn] = (), 

1030 ): 

1031 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by) 

1032 self._table = table 

1033 self._spatial = tuple(spatial) 

1034 self._datasetType = datasetType 

1035 self._isUnique = isUnique 

1036 self._order_by_columns = order_by_columns 

1037 

1038 def isUnique(self) -> bool: 

1039 # Docstring inherited from Query. 

1040 return self._isUnique 

1041 

1042 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1043 # Docstring inherited from Query. 

1044 return self._table.columns[name] 

1045 

1046 @property 

1047 def spatial(self) -> Iterator[DimensionElement]: 

1048 # Docstring inherited from Query. 

1049 return iter(self._spatial) 

1050 

1051 def order_by(self, *args: str) -> Query: 

1052 # Docstring inherited from Query. 

1053 raise NotImplementedError("MaterializedQuery.order_by should not be called directly") 

1054 

1055 def limit(self, limit: int, offset: Optional[int] = None) -> Query: 

1056 # Docstring inherited from Query. 

1057 

1058 # Calling limit on materialized data is likely an error, limit should 

1059 # be set before materializing. 

1060 raise NotImplementedError("MaterializedQuery.limit should not be called directly") 

1061 

1062 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1063 # Docstring inherited from Query. 

1064 return self._table.columns[f"{name}_region"] 

1065 

1066 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

1067 # Docstring inherited from Query. 

1068 if self._datasetType is not None: 

1069 return DatasetQueryColumns( 

1070 datasetType=self._datasetType, 

1071 id=self._table.columns["dataset_id"], 

1072 runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()], 

1073 ingestDate=None, 

1074 ) 

1075 else: 

1076 return None 

1077 

1078 @property 

1079 def sql(self) -> sqlalchemy.sql.FromClause: 

1080 # Docstring inherited from Query. 

1081 select = self._table.select() 

1082 if self._order_by_columns: 

1083 order_by_columns = [column.column_order for column in self._order_by_columns] 

1084 select = select.order_by(*order_by_columns) 

1085 return select 

1086 

1087 @contextmanager 

1088 def materialize(self, db: Database) -> Iterator[Query]: 

1089 # Docstring inherited from Query. 

1090 yield self 

1091 

1092 def subset( 

1093 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

1094 ) -> Query: 

1095 # Docstring inherited from Query. 

1096 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

1097 if columns is None: 

1098 return self 

1099 if columns.isEmpty(): 

1100 return EmptyQuery(self.graph.universe, managers=self.managers) 

1101 simpleQuery = SimpleQuery() 

1102 simpleQuery.join(self._table) 

1103 return DirectQuery( 

1104 simpleQuery=simpleQuery, 

1105 columns=columns, 

1106 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

1107 graph=graph, 

1108 whereRegion=self.whereRegion if not unique else None, 

1109 managers=self.managers, 

1110 doomed_by=self._doomed_by, 

1111 ) 

1112 

1113 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

1114 # Docstring inherited from Query. 

1115 from ._builder import QueryBuilder 

1116 

1117 if summary is None: 

1118 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

1119 if not summary.requested.issubset(self.graph): 

1120 raise NotImplementedError( 

1121 f"Query.makeBuilder does not yet support augmenting dimensions " 

1122 f"({summary.requested.dimensions}) beyond those originally included in the query " 

1123 f"({self.graph.dimensions})." 

1124 ) 

1125 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

1126 builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns()) 

1127 return builder 

1128 

1129 

1130class EmptyQuery(Query): 

1131 """A `Query` implementation that handes the special case where the query 

1132 would have no columns. 

1133 

1134 Parameters 

1135 ---------- 

1136 universe : `DimensionUniverse` 

1137 Set of all dimensions from which the null set is extracted. 

1138 managers : `RegistryManagers` 

1139 A struct containing the registry manager instances used by the query 

1140 system. 

1141 doomed_by : `Iterable` [ `str` ], optional 

1142 A list of messages (appropriate for e.g. logging or exceptions) that 

1143 explain why the query is known to return no results even before it is 

1144 executed. Queries with a non-empty list will never be executed. 

1145 """ 

1146 

1147 def __init__( 

1148 self, 

1149 universe: DimensionUniverse, 

1150 managers: RegistryManagers, 

1151 doomed_by: Iterable[str] = (), 

1152 ): 

1153 super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by) 

1154 

1155 def isUnique(self) -> bool: 

1156 # Docstring inherited from Query. 

1157 return True 

1158 

1159 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1160 # Docstring inherited from Query. 

1161 raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).") 

1162 

1163 @property 

1164 def spatial(self) -> Iterator[DimensionElement]: 

1165 # Docstring inherited from Query. 

1166 return iter(()) 

1167 

1168 def order_by(self, *args: str) -> Query: 

1169 # Docstring inherited from Query. 

1170 return self 

1171 

1172 def limit(self, limit: int, offset: Optional[int] = None) -> Query: 

1173 # Docstring inherited from Query. 

1174 return self 

1175 

1176 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1177 # Docstring inherited from Query. 

1178 raise KeyError(f"No region for {name} in query (no regions at all, actually).") 

1179 

1180 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

1181 # Docstring inherited from Query. 

1182 return None 

1183 

1184 def rows( 

1185 self, db: Database, *, region: Optional[Region] = None 

1186 ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]: 

1187 if not self._doomed_by: 

1188 yield None 

1189 

1190 @property 

1191 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

1192 # Docstring inherited from Query. 

1193 return None 

1194 

1195 @contextmanager 

1196 def materialize(self, db: Database) -> Iterator[Query]: 

1197 # Docstring inherited from Query. 

1198 yield self 

1199 

1200 def subset( 

1201 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

1202 ) -> Query: 

1203 # Docstring inherited from Query. 

1204 assert graph is None or graph.issubset(self.graph) 

1205 return self 

1206 

1207 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

1208 # Docstring inherited from Query. 

1209 from ._builder import QueryBuilder 

1210 

1211 if summary is None: 

1212 summary = QuerySummary(self.graph) 

1213 if not summary.requested.issubset(self.graph): 

1214 raise NotImplementedError( 

1215 f"Query.makeBuilder does not yet support augmenting dimensions " 

1216 f"({summary.requested.dimensions}) beyond those originally included in the query " 

1217 f"({self.graph.dimensions})." 

1218 ) 

1219 return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)