Coverage for python/lsst/daf/butler/registry/queries/_query.py: 22%


374 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25from abc import ABC, abstractmethod 

26from contextlib import contextmanager 

27import dataclasses 

28import enum 

29import itertools 

30from typing import ( 

31 ContextManager, 

32 Dict, 

33 Iterable, 

34 Iterator, 

35 Mapping, 

36 Optional, 

37 Tuple, 

38 TYPE_CHECKING, 

39) 

40 

41import sqlalchemy 

42 

43from lsst.sphgeom import Region 

44 

45from ...core import ( 

46 addDimensionForeignKey, 

47 DataCoordinate, 

48 DatasetRef, 

49 DatasetType, 

50 ddl, 

51 Dimension, 

52 DimensionElement, 

53 DimensionGraph, 

54 DimensionRecord, 

55 DimensionUniverse, 

56 SpatialRegionDatabaseRepresentation, 

57 SimpleQuery, 

58) 

59from ..interfaces import Database 

60from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

61 

62 if TYPE_CHECKING:  62 ↛ 63   line 62 didn't jump to line 63, because the condition on line 62 was never true

63 from ._builder import QueryBuilder 

64 

65 

66@dataclasses.dataclass(frozen=True) 

67class OrderByColumn: 

68 """Information about single column in ORDER BY clause. 

69 """ 

70 column: sqlalchemy.sql.ColumnElement 

71 """Name of the column or `None` for primary key (`str` or `None`)""" 

72 

73 ordering: bool 

74 """True for ascending order, False for descending (`bool`).""" 

75 

76 add_to_select: bool 

77 """True if columns is a non-key column and needs to be added to select 

78 columns explicitly (`bool`).""" 

79 

80 field_spec: Optional[ddl.FieldSpec] 

81 """Field specification for a column in materialized table (`ddl.FieldSpec`) 

82 """ 

83 

84 dimension: Optional[Dimension] 

85 """Not-None if column corresponds to a dimension (`Dimension` or `None`)""" 

86 

87 @property 

88 def column_order(self) -> sqlalchemy.sql.ColumnElement: 

89 """Column element for use in ORDER BY clause 

90 (`sqlalchemy.sql.ColumnElement`) 

91 """ 

92 return self.column.asc() if self.ordering else self.column.desc() 

93 

94 def materialized(self, table: sqlalchemy.schema.Table) -> OrderByColumn: 

95 """Re-purpose ordering column definition for a materialized table. 

96 

97 Parameters 

98 ---------- 

99 table : `sqlalchemy.schema.Table` 

100 Materialized table, it should have all columns in SELECT clause 

101 already. 

102 

103 Returns 

104 ------- 

105 column : `OrderByColumn` 

106 Column definition to use with ORDER BY in materialized table. 

107 """ 

108 return OrderByColumn( 

109 column=table.columns[self.dimension.name if self.dimension else self.column.name], 

110 ordering=self.ordering, 

111 add_to_select=False, 

112 field_spec=None, 

113 dimension=self.dimension 

114 ) 

115 
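# Illustrative sketch, not part of the module: how an `OrderByColumn` could be
# built against a plain SQLAlchemy table and used to produce an ORDER BY term.
# The table and column names here are assumptions for illustration only.
#
#     import sqlalchemy
#
#     metadata = sqlalchemy.MetaData()
#     visit_table = sqlalchemy.Table(
#         "visit", metadata,
#         sqlalchemy.Column("visit", sqlalchemy.Integer),
#     )
#     order_col = OrderByColumn(
#         column=visit_table.columns["visit"],
#         ordering=False,        # descending
#         add_to_select=False,
#         field_spec=None,
#         dimension=None,
#     )
#     stmt = visit_table.select().order_by(order_col.column_order)
#
# After materialization, `order_col.materialized(temp_table)` re-points the
# definition at the temporary table's column of the same name.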

116 

117class Query(ABC): 

118 """An abstract base class for queries that return some combination of 

119 `DatasetRef` and `DataCoordinate` objects. 

120 

121 Parameters 

122 ---------- 

123 graph : `DimensionGraph` 

124 Object describing the dimensions included in the query. 

125 whereRegion : `lsst.sphgeom.Region`, optional 

126 Region that all region columns in all returned rows must overlap. 

127 managers : `RegistryManagers` 

128 A struct containing the registry manager instances used by the query 

129 system. 

130 doomed_by : `Iterable` [ `str` ], optional 

131 A list of messages (appropriate for e.g. logging or exceptions) that 

132 explain why the query is known to return no results even before it is 

133 executed. Queries with a non-empty list will never be executed. 

134 

135 Notes 

136 ----- 

137 The `Query` hierarchy abstracts over the database/SQL representation of a 

138 particular set of data IDs or datasets. It is expected to be used as a 

139 backend for other objects that provide more natural interfaces for one or 

140 both of these, not as part of a public interface to query results. 

141 """ 

142 def __init__(self, *, 

143 graph: DimensionGraph, 

144 whereRegion: Optional[Region], 

145 managers: RegistryManagers, 

146 doomed_by: Iterable[str] = (), 

147 ): 

148 self.graph = graph 

149 self.whereRegion = whereRegion 

150 self.managers = managers 

151 self._doomed_by = tuple(doomed_by) 

152 self._filtered_by_join: Optional[int] = None 

153 self._filtered_by_where: Optional[int] = None 

154 

155 @abstractmethod 

156 def isUnique(self) -> bool: 

157 """Return `True` if this query's rows are guaranteed to be unique, and 

158 `False` otherwise. 

159 

160 If this query has dataset results (`datasetType` is not `None`), 

161 uniqueness applies to the `DatasetRef` instances returned by 

162 `extractDatasetRef` from the result of `rows`. If it does not have 

163 dataset results, uniqueness applies to the `DataCoordinate` instances 

164 returned by `extractDataId`. 

165 """ 

166 raise NotImplementedError() 

167 

168 @abstractmethod 

169 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

170 """Return the query column that contains the primary key value for 

171 the dimension with the given name. 

172 

173 Parameters 

174 ---------- 

175 name : `str` 

176 Name of the dimension. 

177 

178 Returns 

179 ------- 

180 column : `sqlalchemy.sql.ColumnElement`

181 SQLAlchemy object representing a column in the query. 

182 

183 Notes 

184 ----- 

185 This method is intended primarily as a hook for subclasses to implement 

186 and the ABC to call in order to provide higher-level functionality; 

187 code that uses `Query` objects (but does not implement one) should 

188 usually not have to call this method. 

189 """ 

190 raise NotImplementedError() 

191 

192 @property 

193 @abstractmethod 

194 def spatial(self) -> Iterator[DimensionElement]: 

195 """An iterator over the dimension element columns used in post-query 

196 filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]). 

197 

198 Notes 

199 ----- 

200 This property is intended primarily as a hook for subclasses to 

201 implement and the ABC to call in order to provide higher-level 

202 functionality; code that uses `Query` objects (but does not implement 

203 one) should usually not have to access this property. 

204 """ 

205 raise NotImplementedError() 

206 

207 @abstractmethod 

208 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

209 """Return a region column for one of the dimension elements iterated 

210 over by `spatial`. 

211 

212 Parameters 

213 ---------- 

214 name : `str` 

215 Name of the element. 

216 

217 Returns 

218 ------- 

219 column : `sqlalchemy.sql.ColumnElement` 

220 SQLAlchemy object representing a result column in the query.

221 

222 Notes 

223 ----- 

224 This method is intended primarily as a hook for subclasses to implement 

225 and the ABC to call in order to provide higher-level functionality; 

226 code that uses `Query` objects (but does not implement one) should 

227 usually not have to call this method. 

228 """ 

229 raise NotImplementedError() 

230 

231 @property 

232 def datasetType(self) -> Optional[DatasetType]: 

233 """The `DatasetType` of datasets returned by this query, or `None` 

234 if there are no dataset results (`DatasetType` or `None`). 

235 """ 

236 cols = self.getDatasetColumns() 

237 if cols is None: 

238 return None 

239 return cols.datasetType 

240 

241 def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int: 

242 """Count the number of rows this query would return. 

243 

244 Parameters 

245 ---------- 

246 db : `Database` 

247 Object managing the database connection. 

248 region : `sphgeom.Region`, optional 

249 A region that any result-row regions must overlap in order to be 

250 yielded. If not provided, this will be ``self.whereRegion``, if 

251 that exists. 

252 exact : `bool`, optional 

253 If `True`, run the full query and perform post-query filtering if 

254 needed to account for that filtering in the count. If `False`, the 

255 result may be an upper bound. 

256 

257 Returns 

258 ------- 

259 count : `int` 

260 The number of rows the query would return, or an upper bound if 

261 ``exact=False``. 

262 

263 Notes 

264 ----- 

265 This counts the number of rows returned, not the number of unique rows 

266 returned, so even with ``exact=True`` it may provide only an upper 

267 bound on the number of *deduplicated* result rows. 

268 """ 

269 if self._doomed_by: 

270 return 0 

271 sql = self.sql 

272 if sql is None: 

273 return 1 

274 if exact and self.spatial: 

275 filtered_count = 0 

276 for _ in self.rows(db, region=region): 

277 filtered_count += 1 

278 return filtered_count 

279 else: 

280 return db.query( 

281 sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None) 

282 ).scalar() 

283 
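# Illustrative sketch, not part of the module: the trade-off between exact and
# approximate counts documented above, assuming `query` is a concrete `Query`
# and `db` is a `Database` instance.
#
#     upper_bound = query.count(db, exact=False)  # single COUNT(*) query
#     n_rows = query.count(db)                    # may run the full query and
#                                                 # filter regions row by row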

284 def any( 

285 self, 

286 db: Database, *, 

287 region: Optional[Region] = None, 

288 execute: bool = True, 

289 exact: bool = True, 

290 ) -> bool: 

291 """Test whether this query returns any results. 

292 

293 Parameters 

294 ---------- 

295 db : `Database` 

296 Object managing the database connection. 

297 region : `sphgeom.Region`, optional 

298 A region that any result-row regions must overlap in order to be 

299 yielded. If not provided, this will be ``self.whereRegion``, if 

300 that exists. 

301 execute : `bool`, optional 

302 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

303 determined prior to execution that the query would return no rows. 

304 exact : `bool`, optional 

305 If `True`, run the full query and perform post-query filtering if 

306 needed, until at least one result row is found. If `False`, the 

307 returned result does not account for post-query filtering, and 

308 hence may be `True` even when all result rows would be filtered 

309 out. 

310 

311 Returns 

312 ------- 

313 any : `bool` 

314 `True` if the query would (or might, depending on arguments) yield 

315 result rows. `False` if it definitely would not. 

316 """ 

317 if self._doomed_by: 

318 return False 

319 sql = self.sql 

320 if sql is None: 

321 return True 

322 if exact and not execute: 

323 raise TypeError("Cannot obtain exact results without executing the query.") 

324 if exact and self.spatial: 

325 for _ in self.rows(db, region=region): 

326 return True 

327 return False 

328 elif execute: 

329 return db.query(sql.limit(1)).one_or_none() is not None 

330 else: 

331 return True 

332 

333 def explain_no_results( 

334 self, 

335 db: Database, *, 

336 region: Optional[Region] = None, 

337 followup: bool = True, 

338 ) -> Iterator[str]: 

339 """Return human-readable messages that may help explain why the query 

340 yields no results. 

341 

342 Parameters 

343 ---------- 

344 db : `Database` 

345 Object managing the database connection. 

346 region : `sphgeom.Region`, optional 

347 A region that any result-row regions must overlap in order to be 

348 yielded. If not provided, this will be ``self.whereRegion``, if 

349 that exists. 

350 followup : `bool`, optional 

351 If `True` (default) perform inexpensive follow-up queries if no 

352 diagnostics are available from query generation alone. 

353 

354 Returns 

355 ------- 

356 messages : `Iterator` [ `str` ] 

357 String messages that describe reasons the query might not yield any 

358 results. 

359 

360 Notes 

361 ----- 

362 Messages related to post-query filtering are only available if `rows`, 

363 `any`, or `count` was already called with the same region (with 

364 ``exact=True`` for the latter two). 

365 """ 

366 from ._builder import QueryBuilder 

367 if self._doomed_by: 

368 yield from self._doomed_by 

369 return 

370 if self._filtered_by_where: 

371 yield ( 

372 f"{self._filtered_by_where} result rows were filtered out because " 

373 "one or more region did not overlap the WHERE-clause region." 

374 ) 

375 if self._filtered_by_join: 

376 yield ( 

377 f"{self._filtered_by_join} result rows were filtered out because " 

378 "one or more regions did not overlap." 

379 ) 

380 if (not followup) or self._filtered_by_join or self._filtered_by_where: 

381 return 

382 # Query didn't return results even before client-side filtering, and 

383 # caller says we can do follow-up queries to determine why. 

384 # Start by seeing if there are _any_ dimension records for each element 

385 # involved. 

386 for element in self.graph.elements: 

387 summary = QuerySummary(element.graph) 

388 builder = QueryBuilder(summary, self.managers) 

389 followup_query = builder.finish() 

390 if not followup_query.any(db, exact=False): 

391 yield f"No dimension records for element '{element.name}' found." 

392 yield from followup_query.explain_no_results(db, region=region, followup=False) 

393 return 

394 

395 @abstractmethod 

396 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

397 """Return the columns for the datasets returned by this query. 

398 

399 Returns 

400 ------- 

401 columns : `DatasetQueryColumns` or `None` 

402 Struct containing SQLAlchemy representations of the result columns 

403 for a dataset. 

404 

405 Notes 

406 ----- 

407 This method is intended primarily as a hook for subclasses to implement 

408 and the ABC to call in order to provide higher-level functionality; 

409 code that uses `Query` objects (but does not implement one) should 

410 usually not have to call this method. 

411 """ 

412 raise NotImplementedError() 

413 

414 @property 

415 @abstractmethod 

416 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

417 """A SQLAlchemy object representing the full query 

418 (`sqlalchemy.sql.FromClause` or `None`). 

419 

420 This is `None` in the special case where the query has no columns, and 

421 only one logical row. 

422 """ 

423 raise NotImplementedError() 

424 

425 def rows(self, db: Database, *, region: Optional[Region] = None 

426 ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]: 

427 """Execute the query and yield result rows, applying `predicate`. 

428 

429 Parameters 

430 ---------- 

431 db : `Database` 

432 Object managing the database connection. 

433 region : `sphgeom.Region`, optional 

434 A region that any result-row regions must overlap in order to be 

435 yielded. If not provided, this will be ``self.whereRegion``, if 

436 that exists. 

437 

438 Yields 

439 ------ 

440 row : `sqlalchemy.engine.RowProxy` or `None` 

441 Result row from the query. `None` may be yielded exactly once instead

442 of any real rows to indicate an empty query (see `EmptyQuery`). 

443 """ 

444 if self._doomed_by: 

445 return 

446 whereRegion = region if region is not None else self.whereRegion 

447 self._filtered_by_where = 0 

448 self._filtered_by_join = 0 

449 for row in db.query(self.sql): 

450 rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial] 

451 if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions): 

452 self._filtered_by_where += 1 

453 continue 

454 if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):

455 self._filtered_by_join += 1 

456 continue 

457 yield row 

458 

459 def extractDimensionsTuple(self, row: Optional[sqlalchemy.engine.RowProxy], 

460 dimensions: Iterable[Dimension]) -> tuple: 

461 """Extract a tuple of data ID values from a result row. 

462 

463 Parameters 

464 ---------- 

465 row : `sqlalchemy.engine.RowProxy` or `None` 

466 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

467 the row from an `EmptyQuery`. 

468 dimensions : `Iterable` [ `Dimension` ] 

469 The dimensions to include in the returned tuple, in order. 

470 

471 Returns 

472 ------- 

473 values : `tuple` 

474 A tuple of dimension primary key values. 

475 """ 

476 if row is None: 

477 assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions." 

478 return () 

479 return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions) 

480 

481 def extractDataId(self, row: Optional[sqlalchemy.engine.RowProxy], *, 

482 graph: Optional[DimensionGraph] = None, 

483 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

484 ) -> DataCoordinate: 

485 """Extract a data ID from a result row. 

486 

487 Parameters 

488 ---------- 

489 row : `sqlalchemy.engine.RowProxy` or `None` 

490 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

491 the row from an `EmptyQuery`. 

492 graph : `DimensionGraph`, optional 

493 The dimensions the returned data ID should identify. If not 

494 provided, this will be all dimensions in `QuerySummary.requested`. 

495 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

496 Nested mapping containing records to attach to the returned 

497 `DataCoordinate`, for which `~DataCoordinate.hasRecords` will 

498 return `True`. If provided, outer keys must include all dimension 

499 element names in ``graph``, and inner keys should be tuples of 

500 dimension primary key values in the same order as 

501 ``element.graph.required``. If not provided, 

502 `DataCoordinate.hasRecords` will return `False` on the returned 

503 object. 

504 

505 Returns 

506 ------- 

507 dataId : `DataCoordinate` 

508 A data ID that identifies all required and implied dimensions. If 

509 ``records is not None``, this will have

510 `~DataCoordinate.hasRecords()` return `True`. 

511 """ 

512 if graph is None: 

513 graph = self.graph 

514 if not graph: 

515 return DataCoordinate.makeEmpty(self.graph.universe) 

516 dataId = DataCoordinate.fromFullValues( 

517 graph, 

518 self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied)) 

519 ) 

520 if records is not None: 

521 recordsForRow = {} 

522 for element in graph.elements: 

523 key = tuple(dataId.subset(element.graph).values()) 

524 recordsForRow[element.name] = records[element.name].get(key) 

525 return dataId.expanded(recordsForRow) 

526 else: 

527 return dataId 

528 
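# Illustrative sketch, not part of the module: the shape of the ``records``
# mapping accepted by `extractDataId`. The element names, key values, and
# `DimensionRecord` instances are hypothetical.
#
#     records = {
#         "instrument": {("HSC",): instrument_record},
#         "visit": {("HSC", 903334): visit_record},
#     }
#     data_id = query.extractDataId(row, records=records)
#     assert data_id.hasRecords()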

529 def extractDatasetRef(self, row: sqlalchemy.engine.RowProxy, 

530 dataId: Optional[DataCoordinate] = None, 

531 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

532 ) -> DatasetRef: 

533 """Extract a `DatasetRef` from a result row. 

534 

535 Parameters 

536 ---------- 

537 row : `sqlalchemy.engine.RowProxy` 

538 A result row from a SQLAlchemy SELECT query. 

539 dataId : `DataCoordinate` 

540 Data ID to attach to the `DatasetRef`. A minimal (i.e. base class) 

541 `DataCoordinate` is constructed from ``row`` if `None`. 

542 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

543 Records to use to return an `ExpandedDataCoordinate`. If provided, 

544 outer keys must include all dimension element names in ``graph``, 

545 and inner keys should be tuples of dimension primary key values 

546 in the same order as ``element.graph.required``. 

547 

548 Returns 

549 ------- 

550 ref : `DatasetRef` 

551 Reference to the dataset; guaranteed to have `DatasetRef.id` not 

552 `None`. 

553 """ 

554 datasetColumns = self.getDatasetColumns() 

555 assert datasetColumns is not None 

556 if dataId is None: 

557 dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records) 

558 runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]] 

559 return DatasetRef(datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], 

560 run=runRecord.name) 

561 

562 def _makeSubsetQueryColumns(self, *, graph: Optional[DimensionGraph] = None, 

563 datasets: bool = True, 

564 unique: bool = False) -> Tuple[DimensionGraph, Optional[QueryColumns]]: 

565 """Helper method for subclass implementations of `subset`. 

566 

567 Parameters 

568 ---------- 

569 graph : `DimensionGraph`, optional 

570 Dimensions to include in the new `Query` being constructed. 

571 ``subset`` implementations should generally just forward their 

572 own ``graph`` argument here. 

573 datasets : `bool`, optional 

574 Whether the new `Query` should include dataset results. Defaults 

575 to `True`, but is ignored if ``self`` does not include dataset 

576 results. 

577 unique : `bool`, optional 

578 Whether the new `Query` should guarantee unique results (this may 

579 come with a performance penalty). 

580 

581 Returns 

582 ------- 

583 graph : `DimensionGraph` 

584 The dimensions of the new `Query`. This is exactly the same as 

585 the argument of the same name, with ``self.graph`` used if that 

586 argument is `None`. 

587 columns : `QueryColumns` or `None` 

588 A struct containing the SQLAlchemy column objects to use in the 

589 new query, constructed by delegating to other (mostly abstract) 

590 methods on ``self``. If `None`, `subset` may return ``self``. 

591 """ 

592 if graph is None: 

593 graph = self.graph 

594 if (graph == self.graph and (self.getDatasetColumns() is None or datasets) 

595 and (self.isUnique() or not unique)): 

596 return graph, None 

597 columns = QueryColumns() 

598 for dimension in graph.dimensions: 

599 col = self.getDimensionColumn(dimension.name) 

600 columns.keys[dimension] = [col] 

601 if not unique: 

602 for element in self.spatial: 

603 col = self.getRegionColumn(element.name) 

604 columns.regions[element] = col 

605 if datasets and self.getDatasetColumns() is not None: 

606 columns.datasets = self.getDatasetColumns() 

607 return graph, columns 

608 

609 @abstractmethod 

610 def materialize(self, db: Database) -> ContextManager[Query]: 

611 """Execute this query and insert its results into a temporary table. 

612 

613 Parameters 

614 ---------- 

615 db : `Database` 

616 Database engine to execute the query against. 

617 

618 Returns 

619 ------- 

620 context : `typing.ContextManager` [ `MaterializedQuery` ] 

621 A context manager that ensures the temporary table is created and 

622 populated in ``__enter__`` (returning a `MaterializedQuery` object 

623 backed by that table), and dropped in ``__exit__``. If ``self`` 

624 is already a `MaterializedQuery`, ``__enter__`` may just return 

625 ``self`` and ``__exit__`` may do nothing (reflecting the fact that 

626 an outer context manager should already take care of everything 

627 else). 

628 """ 

629 raise NotImplementedError() 

630 
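# Illustrative sketch, not part of the module: typical use of `materialize`,
# assuming `query` is a concrete `Query` and `db` is a `Database`.
#
#     with query.materialize(db) as materialized:
#         # `materialized` is backed by a temporary table for the lifetime of
#         # this block; iterating it again does not re-run the original joins.
#         for row in materialized.rows(db):
#             ...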

631 @abstractmethod 

632 def subset(self, *, graph: Optional[DimensionGraph] = None, 

633 datasets: bool = True, 

634 unique: bool = False) -> Query: 

635 """Return a new `Query` whose columns and/or rows are (mostly) subset 

636 of this one's. 

637 

638 Parameters 

639 ---------- 

640 graph : `DimensionGraph`, optional 

641 Dimensions to include in the new `Query` being constructed. 

642 If `None` (default), ``self.graph`` is used. 

643 datasets : `bool`, optional 

644 Whether the new `Query` should include dataset results. Defaults 

645 to `True`, but is ignored if ``self`` does not include dataset 

646 results. 

647 unique : `bool`, optional 

648 Whether the new `Query` should guarantee unique results (this may 

649 come with a performance penalty). 

650 

651 Returns 

652 ------- 

653 query : `Query` 

654 A query object corresponding to the given inputs. May be ``self`` 

655 if no changes were requested. 

656 

657 Notes 

658 ----- 

659 The way spatial overlaps are handled at present makes it impossible to 

660 fully guarantee in general that the new query's rows are a subset of 

661 this one's while also returning unique rows. That's because the 

662 database is only capable of performing approximate, conservative 

663 overlaps via the common skypix system; we defer actual region overlap 

664 operations to per-result-row Python logic. But including the region 

665 columns necessary to do that postprocessing in the query makes it 

666 impossible to do a SELECT DISTINCT on the user-visible dimensions of 

667 the query. For example, consider starting with a query with dimensions 

668 (instrument, skymap, visit, tract). That involves a spatial join 

669 between visit and tract, and we include the region columns from both 

670 tables in the results in order to only actually yield result rows 

671 (see `predicate` and `rows`) where the regions in those two columns 

672 overlap. If the user then wants to subset to just (skymap, tract) with 

673 unique results, we have two unpalatable options: 

674 

675 - we can do a SELECT DISTINCT with just the skymap and tract columns 

676 in the SELECT clause, dropping all detailed overlap information and 

677 including some tracts that did not actually overlap any of the 

678 visits in the original query (but were regarded as _possibly_ 

679 overlapping via the coarser, common-skypix relationships); 

680 

681 - we can include the tract and visit region columns in the query, and 

682 continue to filter out the non-overlapping pairs, but completely 

683 disregard the user's request for unique tracts. 

684 

685 This interface specifies that implementations must do the former, as 

686 that's what makes things efficient in our most important use case 

687 (``QuantumGraph`` generation in ``pipe_base``). We may be able to 

688 improve this situation in the future by putting exact overlap 

689 information in the database, either by using built-in (but 

690 engine-specific) spatial database functionality or (more likely) 

691 switching to a scheme in which pairwise dimension spatial relationships 

692 are explicitly precomputed (for e.g. combinations of instruments and 

693 skymaps). 

694 """ 

695 raise NotImplementedError() 

696 
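# Illustrative sketch, not part of the module: dropping dataset columns and
# requesting unique rows via `subset`, assuming `query` is a concrete `Query`.
#
#     unique_data_ids = query.subset(datasets=False, unique=True)
#     # Per the notes above, region columns (and hence exact spatial
#     # filtering) are dropped so that SELECT DISTINCT can be applied.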

697 @abstractmethod 

698 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

699 """Return a `QueryBuilder` that can be used to construct a new `Query` 

700 that is joined to (and hence constrained by) this one. 

701 

702 Parameters 

703 ---------- 

704 summary : `QuerySummary`, optional 

705 A `QuerySummary` instance that specifies the dimensions and any 

706 additional constraints to include in the new query being 

707 constructed, or `None` to use the dimensions of ``self`` with no 

708 additional constraints. 

709 """ 

710 raise NotImplementedError() 

711 
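# Illustrative sketch, not part of the module: chaining a new query off an
# existing one with `makeBuilder`, assuming `query` is a concrete `Query`.
#
#     builder = query.makeBuilder()
#     chained = builder.finish()  # new Query joined to (constrained by) this one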

712 graph: DimensionGraph 

713 """The dimensions identified by this query and included in any data IDs 

714 created from its result rows (`DimensionGraph`). 

715 """ 

716 

717 whereRegion: Optional[Region] 

718 """A spatial region that all regions in all rows returned by this query 

719 must overlap (`lsst.sphgeom.Region` or `None`). 

720 """ 

721 

722 managers: RegistryManagers 

723 """A struct containing `Registry` helper object (`RegistryManagers`). 

724 """ 

725 
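# Illustrative sketch, not part of the module: a typical consumer loop over a
# `Query`, assuming `query` is a concrete instance and `db` is a `Database`.
#
#     if not query.any(db, exact=False):
#         for message in query.explain_no_results(db):
#             print(message)
#     else:
#         for row in query.rows(db):
#             data_id = query.extractDataId(row)
#             if query.getDatasetColumns() is not None:
#                 ref = query.extractDatasetRef(row, dataId=data_id)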

726 

727class DirectQueryUniqueness(enum.Enum): 

728 """An enum representing the ways in which a query can have unique rows (or 

729 not). 

730 """ 

731 

732 NOT_UNIQUE = enum.auto() 

733 """The query is not expected to have unique rows. 

734 """ 

735 

736 NATURALLY_UNIQUE = enum.auto() 

737 """The construction of the query guarantees that it will have unique 

738 result rows, even without SELECT DISTINCT or a GROUP BY clause. 

739 """ 

740 

741 NEEDS_DISTINCT = enum.auto() 

742 """The query is expected to yield unique result rows, and needs to use 

743 SELECT DISTINCT or an equivalent GROUP BY clause to achieve this. 

744 """ 

745 

746 

747class DirectQuery(Query): 

748 """A `Query` implementation that represents a direct SELECT query that 

749 usually joins many tables. 

750 

751 `DirectQuery` objects should generally only be constructed by 

752 `QueryBuilder` or the methods of other `Query` objects. 

753 

754 Parameters 

755 ---------- 

756 simpleQuery : `SimpleQuery` 

757 Struct representing the actual SELECT, FROM, and WHERE clauses. 

758 columns : `QueryColumns` 

759 Columns that are referenced in the query in any clause. 

760 uniqueness : `DirectQueryUniqueness` 

761 Enum value indicating whether the query should yield unique result 

762 rows, and if so whether that needs to be explicitly requested of the 

763 database. 

764 graph : `DimensionGraph` 

765 Object describing the dimensions included in the query. 

766 whereRegion : `lsst.sphgeom.Region`, optional 

767 Region that all region columns in all returned rows must overlap. 

768 managers : `RegistryManagers` 

769 Struct containing the `Registry` manager helper objects, to be 

770 forwarded to the `Query` constructor. 

771 doomed_by : `Iterable` [ `str` ], optional 

772 A list of messages (appropriate for e.g. logging or exceptions) that 

773 explain why the query is known to return no results even before it is 

774 executed. Queries with a non-empty list will never be executed. 

775 """ 

776 def __init__(self, *, 

777 simpleQuery: SimpleQuery, 

778 columns: QueryColumns, 

779 uniqueness: DirectQueryUniqueness, 

780 graph: DimensionGraph, 

781 whereRegion: Optional[Region], 

782 managers: RegistryManagers, 

783 order_by_columns: Iterable[OrderByColumn] = (), 

784 limit: Optional[Tuple[int, Optional[int]]] = None, 

785 doomed_by: Iterable[str] = ()): 

786 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by) 

787 assert not simpleQuery.columns, "Columns should always be set on a copy in .sql" 

788 assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns." 

789 self._simpleQuery = simpleQuery 

790 self._columns = columns 

791 self._uniqueness = uniqueness 

792 self._order_by_columns = order_by_columns 

793 self._limit = limit 

794 self._datasetQueryColumns: Optional[DatasetQueryColumns] = None 

795 self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

796 self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

797 

798 def isUnique(self) -> bool: 

799 # Docstring inherited from Query. 

800 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE 

801 

802 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

803 # Docstring inherited from Query. 

804 column = self._dimensionColumns.get(name) 

805 if column is None: 

806 column = self._columns.getKeyColumn(name).label(name) 

807 self._dimensionColumns[name] = column 

808 return column 

809 

810 @property 

811 def spatial(self) -> Iterator[DimensionElement]: 

812 # Docstring inherited from Query. 

813 return iter(self._columns.regions) 

814 

815 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

816 # Docstring inherited from Query. 

817 column = self._regionColumns.get(name) 

818 if column is None: 

819 column = self._columns.regions[name].column.label(f"{name}_region") 

820 self._regionColumns[name] = column 

821 return column 

822 

823 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

824 # Docstring inherited from Query. 

825 if self._datasetQueryColumns is None: 

826 base = self._columns.datasets 

827 if base is None: 

828 return None 

829 ingestDate = base.ingestDate 

830 if ingestDate is not None: 

831 ingestDate = ingestDate.label("ingest_date") 

832 self._datasetQueryColumns = DatasetQueryColumns( 

833 datasetType=base.datasetType, 

834 id=base.id.label("dataset_id"), 

835 runKey=base.runKey.label(self.managers.collections.getRunForeignKeyName()), 

836 ingestDate=ingestDate, 

837 ) 

838 return self._datasetQueryColumns 

839 

840 @property 

841 def sql(self) -> sqlalchemy.sql.FromClause: 

842 # Docstring inherited from Query. 

843 simpleQuery = self._simpleQuery.copy() 

844 for dimension in self.graph: 

845 simpleQuery.columns.append(self.getDimensionColumn(dimension.name)) 

846 for element in self.spatial: 

847 simpleQuery.columns.append(self.getRegionColumn(element.name)) 

848 datasetColumns = self.getDatasetColumns() 

849 if datasetColumns is not None: 

850 simpleQuery.columns.extend(datasetColumns) 

851 

852 if self._order_by_columns: 

853 # add ORDER BY columns 

854 select_columns = [column.column for column in self._order_by_columns if column.add_to_select] 

855 simpleQuery.columns.extend(select_columns) 

856 sql = simpleQuery.combine() 

857 order_by_columns = [column.column_order for column in self._order_by_columns] 

858 sql = sql.order_by(*order_by_columns) 

859 else: 

860 sql = simpleQuery.combine() 

861 

862 if self._limit: 

863 sql = sql.limit(self._limit[0]) 

864 if self._limit[1] is not None: 

865 sql = sql.offset(self._limit[1]) 

866 

867 if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT: 

868 return sql.distinct() 

869 else: 

870 return sql 

871 

872 def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec: 

873 """Helper method for subclass implementations of `materialize`. 

874 

875 Parameters 

876 ---------- 

877 constraints : `bool`, optional 

878 If `True` (`False` is default), define a specification that 

879 includes actual foreign key constraints for logical foreign keys. 

880 Some database engines do not permit temporary tables to reference 

881 normal tables, so this should be `False` when generating a spec 

882 for a temporary table unless the database engine is known to 

883 support them. 

884 

885 Returns 

886 ------- 

887 spec : `ddl.TableSpec` 

888 Specification for a table that could hold this query's result rows. 

889 """ 

890 unique = self.isUnique() 

891 spec = ddl.TableSpec(fields=()) 

892 for dimension in self.graph: 

893 addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints) 

894 for element in self.spatial: 

895 spec.fields.update( 

896 SpatialRegionDatabaseRepresentation.makeFieldSpecs( 

897 nullable=True, 

898 name=f"{element.name}_region", 

899 ) 

900 ) 

901 datasetColumns = self.getDatasetColumns() 

902 if datasetColumns is not None: 

903 self.managers.datasets.addDatasetForeignKey(spec, primaryKey=unique, constraint=constraints) 

904 self.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints) 

905 

906 # May need a few extra columns for ORDER BY.

907 spec.fields.update(column.field_spec for column in self._order_by_columns 

908 if column.field_spec is not None) 

909 

910 return spec 

911 

912 @contextmanager 

913 def materialize(self, db: Database) -> Iterator[Query]: 

914 # Docstring inherited from Query. 

915 spec = self._makeTableSpec() 

916 with db.session() as session: 

917 table = session.makeTemporaryTable(spec) 

918 if not self._doomed_by: 

919 db.insert(table, select=self.sql, names=spec.fields.names) 

920 order_by_columns = [column.materialized(table) for column in self._order_by_columns] 

921 yield MaterializedQuery(table=table, 

922 spatial=self.spatial, 

923 datasetType=self.datasetType, 

924 isUnique=self.isUnique(), 

925 graph=self.graph, 

926 whereRegion=self.whereRegion, 

927 managers=self.managers, 

928 doomed_by=self._doomed_by, 

929 order_by_columns=order_by_columns) 

930 session.dropTemporaryTable(table) 

931 

932 def subset(self, *, graph: Optional[DimensionGraph] = None, 

933 datasets: bool = True, 

934 unique: bool = False) -> Query: 

935 # Docstring inherited from Query. 

936 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

937 if columns is None: 

938 return self 

939 if columns.isEmpty(): 

940 return EmptyQuery(self.graph.universe, self.managers) 

941 return DirectQuery( 

942 simpleQuery=self._simpleQuery.copy(), 

943 columns=columns, 

944 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

945 graph=graph, 

946 whereRegion=self.whereRegion if not unique else None, 

947 managers=self.managers, 

948 doomed_by=self._doomed_by, 

949 ) 

950 

951 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

952 # Docstring inherited from Query. 

953 from ._builder import QueryBuilder 

954 if summary is None: 

955 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

956 if not summary.requested.issubset(self.graph): 

957 raise NotImplementedError( 

958 f"Query.makeBuilder does not yet support augmenting dimensions " 

959 f"({summary.requested.dimensions}) beyond those originally included in the query " 

960 f"({self.graph.dimensions})." 

961 ) 

962 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

963 builder.joinTable(self.sql.alias(), dimensions=self.graph.dimensions, 

964 datasets=self.getDatasetColumns()) 

965 return builder 

966 

967 

968class MaterializedQuery(Query): 

969 """A `Query` implementation that represents query results saved in a 

970 temporary table. 

971 

972 `MaterializedQuery` instances should not be constructed directly; use 

973 `Query.materialize()` instead. 

974 

975 Parameters 

976 ---------- 

977 table : `sqlalchemy.schema.Table` 

978 SQLAlchemy object representing the temporary table. 

979 spatial : `Iterable` [ `DimensionElement` ] 

980 Spatial dimension elements whose regions must overlap for each valid 

981 result row (which may reject some rows that are in the table). 

982 datasetType : `DatasetType` or `None`

983 The `DatasetType` of datasets returned by this query, or `None`

984 if there are no dataset results.

985 isUnique : `bool` 

986 If `True`, the table's rows are unique, and there is no need to 

987 add ``SELECT DISTINCT`` to guarantee this in results. 

988 graph : `DimensionGraph` 

989 Dimensions included in the columns of this table. 

990 whereRegion : `Region` or `None` 

991 A spatial region all result-row regions must overlap to be valid (which 

992 may reject some rows that are in the table). 

993 managers : `RegistryManagers` 

994 A struct containing `Registry` manager helper objects, forwarded to 

995 the `Query` constructor. 

996 doomed_by : `Iterable` [ `str` ], optional 

997 A list of messages (appropriate for e.g. logging or exceptions) that 

998 explain why the query is known to return no results even before it is 

999 executed. Queries with a non-empty list will never be executed. 

1000 order_by_columns : `Iterable` [ `OrderByColumn` ], optional

1001 Ordering column definitions to apply in the ORDER BY clause when

1002 selecting from the materialized table.

1003 """ 

1004 def __init__(self, *, 

1005 table: sqlalchemy.schema.Table, 

1006 spatial: Iterable[DimensionElement], 

1007 datasetType: Optional[DatasetType], 

1008 isUnique: bool, 

1009 graph: DimensionGraph, 

1010 whereRegion: Optional[Region], 

1011 managers: RegistryManagers, 

1012 doomed_by: Iterable[str] = (), 

1013 order_by_columns: Iterable[OrderByColumn] = ()): 

1014 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, 

1015 doomed_by=doomed_by) 

1016 self._table = table 

1017 self._spatial = tuple(spatial) 

1018 self._datasetType = datasetType 

1019 self._isUnique = isUnique 

1020 self._order_by_columns = order_by_columns 

1021 

1022 def isUnique(self) -> bool: 

1023 # Docstring inherited from Query. 

1024 return self._isUnique 

1025 

1026 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1027 # Docstring inherited from Query. 

1028 return self._table.columns[name] 

1029 

1030 @property 

1031 def spatial(self) -> Iterator[DimensionElement]: 

1032 # Docstring inherited from Query. 

1033 return iter(self._spatial) 

1034 

1035 def order_by(self, *args: str) -> Query: 

1036 # Docstring inherited from Query. 

1037 raise NotImplementedError("MaterializedQuery.order_by should not be called directly") 

1038 

1039 def limit(self, limit: int, offset: Optional[int] = None) -> Query: 

1040 # Docstring inherited from Query. 

1041 

1042 # Calling limit on materialized data is likely an error, limit should 

1043 # be set before materializing. 

1044 raise NotImplementedError("MaterializedQuery.limit should not be called directly") 

1045 

1046 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1047 # Docstring inherited from Query. 

1048 return self._table.columns[f"{name}_region"] 

1049 

1050 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

1051 # Docstring inherited from Query. 

1052 if self._datasetType is not None: 

1053 return DatasetQueryColumns( 

1054 datasetType=self._datasetType, 

1055 id=self._table.columns["dataset_id"], 

1056 runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()], 

1057 ingestDate=None, 

1058 ) 

1059 else: 

1060 return None 

1061 

1062 @property 

1063 def sql(self) -> sqlalchemy.sql.FromClause: 

1064 # Docstring inherited from Query. 

1065 select = self._table.select() 

1066 if self._order_by_columns: 

1067 order_by_columns = [column.column_order for column in self._order_by_columns] 

1068 select = select.order_by(*order_by_columns) 

1069 return select 

1070 

1071 @contextmanager 

1072 def materialize(self, db: Database) -> Iterator[Query]: 

1073 # Docstring inherited from Query. 

1074 yield self 

1075 

1076 def subset(self, *, graph: Optional[DimensionGraph] = None, 

1077 datasets: bool = True, 

1078 unique: bool = False) -> Query: 

1079 # Docstring inherited from Query. 

1080 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

1081 if columns is None: 

1082 return self 

1083 if columns.isEmpty(): 

1084 return EmptyQuery(self.graph.universe, managers=self.managers) 

1085 simpleQuery = SimpleQuery() 

1086 simpleQuery.join(self._table) 

1087 return DirectQuery( 

1088 simpleQuery=simpleQuery, 

1089 columns=columns, 

1090 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

1091 graph=graph, 

1092 whereRegion=self.whereRegion if not unique else None, 

1093 managers=self.managers, 

1094 doomed_by=self._doomed_by, 

1095 ) 

1096 

1097 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

1098 # Docstring inherited from Query. 

1099 from ._builder import QueryBuilder 

1100 if summary is None: 

1101 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

1102 if not summary.requested.issubset(self.graph): 

1103 raise NotImplementedError( 

1104 f"Query.makeBuilder does not yet support augmenting dimensions " 

1105 f"({summary.requested.dimensions}) beyond those originally included in the query " 

1106 f"({self.graph.dimensions})." 

1107 ) 

1108 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

1109 builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns()) 

1110 return builder 

1111 

1112 

1113class EmptyQuery(Query): 

1114 """A `Query` implementation that handes the special case where the query 

1115 would have no columns. 

1116 

1117 Parameters 

1118 ---------- 

1119 universe : `DimensionUniverse` 

1120 Set of all dimensions from which the null set is extracted. 

1121 managers : `RegistryManagers` 

1122 A struct containing the registry manager instances used by the query 

1123 system. 

1124 doomed_by : `Iterable` [ `str` ], optional 

1125 A list of messages (appropriate for e.g. logging or exceptions) that 

1126 explain why the query is known to return no results even before it is 

1127 executed. Queries with a non-empty list will never be executed. 

1128 """ 

1129 def __init__( 

1130 self, 

1131 universe: DimensionUniverse, 

1132 managers: RegistryManagers, 

1133 doomed_by: Iterable[str] = (), 

1134 ): 

1135 super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by) 

1136 

1137 def isUnique(self) -> bool: 

1138 # Docstring inherited from Query. 

1139 return True 

1140 

1141 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1142 # Docstring inherited from Query. 

1143 raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).") 

1144 

1145 @property 

1146 def spatial(self) -> Iterator[DimensionElement]: 

1147 # Docstring inherited from Query. 

1148 return iter(()) 

1149 

1150 def order_by(self, *args: str) -> Query: 

1151 # Docstring inherited from Query. 

1152 return self 

1153 

1154 def limit(self, limit: int, offset: Optional[int] = None) -> Query: 

1155 # Docstring inherited from Query. 

1156 return self 

1157 

1158 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

1159 # Docstring inherited from Query. 

1160 raise KeyError(f"No region for {name} in query (no regions at all, actually).") 

1161 

1162 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

1163 # Docstring inherited from Query. 

1164 return None 

1165 

1166 def rows(self, db: Database, *, region: Optional[Region] = None 

1167 ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]: 

1168 if not self._doomed_by: 

1169 yield None 

1170 

1171 @property 

1172 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

1173 # Docstring inherited from Query. 

1174 return None 

1175 

1176 @contextmanager 

1177 def materialize(self, db: Database) -> Iterator[Query]: 

1178 # Docstring inherited from Query. 

1179 yield self 

1180 

1181 def subset(self, *, graph: Optional[DimensionGraph] = None, 

1182 datasets: bool = True, 

1183 unique: bool = False) -> Query: 

1184 # Docstring inherited from Query. 

1185 assert graph is None or graph.issubset(self.graph) 

1186 return self 

1187 

1188 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

1189 # Docstring inherited from Query. 

1190 from ._builder import QueryBuilder 

1191 if summary is None: 

1192 summary = QuerySummary(self.graph) 

1193 if not summary.requested.issubset(self.graph): 

1194 raise NotImplementedError( 

1195 f"Query.makeBuilder does not yet support augmenting dimensions " 

1196 f"({summary.requested.dimensions}) beyond those originally included in the query " 

1197 f"({self.graph.dimensions})." 

1198 ) 

1199 return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
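# Illustrative sketch, not part of the module: `EmptyQuery` represents the
# zero-dimension case, yielding a single `None` row that maps to an empty data
# ID (assuming `universe`, `managers`, and `db` instances are available).
#
#     query = EmptyQuery(universe, managers)
#     result_rows = list(query.rows(db))           # [None] unless doomed
#     data_id = query.extractDataId(result_rows[0])  # DataCoordinate.makeEmpty(...)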