Coverage for python/lsst/daf/butler/registry/queries/_query.py: 24%

362 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-09-22 02:05 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25import dataclasses 

26import enum 

27import itertools 

28from abc import ABC, abstractmethod 

29from contextlib import contextmanager 

30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple 

31 

32import sqlalchemy 

33from lsst.sphgeom import Region 

34 

35from ...core import ( 

36 DataCoordinate, 

37 DatasetRef, 

38 DatasetType, 

39 Dimension, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 SimpleQuery, 

45 addDimensionForeignKey, 

46 ddl, 

47) 

48from ..interfaces import Database 

49from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

50 

51if TYPE_CHECKING: 51 ↛ 52line 51 didn't jump to line 52, because the condition on line 51 was never true

52 from ._builder import QueryBuilder 

53 

54 

@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Information about a single column in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    # NOTE: the original docstring incorrectly described this field as a
    # column *name* (`str` or `None`); it is always a SQLAlchemy column
    # expression, as `column_order` below demonstrates.
    """Column to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element for use in ORDER BY clause
        (`sqlalchemy.sql.ColumnElement`)
        """
        return self.column.asc() if self.ordering else self.column.desc()

71 

72 

class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.managers = managers
        self._doomed_by = tuple(doomed_by)
        # Diagnostics counters populated by `rows`; `None` until `rows` has
        # actually been executed (see `explain_no_results`).
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # Special case: a query with no columns has exactly one logical
            # row (see `sql` property docs).
            return 1
        # NOTE(review): `self.spatial` is an iterator and iterators are always
        # truthy, so with ``exact=True`` this always takes the row-by-row
        # counting path even when there are no spatial columns — confirm
        # whether an emptiness check was intended here.
        if exact and self.spatial:
            filtered_count = 0
            for _ in self.rows(db):
                filtered_count += 1
            return filtered_count
        else:
            return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar()

    def any(
        self,
        db: Database,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            # No-column query: exactly one logical row, so "any" is True.
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        # NOTE(review): as in `count`, `self.spatial` is an always-truthy
        # iterator, so ``exact=True`` always takes the full-scan path.
        if exact and self.spatial:
            for _ in self.rows(db):
                return True
            return False
        elif execute:
            return db.query(sql.limit(1)).one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        # Imported here (not at module scope) to avoid a circular import with
        # the builder module.
        from ._builder import QueryBuilder

        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.managers)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        """Execute the query and yield result rows, applying `predicate`.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).
        """
        if self._doomed_by:
            return
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        for row in db.query(self.sql):
            rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
            if self.whereRegion and any(r.isDisjointFrom(self.whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Fixed: the original used a redundant double negation
            # (``not not any(...)``), which is truth-equivalent to
            # ``any(...)`` but reads as its opposite.  Skip rows in which any
            # pair of regions is disjoint (i.e. a failed spatial join).
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.RowProxy],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, this will be all dimensions in `QuerySummary.requested`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                key = tuple(dataId.subset(element.graph).values())
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.RowProxy,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  A minimal (i.e. base class)
            `DataCoordinate` is constructed from ``row`` if `None`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records to use to return an `ExpandedDataCoordinate`.  If provided,
            outer keys must include all dimension element names in ``graph``,
            and inner keys should be tuples of dimension primary key values
            in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        # Resolve the run foreign key to a collection record to get its name.
        runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        # If nothing would actually change, signal that via columns=None so
        # the caller can return ``self`` unmodified.
        if (
            graph == self.graph
            and (self.getDatasetColumns() is None or datasets)
            and (self.isUnique() or not unique)
        ):
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        if not unique:
            # Region columns are only carried along for non-unique subsets;
            # including them would defeat SELECT DISTINCT (see `subset` docs).
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and self.getDatasetColumns() is not None:
            columns.datasets = self.getDatasetColumns()
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `predicate` and `rows`) where the regions in those two columns
        overlap.  If the user then wants to subset to just (skymap, tract) with
        unique results, we have two unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information and
          including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query, and
          continue to filter out the non-overlapping pairs, but completely
          disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    managers: RegistryManagers
    """A struct containing `Registry` helper objects (`RegistryManagers`).
    """

675 

class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways a query's result rows can be unique (or not)."""

    NOT_UNIQUE = enum.auto()
    """Result rows are not expected to be unique."""

    NATURALLY_UNIQUE = enum.auto()
    """Row uniqueness is guaranteed by how the query was constructed, with no
    need for SELECT DISTINCT or a GROUP BY clause.
    """

    NEEDS_DISTINCT = enum.auto()
    """Unique result rows are expected, but an explicit SELECT DISTINCT or
    equivalent GROUP BY clause is required to achieve them.
    """

695 

696class DirectQuery(Query): 

697 """A `Query` implementation that represents a direct SELECT query that 

698 usually joins many tables. 

699 

700 `DirectQuery` objects should generally only be constructed by 

701 `QueryBuilder` or the methods of other `Query` objects. 

702 

703 Parameters 

704 ---------- 

705 simpleQuery : `SimpleQuery` 

706 Struct representing the actual SELECT, FROM, and WHERE clauses. 

707 columns : `QueryColumns` 

708 Columns that are referenced in the query in any clause. 

709 uniqueness : `DirectQueryUniqueness` 

710 Enum value indicating whether the query should yield unique result 

711 rows, and if so whether that needs to be explicitly requested of the 

712 database. 

713 graph : `DimensionGraph` 

714 Object describing the dimensions included in the query. 

715 whereRegion : `lsst.sphgeom.Region`, optional 

716 Region that all region columns in all returned rows must overlap. 

717 managers : `RegistryManagers` 

718 Struct containing the `Registry` manager helper objects, to be 

719 forwarded to the `Query` constructor. 

720 doomed_by : `Iterable` [ `str` ], optional 

721 A list of messages (appropriate for e.g. logging or exceptions) that 

722 explain why the query is known to return no results even before it is 

723 executed. Queries with a non-empty list will never be executed. 

724 """ 

725 

    def __init__(
        self,
        *,
        simpleQuery: SimpleQuery,
        columns: QueryColumns,
        uniqueness: DirectQueryUniqueness,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        order_by_columns: Iterable[OrderByColumn] = (),
        limit: Optional[Tuple[int, Optional[int]]] = None,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        # The stored query must have an empty SELECT list; the `sql` property
        # appends columns to a copy each time it is evaluated.
        assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
        assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
        self._simpleQuery = simpleQuery
        self._columns = columns
        self._uniqueness = uniqueness
        self._order_by_columns = order_by_columns
        self._limit = limit
        # Lazily-populated caches of labeled column objects, so repeated
        # lookups (e.g. from `sql` and `rows`) return identical SQLAlchemy
        # objects.
        self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
        self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
        self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}

750 

751 def isUnique(self) -> bool: 

752 # Docstring inherited from Query. 

753 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE 

754 

755 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

756 # Docstring inherited from Query. 

757 column = self._dimensionColumns.get(name) 

758 if column is None: 

759 column = self._columns.getKeyColumn(name).label(name) 

760 self._dimensionColumns[name] = column 

761 return column 

762 

763 @property 

764 def spatial(self) -> Iterator[DimensionElement]: 

765 # Docstring inherited from Query. 

766 return iter(self._columns.regions) 

767 

768 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

769 # Docstring inherited from Query. 

770 column = self._regionColumns.get(name) 

771 if column is None: 

772 column = self._columns.regions[name].label(f"{name}_region") 

773 self._regionColumns[name] = column 

774 return column 

775 

776 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

777 # Docstring inherited from Query. 

778 if self._datasetQueryColumns is None: 

779 base = self._columns.datasets 

780 if base is None: 

781 return None 

782 ingestDate = base.ingestDate 

783 if ingestDate is not None: 

784 ingestDate = ingestDate.label("ingest_date") 

785 self._datasetQueryColumns = DatasetQueryColumns( 

786 datasetType=base.datasetType, 

787 id=base.id.label("dataset_id"), 

788 runKey=base.runKey.label(self.managers.collections.getRunForeignKeyName()), 

789 ingestDate=ingestDate, 

790 ) 

791 return self._datasetQueryColumns 

792 

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        # Work on a copy so this property can be evaluated repeatedly without
        # accumulating columns on the stored query (see the assert in
        # __init__).
        simpleQuery = self._simpleQuery.copy()
        for dimension in self.graph:
            simpleQuery.columns.append(self.getDimensionColumn(dimension.name))
        for element in self.spatial:
            simpleQuery.columns.append(self.getRegionColumn(element.name))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            simpleQuery.columns.extend(datasetColumns)

        assert not simpleQuery.order_by, "Input query cannot have ORDER BY"
        if self._order_by_columns:
            # add ORDER BY column
            # A ROW_NUMBER() window over the requested ordering is selected as
            # a "_orderby" column and then ordered on, rather than ordering on
            # the expressions directly; `_makeTableSpec` adds a matching
            # column so the ordering can survive materialization into a
            # temporary table.
            order_by_columns = [column.column_order for column in self._order_by_columns]
            order_by_column = sqlalchemy.func.row_number().over(order_by=order_by_columns).label("_orderby")
            simpleQuery.columns.append(order_by_column)
            simpleQuery.order_by = [order_by_column]

        assert simpleQuery.limit is None, "Input query cannot have LIMIT"
        simpleQuery.limit = self._limit

        sql = simpleQuery.combine()

        # Only NEEDS_DISTINCT requires explicit deduplication in SQL;
        # NATURALLY_UNIQUE and NOT_UNIQUE return the query as-is.
        if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
            return sql.distinct()
        else:
            return sql

822 

    def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
        """Helper method for subclass implementations of `materialize`.

        Parameters
        ----------
        constraints : `bool`, optional
            If `True` (`False` is default), define a specification that
            includes actual foreign key constraints for logical foreign keys.
            Some database engines do not permit temporary tables to reference
            normal tables, so this should be `False` when generating a spec
            for a temporary table unless the database engine is known to
            support them.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table that could hold this query's result rows.
        """
        # When rows are unique, the dimension (and dataset) key columns can
        # double as the table's primary key.
        unique = self.isUnique()
        spec = ddl.TableSpec(fields=())
        for dimension in self.graph:
            addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints)
        for element in self.spatial:
            # Region columns use the same "<element>_region" naming as
            # `getRegionColumn`.
            spec.fields.add(ddl.FieldSpec.for_region(f"{element.name}_region"))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            self.managers.datasets.addDatasetForeignKey(spec, primaryKey=unique, constraint=constraints)
            self.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints)

        # Need a column for ORDER BY if ordering is requested
        # (matches the "_orderby" ROW_NUMBER() column selected by `sql`).
        if self._order_by_columns:
            spec.fields.add(
                ddl.FieldSpec(
                    name="_orderby",
                    dtype=sqlalchemy.BigInteger,
                    nullable=False,
                    doc="Column to use with ORDER BY",
                )
            )

        return spec

864 

865 @contextmanager 

866 def materialize(self, db: Database) -> Iterator[Query]: 

867 # Docstring inherited from Query. 

868 spec = self._makeTableSpec() 

869 with db.session() as session: 

870 table = session.makeTemporaryTable(spec) 

871 if not self._doomed_by: 

872 db.insert(table, select=self.sql, names=spec.fields.names) 

873 yield MaterializedQuery( 

874 table=table, 

875 spatial=self.spatial, 

876 datasetType=self.datasetType, 

877 isUnique=self.isUnique(), 

878 graph=self.graph, 

879 whereRegion=self.whereRegion, 

880 managers=self.managers, 

881 doomed_by=self._doomed_by, 

882 ) 

883 session.dropTemporaryTable(table) 

884 

885 def subset( 

886 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

887 ) -> Query: 

888 # Docstring inherited from Query. 

889 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

890 if columns is None: 

891 return self 

892 if columns.isEmpty(): 

893 return EmptyQuery(self.graph.universe, self.managers) 

894 return DirectQuery( 

895 simpleQuery=self._simpleQuery.copy(), 

896 columns=columns, 

897 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

898 graph=graph, 

899 whereRegion=self.whereRegion if not unique else None, 

900 managers=self.managers, 

901 doomed_by=self._doomed_by, 

902 ) 

903 

904 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

905 # Docstring inherited from Query. 

906 from ._builder import QueryBuilder 

907 

908 if summary is None: 

909 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

910 if not summary.requested.issubset(self.graph): 

911 raise NotImplementedError( 

912 f"Query.makeBuilder does not yet support augmenting dimensions " 

913 f"({summary.requested.dimensions}) beyond those originally included in the query " 

914 f"({self.graph.dimensions})." 

915 ) 

916 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

917 builder.joinTable( 

918 self.sql.alias(), dimensions=self.graph.dimensions, datasets=self.getDatasetColumns() 

919 ) 

920 return builder 

921 

922 

class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid
        (which may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Dimension columns are stored under the dimension's own name.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Region columns use the "<element>_region" naming convention set up
        # by _makeTableSpec.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        run_key_name = self.managers.collections.getRunForeignKeyName()
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[run_key_name],
            # ingest_date is not carried into the materialized table.
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        selected = self._table.select()
        if "_orderby" in self._table.columns:
            # The original ordering was recorded in a dedicated column when
            # the table was populated; honor it here.
            selected = selected.order_by(self._table.columns["_orderby"])
        return selected

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table; hand back this instance unchanged.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            return self
        if columns.isEmpty():
            return EmptyQuery(self.graph.universe, managers=self.managers)
        base = SimpleQuery()
        base.join(self._table)
        uniqueness = (
            DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE
        )
        return DirectQuery(
            simpleQuery=base,
            columns=columns,
            uniqueness=uniqueness,
            graph=graph,
            whereRegion=None if unique else self.whereRegion,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # Unlike DirectQuery, the table can be joined directly rather than
        # via a subquery alias.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder

1053 

1054 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        # The single conceptual (empty) row is trivially unique.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query yields nothing; otherwise there is exactly one
        # conceptual row with no columns, represented here by `None`.
        if not self._doomed_by:
            yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        # There is no SQL to run for a query with no columns.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Nothing to save; this instance already represents the result.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any subset of the empty dimension set is still empty.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        # No table to join; the builder starts from the summary alone.
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)