Coverage for python/lsst/daf/butler/registry/queries/_query.py: 23%

365 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-04 02:04 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25import dataclasses 

26import enum 

27import itertools 

28from abc import ABC, abstractmethod 

29from contextlib import contextmanager 

30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple 

31 

32import sqlalchemy 

33from lsst.sphgeom import Region 

34 

35from ...core import ( 

36 DataCoordinate, 

37 DatasetRef, 

38 DatasetType, 

39 Dimension, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 SimpleQuery, 

45 addDimensionForeignKey, 

46 ddl, 

47) 

48from ..interfaces import Database 

49from ._query_backend import QueryBackend 

50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary 

51 

52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true

53 from ._builder import QueryBuilder 

54 

55 

@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Information about a single column in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element for use in an ORDER BY clause
        (`sqlalchemy.sql.ColumnElement`), with the requested direction
        applied.
        """
        return self.column.asc() if self.ordering else self.column.desc()

72 

73 

74class Query(ABC): 

75 """An abstract base class for queries that return some combination of 

76 `DatasetRef` and `DataCoordinate` objects. 

77 

78 Parameters 

79 ---------- 

80 graph : `DimensionGraph` 

81 Object describing the dimensions included in the query. 

82 whereRegion : `lsst.sphgeom.Region`, optional 

83 Region that all region columns in all returned rows must overlap. 

84 backend : `QueryBackend` 

85 Backend object that represents the `Registry` implementation. 

86 doomed_by : `Iterable` [ `str` ], optional 

87 A list of messages (appropriate for e.g. logging or exceptions) that 

88 explain why the query is known to return no results even before it is 

89 executed. Queries with a non-empty list will never be executed. 

90 

91 Notes 

92 ----- 

93 The `Query` hierarchy abstracts over the database/SQL representation of a 

94 particular set of data IDs or datasets. It is expected to be used as a 

95 backend for other objects that provide more natural interfaces for one or 

96 both of these, not as part of a public interface to query results. 

97 """ 

98 

99 def __init__( 

100 self, 

101 *, 

102 graph: DimensionGraph, 

103 whereRegion: Optional[Region], 

104 backend: QueryBackend, 

105 doomed_by: Iterable[str] = (), 

106 ): 

107 self.graph = graph 

108 self.whereRegion = whereRegion 

109 self.backend = backend 

110 self._doomed_by = tuple(doomed_by) 

111 self._filtered_by_join: Optional[int] = None 

112 self._filtered_by_where: Optional[int] = None 

113 

114 @abstractmethod 

115 def isUnique(self) -> bool: 

116 """Return `True` if this query's rows are guaranteed to be unique, and 

117 `False` otherwise. 

118 

119 If this query has dataset results (`datasetType` is not `None`), 

120 uniqueness applies to the `DatasetRef` instances returned by 

121 `extractDatasetRef` from the result of `rows`. If it does not have 

122 dataset results, uniqueness applies to the `DataCoordinate` instances 

123 returned by `extractDataId`. 

124 """ 

125 raise NotImplementedError() 

126 

127 @abstractmethod 

128 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

129 """Return the query column that contains the primary key value for 

130 the dimension with the given name. 

131 

132 Parameters 

133 ---------- 

134 name : `str` 

135 Name of the dimension. 

136 

137 Returns 

138 ------- 

139 column : `sqlalchemy.sql.ColumnElement`. 

140 SQLAlchemy object representing a column in the query. 

141 

142 Notes 

143 ----- 

144 This method is intended primarily as a hook for subclasses to implement 

145 and the ABC to call in order to provide higher-level functionality; 

146 code that uses `Query` objects (but does not implement one) should 

147 usually not have to call this method. 

148 """ 

149 raise NotImplementedError() 

150 

151 @property 

152 @abstractmethod 

153 def spatial(self) -> Iterator[DimensionElement]: 

154 """An iterator over the dimension element columns used in post-query 

155 filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]). 

156 

157 Notes 

158 ----- 

159 This property is intended primarily as a hook for subclasses to 

160 implement and the ABC to call in order to provide higher-level 

161 functionality; code that uses `Query` objects (but does not implement 

162 one) should usually not have to access this property. 

163 """ 

164 raise NotImplementedError() 

165 

166 @abstractmethod 

167 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

168 """Return a region column for one of the dimension elements iterated 

169 over by `spatial`. 

170 

171 Parameters 

172 ---------- 

173 name : `str` 

174 Name of the element. 

175 

176 Returns 

177 ------- 

178 column : `sqlalchemy.sql.ColumnElement` 

179 SQLAlchemy representing a result column in the query. 

180 

181 Notes 

182 ----- 

183 This method is intended primarily as a hook for subclasses to implement 

184 and the ABC to call in order to provide higher-level functionality; 

185 code that uses `Query` objects (but does not implement one) should 

186 usually not have to call this method. 

187 """ 

188 raise NotImplementedError() 

189 

190 @property 

191 def datasetType(self) -> Optional[DatasetType]: 

192 """The `DatasetType` of datasets returned by this query, or `None` 

193 if there are no dataset results (`DatasetType` or `None`). 

194 """ 

195 cols = self.getDatasetColumns() 

196 if cols is None: 

197 return None 

198 return cols.datasetType 

199 

200 def count(self, db: Database, *, exact: bool = True) -> int: 

201 """Count the number of rows this query would return. 

202 

203 Parameters 

204 ---------- 

205 db : `Database` 

206 Object managing the database connection. 

207 exact : `bool`, optional 

208 If `True`, run the full query and perform post-query filtering if 

209 needed to account for that filtering in the count. If `False`, the 

210 result may be an upper bound. 

211 

212 Returns 

213 ------- 

214 count : `int` 

215 The number of rows the query would return, or an upper bound if 

216 ``exact=False``. 

217 

218 Notes 

219 ----- 

220 This counts the number of rows returned, not the number of unique rows 

221 returned, so even with ``exact=True`` it may provide only an upper 

222 bound on the number of *deduplicated* result rows. 

223 """ 

224 if self._doomed_by: 

225 return 0 

226 sql = self.sql 

227 if sql is None: 

228 return 1 

229 if exact and self.spatial: 

230 filtered_count = 0 

231 for _ in self.rows(db): 

232 filtered_count += 1 

233 return filtered_count 

234 else: 

235 with db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)) as sql_result: 

236 return sql_result.scalar() 

237 

238 def any( 

239 self, 

240 db: Database, 

241 *, 

242 execute: bool = True, 

243 exact: bool = True, 

244 ) -> bool: 

245 """Test whether this query returns any results. 

246 

247 Parameters 

248 ---------- 

249 db : `Database` 

250 Object managing the database connection. 

251 execute : `bool`, optional 

252 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

253 determined prior to execution that the query would return no rows. 

254 exact : `bool`, optional 

255 If `True`, run the full query and perform post-query filtering if 

256 needed, until at least one result row is found. If `False`, the 

257 returned result does not account for post-query filtering, and 

258 hence may be `True` even when all result rows would be filtered 

259 out. 

260 

261 Returns 

262 ------- 

263 any : `bool` 

264 `True` if the query would (or might, depending on arguments) yield 

265 result rows. `False` if it definitely would not. 

266 """ 

267 if self._doomed_by: 

268 return False 

269 sql = self.sql 

270 if sql is None: 

271 return True 

272 if exact and not execute: 

273 raise TypeError("Cannot obtain exact results without executing the query.") 

274 if exact and self.spatial: 

275 for _ in self.rows(db): 

276 return True 

277 return False 

278 elif execute: 

279 with db.query(sql.limit(1)) as sql_result: 

280 return sql_result.one_or_none() is not None 

281 else: 

282 return True 

283 

284 def explain_no_results( 

285 self, 

286 db: Database, 

287 *, 

288 followup: bool = True, 

289 ) -> Iterator[str]: 

290 """Return human-readable messages that may help explain why the query 

291 yields no results. 

292 

293 Parameters 

294 ---------- 

295 db : `Database` 

296 Object managing the database connection. 

297 followup : `bool`, optional 

298 If `True` (default) perform inexpensive follow-up queries if no 

299 diagnostics are available from query generation alone. 

300 

301 Returns 

302 ------- 

303 messages : `Iterator` [ `str` ] 

304 String messages that describe reasons the query might not yield any 

305 results. 

306 

307 Notes 

308 ----- 

309 Messages related to post-query filtering are only available if `rows`, 

310 `any`, or `count` was already called with the same region (with 

311 ``exact=True`` for the latter two). 

312 """ 

313 from ._builder import QueryBuilder 

314 

315 if self._doomed_by: 

316 yield from self._doomed_by 

317 return 

318 if self._filtered_by_where: 

319 yield ( 

320 f"{self._filtered_by_where} result rows were filtered out because " 

321 "one or more region did not overlap the WHERE-clause region." 

322 ) 

323 if self._filtered_by_join: 

324 yield ( 

325 f"{self._filtered_by_join} result rows were filtered out because " 

326 "one or more regions did not overlap." 

327 ) 

328 if (not followup) or self._filtered_by_join or self._filtered_by_where: 

329 return 

330 # Query didn't return results even before client-side filtering, and 

331 # caller says we can do follow-up queries to determine why. 

332 # Start by seeing if there are _any_ dimension records for each element 

333 # involved. 

334 for element in self.graph.elements: 

335 summary = QuerySummary(element.graph) 

336 builder = QueryBuilder(summary, self.backend) 

337 followup_query = builder.finish() 

338 if not followup_query.any(db, exact=False): 

339 yield f"No dimension records for element '{element.name}' found." 

340 yield from followup_query.explain_no_results(db, followup=False) 

341 return 

342 

343 @abstractmethod 

344 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

345 """Return the columns for the datasets returned by this query. 

346 

347 Returns 

348 ------- 

349 columns : `DatasetQueryColumns` or `None` 

350 Struct containing SQLAlchemy representations of the result columns 

351 for a dataset. 

352 

353 Notes 

354 ----- 

355 This method is intended primarily as a hook for subclasses to implement 

356 and the ABC to call in order to provide higher-level functionality; 

357 code that uses `Query` objects (but does not implement one) should 

358 usually not have to call this method. 

359 """ 

360 raise NotImplementedError() 

361 

362 @property 

363 @abstractmethod 

364 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

365 """A SQLAlchemy object representing the full query 

366 (`sqlalchemy.sql.FromClause` or `None`). 

367 

368 This is `None` in the special case where the query has no columns, and 

369 only one logical row. 

370 """ 

371 raise NotImplementedError() 

372 

373 def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.Row]]: 

374 """Execute the query and yield result rows, applying `predicate`. 

375 

376 Parameters 

377 ---------- 

378 db : `Database` 

379 Object managing the database connection. 

380 

381 Yields 

382 ------ 

383 row : `sqlalchemy.engine.RowProxy` or `None` 

384 Result row from the query. `None` may yielded exactly once instead 

385 of any real rows to indicate an empty query (see `EmptyQuery`). 

386 """ 

387 if self._doomed_by: 

388 return 

389 self._filtered_by_where = 0 

390 self._filtered_by_join = 0 

391 with db.query(self.sql) as sql_result: 

392 sql_rows = sql_result.fetchall() 

393 for row in sql_rows: 

394 rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial] 

395 if self.whereRegion and any(r.isDisjointFrom(self.whereRegion) for r in rowRegions): 

396 self._filtered_by_where += 1 

397 continue 

398 if not not any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)): 

399 self._filtered_by_join += 1 

400 continue 

401 yield row 

402 

403 def extractDimensionsTuple( 

404 self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension] 

405 ) -> tuple: 

406 """Extract a tuple of data ID values from a result row. 

407 

408 Parameters 

409 ---------- 

410 row : `sqlalchemy.engine.RowProxy` or `None` 

411 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

412 the row from an `EmptyQuery`. 

413 dimensions : `Iterable` [ `Dimension` ] 

414 The dimensions to include in the returned tuple, in order. 

415 

416 Returns 

417 ------- 

418 values : `tuple` 

419 A tuple of dimension primary key values. 

420 """ 

421 if row is None: 

422 assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions." 

423 return () 

424 return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions) 

425 

426 def extractDataId( 

427 self, 

428 row: Optional[sqlalchemy.engine.RowProxy], 

429 *, 

430 graph: Optional[DimensionGraph] = None, 

431 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

432 ) -> DataCoordinate: 

433 """Extract a data ID from a result row. 

434 

435 Parameters 

436 ---------- 

437 row : `sqlalchemy.engine.RowProxy` or `None` 

438 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

439 the row from an `EmptyQuery`. 

440 graph : `DimensionGraph`, optional 

441 The dimensions the returned data ID should identify. If not 

442 provided, this will be all dimensions in `QuerySummary.requested`. 

443 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

444 Nested mapping containing records to attach to the returned 

445 `DataCoordinate`, for which `~DataCoordinate.hasRecords` will 

446 return `True`. If provided, outer keys must include all dimension 

447 element names in ``graph``, and inner keys should be tuples of 

448 dimension primary key values in the same order as 

449 ``element.graph.required``. If not provided, 

450 `DataCoordinate.hasRecords` will return `False` on the returned 

451 object. 

452 

453 Returns 

454 ------- 

455 dataId : `DataCoordinate` 

456 A data ID that identifies all required and implied dimensions. If 

457 ``records is not None``, this is have 

458 `~DataCoordinate.hasRecords()` return `True`. 

459 """ 

460 if graph is None: 

461 graph = self.graph 

462 if not graph: 

463 return DataCoordinate.makeEmpty(self.graph.universe) 

464 dataId = DataCoordinate.fromFullValues( 

465 graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied)) 

466 ) 

467 if records is not None: 

468 recordsForRow = {} 

469 for element in graph.elements: 

470 key = tuple(dataId.subset(element.graph).values()) 

471 recordsForRow[element.name] = records[element.name].get(key) 

472 return dataId.expanded(recordsForRow) 

473 else: 

474 return dataId 

475 

476 def extractDatasetRef( 

477 self, 

478 row: sqlalchemy.engine.RowProxy, 

479 dataId: Optional[DataCoordinate] = None, 

480 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

481 ) -> DatasetRef: 

482 """Extract a `DatasetRef` from a result row. 

483 

484 Parameters 

485 ---------- 

486 row : `sqlalchemy.engine.RowProxy` 

487 A result row from a SQLAlchemy SELECT query. 

488 dataId : `DataCoordinate` 

489 Data ID to attach to the `DatasetRef`. A minimal (i.e. base class) 

490 `DataCoordinate` is constructed from ``row`` if `None`. 

491 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

492 Records to use to return an `ExpandedDataCoordinate`. If provided, 

493 outer keys must include all dimension element names in ``graph``, 

494 and inner keys should be tuples of dimension primary key values 

495 in the same order as ``element.graph.required``. 

496 

497 Returns 

498 ------- 

499 ref : `DatasetRef` 

500 Reference to the dataset; guaranteed to have `DatasetRef.id` not 

501 `None`. 

502 """ 

503 datasetColumns = self.getDatasetColumns() 

504 assert datasetColumns is not None 

505 if dataId is None: 

506 dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records) 

507 runRecord = self.backend.managers.collections[row._mapping[datasetColumns.runKey]] 

508 return DatasetRef( 

509 datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name 

510 ) 

511 

512 def _makeSubsetQueryColumns( 

513 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

514 ) -> Tuple[DimensionGraph, Optional[QueryColumns]]: 

515 """Helper method for subclass implementations of `subset`. 

516 

517 Parameters 

518 ---------- 

519 graph : `DimensionGraph`, optional 

520 Dimensions to include in the new `Query` being constructed. 

521 ``subset`` implementations should generally just forward their 

522 own ``graph`` argument here. 

523 datasets : `bool`, optional 

524 Whether the new `Query` should include dataset results. Defaults 

525 to `True`, but is ignored if ``self`` does not include dataset 

526 results. 

527 unique : `bool`, optional 

528 Whether the new `Query` should guarantee unique results (this may 

529 come with a performance penalty). 

530 

531 Returns 

532 ------- 

533 graph : `DimensionGraph` 

534 The dimensions of the new `Query`. This is exactly the same as 

535 the argument of the same name, with ``self.graph`` used if that 

536 argument is `None`. 

537 columns : `QueryColumns` or `None` 

538 A struct containing the SQLAlchemy column objects to use in the 

539 new query, constructed by delegating to other (mostly abstract) 

540 methods on ``self``. If `None`, `subset` may return ``self``. 

541 """ 

542 if graph is None: 

543 graph = self.graph 

544 if ( 

545 graph == self.graph 

546 and (self.getDatasetColumns() is None or datasets) 

547 and (self.isUnique() or not unique) 

548 ): 

549 return graph, None 

550 columns = QueryColumns() 

551 for dimension in graph.dimensions: 

552 col = self.getDimensionColumn(dimension.name) 

553 columns.keys[dimension] = [col] 

554 if not unique: 

555 for element in self.spatial: 

556 col = self.getRegionColumn(element.name) 

557 columns.regions[element] = col 

558 if datasets and self.getDatasetColumns() is not None: 

559 columns.datasets = self.getDatasetColumns() 

560 return graph, columns 

561 

562 @abstractmethod 

563 def materialize(self, db: Database) -> ContextManager[Query]: 

564 """Execute this query and insert its results into a temporary table. 

565 

566 Parameters 

567 ---------- 

568 db : `Database` 

569 Database engine to execute the query against. 

570 

571 Returns 

572 ------- 

573 context : `typing.ContextManager` [ `MaterializedQuery` ] 

574 A context manager that ensures the temporary table is created and 

575 populated in ``__enter__`` (returning a `MaterializedQuery` object 

576 backed by that table), and dropped in ``__exit__``. If ``self`` 

577 is already a `MaterializedQuery`, ``__enter__`` may just return 

578 ``self`` and ``__exit__`` may do nothing (reflecting the fact that 

579 an outer context manager should already take care of everything 

580 else). 

581 """ 

582 raise NotImplementedError() 

583 

584 @abstractmethod 

585 def subset( 

586 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

587 ) -> Query: 

588 """Return a new `Query` whose columns and/or rows are (mostly) subset 

589 of this one's. 

590 

591 Parameters 

592 ---------- 

593 graph : `DimensionGraph`, optional 

594 Dimensions to include in the new `Query` being constructed. 

595 If `None` (default), ``self.graph`` is used. 

596 datasets : `bool`, optional 

597 Whether the new `Query` should include dataset results. Defaults 

598 to `True`, but is ignored if ``self`` does not include dataset 

599 results. 

600 unique : `bool`, optional 

601 Whether the new `Query` should guarantee unique results (this may 

602 come with a performance penalty). 

603 

604 Returns 

605 ------- 

606 query : `Query` 

607 A query object corresponding to the given inputs. May be ``self`` 

608 if no changes were requested. 

609 

610 Notes 

611 ----- 

612 The way spatial overlaps are handled at present makes it impossible to 

613 fully guarantee in general that the new query's rows are a subset of 

614 this one's while also returning unique rows. That's because the 

615 database is only capable of performing approximate, conservative 

616 overlaps via the common skypix system; we defer actual region overlap 

617 operations to per-result-row Python logic. But including the region 

618 columns necessary to do that postprocessing in the query makes it 

619 impossible to do a SELECT DISTINCT on the user-visible dimensions of 

620 the query. For example, consider starting with a query with dimensions 

621 (instrument, skymap, visit, tract). That involves a spatial join 

622 between visit and tract, and we include the region columns from both 

623 tables in the results in order to only actually yield result rows 

624 (see `predicate` and `rows`) where the regions in those two columns 

625 overlap. If the user then wants to subset to just (skymap, tract) with 

626 unique results, we have two unpalatable options: 

627 

628 - we can do a SELECT DISTINCT with just the skymap and tract columns 

629 in the SELECT clause, dropping all detailed overlap information and 

630 including some tracts that did not actually overlap any of the 

631 visits in the original query (but were regarded as _possibly_ 

632 overlapping via the coarser, common-skypix relationships); 

633 

634 - we can include the tract and visit region columns in the query, and 

635 continue to filter out the non-overlapping pairs, but completely 

636 disregard the user's request for unique tracts. 

637 

638 This interface specifies that implementations must do the former, as 

639 that's what makes things efficient in our most important use case 

640 (``QuantumGraph`` generation in ``pipe_base``). We may be able to 

641 improve this situation in the future by putting exact overlap 

642 information in the database, either by using built-in (but 

643 engine-specific) spatial database functionality or (more likely) 

644 switching to a scheme in which pairwise dimension spatial relationships 

645 are explicitly precomputed (for e.g. combinations of instruments and 

646 skymaps). 

647 """ 

648 raise NotImplementedError() 

649 

650 @abstractmethod 

651 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

652 """Return a `QueryBuilder` that can be used to construct a new `Query` 

653 that is joined to (and hence constrained by) this one. 

654 

655 Parameters 

656 ---------- 

657 summary : `QuerySummary`, optional 

658 A `QuerySummary` instance that specifies the dimensions and any 

659 additional constraints to include in the new query being 

660 constructed, or `None` to use the dimensions of ``self`` with no 

661 additional constraints. 

662 """ 

663 raise NotImplementedError() 

664 

665 graph: DimensionGraph 

666 """The dimensions identified by this query and included in any data IDs 

667 created from its result rows (`DimensionGraph`). 

668 """ 

669 

670 whereRegion: Optional[Region] 

671 """A spatial region that all regions in all rows returned by this query 

672 must overlap (`lsst.sphgeom.Region` or `None`). 

673 """ 

674 

675 backend: QueryBackend 

676 """Backend object that represents the `Registry` implementation. 

677 """ 

678 

679 

class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways a query's result rows can be (or fail to be)
    unique.
    """

    NOT_UNIQUE = enum.auto()
    """Duplicate result rows are possible; no uniqueness is expected.
    """

    NATURALLY_UNIQUE = enum.auto()
    """The query is built such that its result rows are already unique,
    with no need for SELECT DISTINCT or GROUP BY.
    """

    NEEDS_DISTINCT = enum.auto()
    """Unique result rows are desired, but require an explicit SELECT
    DISTINCT (or equivalent GROUP BY) to obtain.
    """

698 

699 

700class DirectQuery(Query): 

701 """A `Query` implementation that represents a direct SELECT query that 

702 usually joins many tables. 

703 

704 `DirectQuery` objects should generally only be constructed by 

705 `QueryBuilder` or the methods of other `Query` objects. 

706 

707 Parameters 

708 ---------- 

709 simpleQuery : `SimpleQuery` 

710 Struct representing the actual SELECT, FROM, and WHERE clauses. 

711 columns : `QueryColumns` 

712 Columns that are referenced in the query in any clause. 

713 uniqueness : `DirectQueryUniqueness` 

714 Enum value indicating whether the query should yield unique result 

715 rows, and if so whether that needs to be explicitly requested of the 

716 database. 

717 graph : `DimensionGraph` 

718 Object describing the dimensions included in the query. 

719 whereRegion : `lsst.sphgeom.Region`, optional 

720 Region that all region columns in all returned rows must overlap. 

721 backend : `QueryBackend` 

722 Backend object that represents the `Registry` implementation. 

723 doomed_by : `Iterable` [ `str` ], optional 

724 A list of messages (appropriate for e.g. logging or exceptions) that 

725 explain why the query is known to return no results even before it is 

726 executed. Queries with a non-empty list will never be executed. 

727 """ 

728 

729 def __init__( 

730 self, 

731 *, 

732 simpleQuery: SimpleQuery, 

733 columns: QueryColumns, 

734 uniqueness: DirectQueryUniqueness, 

735 graph: DimensionGraph, 

736 whereRegion: Optional[Region], 

737 backend: QueryBackend, 

738 order_by_columns: Iterable[OrderByColumn] = (), 

739 limit: Optional[Tuple[int, Optional[int]]] = None, 

740 doomed_by: Iterable[str] = (), 

741 ): 

742 super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by) 

743 assert not simpleQuery.columns, "Columns should always be set on a copy in .sql" 

744 assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns." 

745 self._simpleQuery = simpleQuery 

746 self._columns = columns 

747 self._uniqueness = uniqueness 

748 self._order_by_columns = order_by_columns 

749 self._limit = limit 

750 self._datasetQueryColumns: Optional[DatasetQueryColumns] = None 

751 self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

752 self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

753 

754 def isUnique(self) -> bool: 

755 # Docstring inherited from Query. 

756 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE 

757 

758 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

759 # Docstring inherited from Query. 

760 column = self._dimensionColumns.get(name) 

761 if column is None: 

762 column = self._columns.getKeyColumn(name).label(name) 

763 self._dimensionColumns[name] = column 

764 return column 

765 

766 @property 

767 def spatial(self) -> Iterator[DimensionElement]: 

768 # Docstring inherited from Query. 

769 return iter(self._columns.regions) 

770 

771 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

772 # Docstring inherited from Query. 

773 column = self._regionColumns.get(name) 

774 if column is None: 

775 column = self._columns.regions[name].label(f"{name}_region") 

776 self._regionColumns[name] = column 

777 return column 

778 

779 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

780 # Docstring inherited from Query. 

781 if self._datasetQueryColumns is None: 

782 base = self._columns.datasets 

783 if base is None: 

784 return None 

785 ingestDate = base.ingestDate 

786 if ingestDate is not None: 

787 ingestDate = ingestDate.label("ingest_date") 

788 self._datasetQueryColumns = DatasetQueryColumns( 

789 datasetType=base.datasetType, 

790 id=base.id.label("dataset_id"), 

791 runKey=base.runKey.label(self.backend.managers.collections.getRunForeignKeyName()), 

792 ingestDate=ingestDate, 

793 ) 

794 return self._datasetQueryColumns 

795 

796 @property 

797 def sql(self) -> sqlalchemy.sql.FromClause: 

798 # Docstring inherited from Query. 

799 simpleQuery = self._simpleQuery.copy() 

800 for dimension in self.graph: 

801 simpleQuery.columns.append(self.getDimensionColumn(dimension.name)) 

802 for element in self.spatial: 

803 simpleQuery.columns.append(self.getRegionColumn(element.name)) 

804 datasetColumns = self.getDatasetColumns() 

805 if datasetColumns is not None: 

806 simpleQuery.columns.extend(datasetColumns) 

807 

808 assert not simpleQuery.order_by, "Input query cannot have ORDER BY" 

809 if self._order_by_columns: 

810 # add ORDER BY column 

811 order_by_columns = [column.column_order for column in self._order_by_columns] 

812 order_by_column = sqlalchemy.func.row_number().over(order_by=order_by_columns).label("_orderby") 

813 simpleQuery.columns.append(order_by_column) 

814 simpleQuery.order_by = [order_by_column] 

815 

816 assert simpleQuery.limit is None, "Input query cannot have LIMIT" 

817 simpleQuery.limit = self._limit 

818 

819 sql = simpleQuery.combine() 

820 

821 if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT: 

822 return sql.distinct() 

823 else: 

824 return sql 

825 

826 def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec: 

827 """Helper method for subclass implementations of `materialize`. 

828 

829 Parameters 

830 ---------- 

831 constraints : `bool`, optional 

832 If `True` (`False` is default), define a specification that 

833 includes actual foreign key constraints for logical foreign keys. 

834 Some database engines do not permit temporary tables to reference 

835 normal tables, so this should be `False` when generating a spec 

836 for a temporary table unless the database engine is known to 

837 support them. 

838 

839 Returns 

840 ------- 

841 spec : `ddl.TableSpec` 

842 Specification for a table that could hold this query's result rows. 

843 """ 

844 unique = self.isUnique() 

845 spec = ddl.TableSpec(fields=()) 

846 for dimension in self.graph: 

847 addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints) 

848 for element in self.spatial: 

849 spec.fields.add(ddl.FieldSpec.for_region(f"{element.name}_region")) 

850 datasetColumns = self.getDatasetColumns() 

851 if datasetColumns is not None: 

852 self.backend.managers.datasets.addDatasetForeignKey( 

853 spec, primaryKey=unique, constraint=constraints 

854 ) 

855 self.backend.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints) 

856 

857 # Need a column for ORDER BY if ordering is requested 

858 if self._order_by_columns: 

859 spec.fields.add( 

860 ddl.FieldSpec( 

861 name="_orderby", 

862 dtype=sqlalchemy.BigInteger, 

863 nullable=False, 

864 doc="Column to use with ORDER BY", 

865 ) 

866 ) 

867 

868 return spec 

869 

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Constraints are deliberately left off the spec: some engines do not
        # allow temporary tables to reference regular tables (see
        # _makeTableSpec).
        spec = self._makeTableSpec()
        with db.temporary_table(spec) as table:
            if not self._doomed_by:
                # Only run the (potentially expensive) INSERT ... SELECT when
                # the query is not already known to return no rows.
                db.insert(table, select=self.sql, names=spec.fields.names)
            # The temporary table only lives for the duration of this context;
            # the yielded query wraps it with this query's metadata.
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                backend=self.backend,
                doomed_by=self._doomed_by,
            )

887 

888 def subset( 

889 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

890 ) -> Query: 

891 # Docstring inherited from Query. 

892 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

893 if columns is None: 

894 return self 

895 if columns.isEmpty(): 

896 return EmptyQuery(self.graph.universe, self.backend) 

897 return DirectQuery( 

898 simpleQuery=self._simpleQuery.copy(), 

899 columns=columns, 

900 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

901 graph=graph, 

902 whereRegion=self.whereRegion if not unique else None, 

903 backend=self.backend, 

904 doomed_by=self._doomed_by, 

905 ) 

906 

907 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

908 # Docstring inherited from Query. 

909 from ._builder import QueryBuilder 

910 

911 if summary is None: 

912 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

913 if not summary.requested.issubset(self.graph): 

914 raise NotImplementedError( 

915 f"Query.makeBuilder does not yet support augmenting dimensions " 

916 f"({summary.requested.dimensions}) beyond those originally included in the query " 

917 f"({self.graph.dimensions})." 

918 ) 

919 builder = QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by) 

920 builder.joinTable( 

921 self.sql.alias(), dimensions=self.graph.dimensions, datasets=self.getDatasetColumns() 

922 ) 

923 return builder 

924 

925 

class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by)
        self._table = table
        # Snapshot into a tuple so `spatial` can be iterated repeatedly even
        # if the caller passed a one-shot iterable.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Dimension key columns are stored under their own names in the table.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Region columns follow the ``{element}_region`` naming convention
        # used when the table spec was built (see _makeTableSpec).
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is not None:
            return DatasetQueryColumns(
                datasetType=self._datasetType,
                id=self._table.columns["dataset_id"],
                runKey=self._table.columns[self.backend.managers.collections.getRunForeignKeyName()],
                # ingest_date is not carried over into the temporary table.
                ingestDate=None,
            )
        else:
            return None

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        if "_orderby" in self._table.columns:
            # The original query's ordering was captured in the ``_orderby``
            # column when the table was filled; reapply it here.
            select = select.order_by(self._table.columns["_orderby"])
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table; materializing again is a no-op.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # Nothing to drop; the subset is this query itself.
            return self
        if columns.isEmpty():
            # All columns dropped: degenerate to the no-column query.
            return EmptyQuery(self.graph.universe, self.backend)
        # Build a new direct query that selects the subset of columns from
        # this query's temporary table.
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=self.whereRegion if not unique else None,
            backend=self.backend,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here (not at module scope) to avoid a circular import with
        # ._builder.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)
        # Unlike DirectQuery, the temporary table can be joined in directly
        # rather than via a subquery alias.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder

1055 

1056 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, backend=backend, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        # With no columns there is at most one conceptual row, so the result
        # is trivially unique.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A single `None` row stands in for the one (empty) data ID this
        # query conceptually yields, unless the query is known to be doomed.
        if not self._doomed_by:
            yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Nothing to write to a table; this query is its own materialization.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any subset of the empty dimension set is still empty.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here (not at module scope) to avoid a circular import with
        # ._builder.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        return QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)