Coverage for python/lsst/daf/butler/registry/queries/_query.py: 22%

365 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-24 23:50 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25import dataclasses 

26import enum 

27import itertools 

28from abc import ABC, abstractmethod 

29from contextlib import contextmanager 

30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple 

31 

32import sqlalchemy 

33from lsst.sphgeom import Region 

34 

35from ...core import ( 

36 DataCoordinate, 

37 DatasetRef, 

38 DatasetType, 

39 Dimension, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 SimpleQuery, 

45 SpatialRegionDatabaseRepresentation, 

46 addDimensionForeignKey, 

47 ddl, 

48) 

49from ..interfaces import Database 

50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

51 

52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true

53 from ._builder import QueryBuilder 

54 

55 

@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Description of one column participating in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` to sort ascending, `False` to sort descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Sort expression for this column, ready for use in an ORDER BY
        clause (`sqlalchemy.sql.ColumnElement`).
        """
        if self.ordering:
            return self.column.asc()
        return self.column.desc()

72 

73 

74class Query(ABC): 

75 """An abstract base class for queries that return some combination of 

76 `DatasetRef` and `DataCoordinate` objects. 

77 

78 Parameters 

79 ---------- 

80 graph : `DimensionGraph` 

81 Object describing the dimensions included in the query. 

82 whereRegion : `lsst.sphgeom.Region`, optional 

83 Region that all region columns in all returned rows must overlap. 

84 managers : `RegistryManagers` 

85 A struct containing the registry manager instances used by the query 

86 system. 

87 doomed_by : `Iterable` [ `str` ], optional 

88 A list of messages (appropriate for e.g. logging or exceptions) that 

89 explain why the query is known to return no results even before it is 

90 executed. Queries with a non-empty list will never be executed. 

91 

92 Notes 

93 ----- 

94 The `Query` hierarchy abstracts over the database/SQL representation of a 

95 particular set of data IDs or datasets. It is expected to be used as a 

96 backend for other objects that provide more natural interfaces for one or 

97 both of these, not as part of a public interface to query results. 

98 """ 

99 

100 def __init__( 

101 self, 

102 *, 

103 graph: DimensionGraph, 

104 whereRegion: Optional[Region], 

105 managers: RegistryManagers, 

106 doomed_by: Iterable[str] = (), 

107 ): 

108 self.graph = graph 

109 self.whereRegion = whereRegion 

110 self.managers = managers 

111 self._doomed_by = tuple(doomed_by) 

112 self._filtered_by_join: Optional[int] = None 

113 self._filtered_by_where: Optional[int] = None 

114 

115 @abstractmethod 

116 def isUnique(self) -> bool: 

117 """Return `True` if this query's rows are guaranteed to be unique, and 

118 `False` otherwise. 

119 

120 If this query has dataset results (`datasetType` is not `None`), 

121 uniqueness applies to the `DatasetRef` instances returned by 

122 `extractDatasetRef` from the result of `rows`. If it does not have 

123 dataset results, uniqueness applies to the `DataCoordinate` instances 

124 returned by `extractDataId`. 

125 """ 

126 raise NotImplementedError() 

127 

128 @abstractmethod 

129 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

130 """Return the query column that contains the primary key value for 

131 the dimension with the given name. 

132 

133 Parameters 

134 ---------- 

135 name : `str` 

136 Name of the dimension. 

137 

138 Returns 

139 ------- 

140 column : `sqlalchemy.sql.ColumnElement`. 

141 SQLAlchemy object representing a column in the query. 

142 

143 Notes 

144 ----- 

145 This method is intended primarily as a hook for subclasses to implement 

146 and the ABC to call in order to provide higher-level functionality; 

147 code that uses `Query` objects (but does not implement one) should 

148 usually not have to call this method. 

149 """ 

150 raise NotImplementedError() 

151 

152 @property 

153 @abstractmethod 

154 def spatial(self) -> Iterator[DimensionElement]: 

155 """An iterator over the dimension element columns used in post-query 

156 filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]). 

157 

158 Notes 

159 ----- 

160 This property is intended primarily as a hook for subclasses to 

161 implement and the ABC to call in order to provide higher-level 

162 functionality; code that uses `Query` objects (but does not implement 

163 one) should usually not have to access this property. 

164 """ 

165 raise NotImplementedError() 

166 

167 @abstractmethod 

168 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

169 """Return a region column for one of the dimension elements iterated 

170 over by `spatial`. 

171 

172 Parameters 

173 ---------- 

174 name : `str` 

175 Name of the element. 

176 

177 Returns 

178 ------- 

179 column : `sqlalchemy.sql.ColumnElement` 

180 SQLAlchemy representing a result column in the query. 

181 

182 Notes 

183 ----- 

184 This method is intended primarily as a hook for subclasses to implement 

185 and the ABC to call in order to provide higher-level functionality; 

186 code that uses `Query` objects (but does not implement one) should 

187 usually not have to call this method. 

188 """ 

189 raise NotImplementedError() 

190 

191 @property 

192 def datasetType(self) -> Optional[DatasetType]: 

193 """The `DatasetType` of datasets returned by this query, or `None` 

194 if there are no dataset results (`DatasetType` or `None`). 

195 """ 

196 cols = self.getDatasetColumns() 

197 if cols is None: 

198 return None 

199 return cols.datasetType 

200 

201 def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int: 

202 """Count the number of rows this query would return. 

203 

204 Parameters 

205 ---------- 

206 db : `Database` 

207 Object managing the database connection. 

208 region : `sphgeom.Region`, optional 

209 A region that any result-row regions must overlap in order to be 

210 yielded. If not provided, this will be ``self.whereRegion``, if 

211 that exists. 

212 exact : `bool`, optional 

213 If `True`, run the full query and perform post-query filtering if 

214 needed to account for that filtering in the count. If `False`, the 

215 result may be an upper bound. 

216 

217 Returns 

218 ------- 

219 count : `int` 

220 The number of rows the query would return, or an upper bound if 

221 ``exact=False``. 

222 

223 Notes 

224 ----- 

225 This counts the number of rows returned, not the number of unique rows 

226 returned, so even with ``exact=True`` it may provide only an upper 

227 bound on the number of *deduplicated* result rows. 

228 """ 

229 if self._doomed_by: 

230 return 0 

231 sql = self.sql 

232 if sql is None: 

233 return 1 

234 if exact and self.spatial: 

235 filtered_count = 0 

236 for _ in self.rows(db, region=region): 

237 filtered_count += 1 

238 return filtered_count 

239 else: 

240 with db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)) as sql_result: 

241 return sql_result.scalar() 

242 

243 def any( 

244 self, 

245 db: Database, 

246 *, 

247 region: Optional[Region] = None, 

248 execute: bool = True, 

249 exact: bool = True, 

250 ) -> bool: 

251 """Test whether this query returns any results. 

252 

253 Parameters 

254 ---------- 

255 db : `Database` 

256 Object managing the database connection. 

257 region : `sphgeom.Region`, optional 

258 A region that any result-row regions must overlap in order to be 

259 yielded. If not provided, this will be ``self.whereRegion``, if 

260 that exists. 

261 execute : `bool`, optional 

262 If `True`, execute at least a ``LIMIT 1`` query if it cannot be 

263 determined prior to execution that the query would return no rows. 

264 exact : `bool`, optional 

265 If `True`, run the full query and perform post-query filtering if 

266 needed, until at least one result row is found. If `False`, the 

267 returned result does not account for post-query filtering, and 

268 hence may be `True` even when all result rows would be filtered 

269 out. 

270 

271 Returns 

272 ------- 

273 any : `bool` 

274 `True` if the query would (or might, depending on arguments) yield 

275 result rows. `False` if it definitely would not. 

276 """ 

277 if self._doomed_by: 

278 return False 

279 sql = self.sql 

280 if sql is None: 

281 return True 

282 if exact and not execute: 

283 raise TypeError("Cannot obtain exact results without executing the query.") 

284 if exact and self.spatial: 

285 for _ in self.rows(db, region=region): 

286 return True 

287 return False 

288 elif execute: 

289 with db.query(sql.limit(1)) as sql_result: 

290 return sql_result.one_or_none() is not None 

291 else: 

292 return True 

293 

294 def explain_no_results( 

295 self, 

296 db: Database, 

297 *, 

298 region: Optional[Region] = None, 

299 followup: bool = True, 

300 ) -> Iterator[str]: 

301 """Return human-readable messages that may help explain why the query 

302 yields no results. 

303 

304 Parameters 

305 ---------- 

306 db : `Database` 

307 Object managing the database connection. 

308 region : `sphgeom.Region`, optional 

309 A region that any result-row regions must overlap in order to be 

310 yielded. If not provided, this will be ``self.whereRegion``, if 

311 that exists. 

312 followup : `bool`, optional 

313 If `True` (default) perform inexpensive follow-up queries if no 

314 diagnostics are available from query generation alone. 

315 

316 Returns 

317 ------- 

318 messages : `Iterator` [ `str` ] 

319 String messages that describe reasons the query might not yield any 

320 results. 

321 

322 Notes 

323 ----- 

324 Messages related to post-query filtering are only available if `rows`, 

325 `any`, or `count` was already called with the same region (with 

326 ``exact=True`` for the latter two). 

327 """ 

328 from ._builder import QueryBuilder 

329 

330 if self._doomed_by: 

331 yield from self._doomed_by 

332 return 

333 if self._filtered_by_where: 

334 yield ( 

335 f"{self._filtered_by_where} result rows were filtered out because " 

336 "one or more region did not overlap the WHERE-clause region." 

337 ) 

338 if self._filtered_by_join: 

339 yield ( 

340 f"{self._filtered_by_join} result rows were filtered out because " 

341 "one or more regions did not overlap." 

342 ) 

343 if (not followup) or self._filtered_by_join or self._filtered_by_where: 

344 return 

345 # Query didn't return results even before client-side filtering, and 

346 # caller says we can do follow-up queries to determine why. 

347 # Start by seeing if there are _any_ dimension records for each element 

348 # involved. 

349 for element in self.graph.elements: 

350 summary = QuerySummary(element.graph) 

351 builder = QueryBuilder(summary, self.managers) 

352 followup_query = builder.finish() 

353 if not followup_query.any(db, exact=False): 

354 yield f"No dimension records for element '{element.name}' found." 

355 yield from followup_query.explain_no_results(db, region=region, followup=False) 

356 return 

357 

358 @abstractmethod 

359 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

360 """Return the columns for the datasets returned by this query. 

361 

362 Returns 

363 ------- 

364 columns : `DatasetQueryColumns` or `None` 

365 Struct containing SQLAlchemy representations of the result columns 

366 for a dataset. 

367 

368 Notes 

369 ----- 

370 This method is intended primarily as a hook for subclasses to implement 

371 and the ABC to call in order to provide higher-level functionality; 

372 code that uses `Query` objects (but does not implement one) should 

373 usually not have to call this method. 

374 """ 

375 raise NotImplementedError() 

376 

377 @property 

378 @abstractmethod 

379 def sql(self) -> Optional[sqlalchemy.sql.FromClause]: 

380 """A SQLAlchemy object representing the full query 

381 (`sqlalchemy.sql.FromClause` or `None`). 

382 

383 This is `None` in the special case where the query has no columns, and 

384 only one logical row. 

385 """ 

386 raise NotImplementedError() 

387 

388 def rows( 

389 self, db: Database, *, region: Optional[Region] = None 

390 ) -> Iterator[Optional[sqlalchemy.engine.Row]]: 

391 """Execute the query and yield result rows, applying `predicate`. 

392 

393 Parameters 

394 ---------- 

395 db : `Database` 

396 Object managing the database connection. 

397 region : `sphgeom.Region`, optional 

398 A region that any result-row regions must overlap in order to be 

399 yielded. If not provided, this will be ``self.whereRegion``, if 

400 that exists. 

401 

402 Yields 

403 ------ 

404 row : `sqlalchemy.engine.RowProxy` or `None` 

405 Result row from the query. `None` may yielded exactly once instead 

406 of any real rows to indicate an empty query (see `EmptyQuery`). 

407 """ 

408 if self._doomed_by: 

409 return 

410 whereRegion = region if region is not None else self.whereRegion 

411 self._filtered_by_where = 0 

412 self._filtered_by_join = 0 

413 with db.query(self.sql) as sql_result: 

414 sql_rows = sql_result.fetchall() 

415 for row in sql_rows: 

416 rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial] 

417 if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions): 

418 self._filtered_by_where += 1 

419 continue 

420 if not not any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)): 

421 self._filtered_by_join += 1 

422 continue 

423 yield row 

424 

425 def extractDimensionsTuple( 

426 self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension] 

427 ) -> tuple: 

428 """Extract a tuple of data ID values from a result row. 

429 

430 Parameters 

431 ---------- 

432 row : `sqlalchemy.engine.RowProxy` or `None` 

433 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

434 the row from an `EmptyQuery`. 

435 dimensions : `Iterable` [ `Dimension` ] 

436 The dimensions to include in the returned tuple, in order. 

437 

438 Returns 

439 ------- 

440 values : `tuple` 

441 A tuple of dimension primary key values. 

442 """ 

443 if row is None: 

444 assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions." 

445 return () 

446 return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions) 

447 

448 def extractDataId( 

449 self, 

450 row: Optional[sqlalchemy.engine.RowProxy], 

451 *, 

452 graph: Optional[DimensionGraph] = None, 

453 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

454 ) -> DataCoordinate: 

455 """Extract a data ID from a result row. 

456 

457 Parameters 

458 ---------- 

459 row : `sqlalchemy.engine.RowProxy` or `None` 

460 A result row from a SQLAlchemy SELECT query, or `None` to indicate 

461 the row from an `EmptyQuery`. 

462 graph : `DimensionGraph`, optional 

463 The dimensions the returned data ID should identify. If not 

464 provided, this will be all dimensions in `QuerySummary.requested`. 

465 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

466 Nested mapping containing records to attach to the returned 

467 `DataCoordinate`, for which `~DataCoordinate.hasRecords` will 

468 return `True`. If provided, outer keys must include all dimension 

469 element names in ``graph``, and inner keys should be tuples of 

470 dimension primary key values in the same order as 

471 ``element.graph.required``. If not provided, 

472 `DataCoordinate.hasRecords` will return `False` on the returned 

473 object. 

474 

475 Returns 

476 ------- 

477 dataId : `DataCoordinate` 

478 A data ID that identifies all required and implied dimensions. If 

479 ``records is not None``, this is have 

480 `~DataCoordinate.hasRecords()` return `True`. 

481 """ 

482 if graph is None: 

483 graph = self.graph 

484 if not graph: 

485 return DataCoordinate.makeEmpty(self.graph.universe) 

486 dataId = DataCoordinate.fromFullValues( 

487 graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied)) 

488 ) 

489 if records is not None: 

490 recordsForRow = {} 

491 for element in graph.elements: 

492 key = tuple(dataId.subset(element.graph).values()) 

493 recordsForRow[element.name] = records[element.name].get(key) 

494 return dataId.expanded(recordsForRow) 

495 else: 

496 return dataId 

497 

498 def extractDatasetRef( 

499 self, 

500 row: sqlalchemy.engine.RowProxy, 

501 dataId: Optional[DataCoordinate] = None, 

502 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None, 

503 ) -> DatasetRef: 

504 """Extract a `DatasetRef` from a result row. 

505 

506 Parameters 

507 ---------- 

508 row : `sqlalchemy.engine.RowProxy` 

509 A result row from a SQLAlchemy SELECT query. 

510 dataId : `DataCoordinate` 

511 Data ID to attach to the `DatasetRef`. A minimal (i.e. base class) 

512 `DataCoordinate` is constructed from ``row`` if `None`. 

513 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ] 

514 Records to use to return an `ExpandedDataCoordinate`. If provided, 

515 outer keys must include all dimension element names in ``graph``, 

516 and inner keys should be tuples of dimension primary key values 

517 in the same order as ``element.graph.required``. 

518 

519 Returns 

520 ------- 

521 ref : `DatasetRef` 

522 Reference to the dataset; guaranteed to have `DatasetRef.id` not 

523 `None`. 

524 """ 

525 datasetColumns = self.getDatasetColumns() 

526 assert datasetColumns is not None 

527 if dataId is None: 

528 dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records) 

529 runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]] 

530 return DatasetRef( 

531 datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name 

532 ) 

533 

534 def _makeSubsetQueryColumns( 

535 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

536 ) -> Tuple[DimensionGraph, Optional[QueryColumns]]: 

537 """Helper method for subclass implementations of `subset`. 

538 

539 Parameters 

540 ---------- 

541 graph : `DimensionGraph`, optional 

542 Dimensions to include in the new `Query` being constructed. 

543 ``subset`` implementations should generally just forward their 

544 own ``graph`` argument here. 

545 datasets : `bool`, optional 

546 Whether the new `Query` should include dataset results. Defaults 

547 to `True`, but is ignored if ``self`` does not include dataset 

548 results. 

549 unique : `bool`, optional 

550 Whether the new `Query` should guarantee unique results (this may 

551 come with a performance penalty). 

552 

553 Returns 

554 ------- 

555 graph : `DimensionGraph` 

556 The dimensions of the new `Query`. This is exactly the same as 

557 the argument of the same name, with ``self.graph`` used if that 

558 argument is `None`. 

559 columns : `QueryColumns` or `None` 

560 A struct containing the SQLAlchemy column objects to use in the 

561 new query, constructed by delegating to other (mostly abstract) 

562 methods on ``self``. If `None`, `subset` may return ``self``. 

563 """ 

564 if graph is None: 

565 graph = self.graph 

566 if ( 

567 graph == self.graph 

568 and (self.getDatasetColumns() is None or datasets) 

569 and (self.isUnique() or not unique) 

570 ): 

571 return graph, None 

572 columns = QueryColumns() 

573 for dimension in graph.dimensions: 

574 col = self.getDimensionColumn(dimension.name) 

575 columns.keys[dimension] = [col] 

576 if not unique: 

577 for element in self.spatial: 

578 col = self.getRegionColumn(element.name) 

579 columns.regions[element] = col 

580 if datasets and self.getDatasetColumns() is not None: 

581 columns.datasets = self.getDatasetColumns() 

582 return graph, columns 

583 

584 @abstractmethod 

585 def materialize(self, db: Database) -> ContextManager[Query]: 

586 """Execute this query and insert its results into a temporary table. 

587 

588 Parameters 

589 ---------- 

590 db : `Database` 

591 Database engine to execute the query against. 

592 

593 Returns 

594 ------- 

595 context : `typing.ContextManager` [ `MaterializedQuery` ] 

596 A context manager that ensures the temporary table is created and 

597 populated in ``__enter__`` (returning a `MaterializedQuery` object 

598 backed by that table), and dropped in ``__exit__``. If ``self`` 

599 is already a `MaterializedQuery`, ``__enter__`` may just return 

600 ``self`` and ``__exit__`` may do nothing (reflecting the fact that 

601 an outer context manager should already take care of everything 

602 else). 

603 """ 

604 raise NotImplementedError() 

605 

606 @abstractmethod 

607 def subset( 

608 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False 

609 ) -> Query: 

610 """Return a new `Query` whose columns and/or rows are (mostly) subset 

611 of this one's. 

612 

613 Parameters 

614 ---------- 

615 graph : `DimensionGraph`, optional 

616 Dimensions to include in the new `Query` being constructed. 

617 If `None` (default), ``self.graph`` is used. 

618 datasets : `bool`, optional 

619 Whether the new `Query` should include dataset results. Defaults 

620 to `True`, but is ignored if ``self`` does not include dataset 

621 results. 

622 unique : `bool`, optional 

623 Whether the new `Query` should guarantee unique results (this may 

624 come with a performance penalty). 

625 

626 Returns 

627 ------- 

628 query : `Query` 

629 A query object corresponding to the given inputs. May be ``self`` 

630 if no changes were requested. 

631 

632 Notes 

633 ----- 

634 The way spatial overlaps are handled at present makes it impossible to 

635 fully guarantee in general that the new query's rows are a subset of 

636 this one's while also returning unique rows. That's because the 

637 database is only capable of performing approximate, conservative 

638 overlaps via the common skypix system; we defer actual region overlap 

639 operations to per-result-row Python logic. But including the region 

640 columns necessary to do that postprocessing in the query makes it 

641 impossible to do a SELECT DISTINCT on the user-visible dimensions of 

642 the query. For example, consider starting with a query with dimensions 

643 (instrument, skymap, visit, tract). That involves a spatial join 

644 between visit and tract, and we include the region columns from both 

645 tables in the results in order to only actually yield result rows 

646 (see `predicate` and `rows`) where the regions in those two columns 

647 overlap. If the user then wants to subset to just (skymap, tract) with 

648 unique results, we have two unpalatable options: 

649 

650 - we can do a SELECT DISTINCT with just the skymap and tract columns 

651 in the SELECT clause, dropping all detailed overlap information and 

652 including some tracts that did not actually overlap any of the 

653 visits in the original query (but were regarded as _possibly_ 

654 overlapping via the coarser, common-skypix relationships); 

655 

656 - we can include the tract and visit region columns in the query, and 

657 continue to filter out the non-overlapping pairs, but completely 

658 disregard the user's request for unique tracts. 

659 

660 This interface specifies that implementations must do the former, as 

661 that's what makes things efficient in our most important use case 

662 (``QuantumGraph`` generation in ``pipe_base``). We may be able to 

663 improve this situation in the future by putting exact overlap 

664 information in the database, either by using built-in (but 

665 engine-specific) spatial database functionality or (more likely) 

666 switching to a scheme in which pairwise dimension spatial relationships 

667 are explicitly precomputed (for e.g. combinations of instruments and 

668 skymaps). 

669 """ 

670 raise NotImplementedError() 

671 

672 @abstractmethod 

673 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

674 """Return a `QueryBuilder` that can be used to construct a new `Query` 

675 that is joined to (and hence constrained by) this one. 

676 

677 Parameters 

678 ---------- 

679 summary : `QuerySummary`, optional 

680 A `QuerySummary` instance that specifies the dimensions and any 

681 additional constraints to include in the new query being 

682 constructed, or `None` to use the dimensions of ``self`` with no 

683 additional constraints. 

684 """ 

685 raise NotImplementedError() 

686 

687 graph: DimensionGraph 

688 """The dimensions identified by this query and included in any data IDs 

689 created from its result rows (`DimensionGraph`). 

690 """ 

691 

692 whereRegion: Optional[Region] 

693 """A spatial region that all regions in all rows returned by this query 

694 must overlap (`lsst.sphgeom.Region` or `None`). 

695 """ 

696 

697 managers: RegistryManagers 

698 """A struct containing `Registry` helper object (`RegistryManagers`). 

699 """ 

700 

701 

class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways a query can (or cannot) have unique rows."""

    NOT_UNIQUE = enum.auto()
    """Unique rows are not expected from the query."""

    NATURALLY_UNIQUE = enum.auto()
    """The way the query is constructed guarantees unique result rows, with
    no need for SELECT DISTINCT or a GROUP BY clause.
    """

    NEEDS_DISTINCT = enum.auto()
    """Unique result rows are expected, and SELECT DISTINCT or an equivalent
    GROUP BY clause is required to obtain them.
    """

720 

721 

722class DirectQuery(Query): 

723 """A `Query` implementation that represents a direct SELECT query that 

724 usually joins many tables. 

725 

726 `DirectQuery` objects should generally only be constructed by 

727 `QueryBuilder` or the methods of other `Query` objects. 

728 

729 Parameters 

730 ---------- 

731 simpleQuery : `SimpleQuery` 

732 Struct representing the actual SELECT, FROM, and WHERE clauses. 

733 columns : `QueryColumns` 

734 Columns that are referenced in the query in any clause. 

735 uniqueness : `DirectQueryUniqueness` 

736 Enum value indicating whether the query should yield unique result 

737 rows, and if so whether that needs to be explicitly requested of the 

738 database. 

739 graph : `DimensionGraph` 

740 Object describing the dimensions included in the query. 

741 whereRegion : `lsst.sphgeom.Region`, optional 

742 Region that all region columns in all returned rows must overlap. 

743 managers : `RegistryManagers` 

744 Struct containing the `Registry` manager helper objects, to be 

745 forwarded to the `Query` constructor. 

746 doomed_by : `Iterable` [ `str` ], optional 

747 A list of messages (appropriate for e.g. logging or exceptions) that 

748 explain why the query is known to return no results even before it is 

749 executed. Queries with a non-empty list will never be executed. 

750 """ 

751 

752 def __init__( 

753 self, 

754 *, 

755 simpleQuery: SimpleQuery, 

756 columns: QueryColumns, 

757 uniqueness: DirectQueryUniqueness, 

758 graph: DimensionGraph, 

759 whereRegion: Optional[Region], 

760 managers: RegistryManagers, 

761 order_by_columns: Iterable[OrderByColumn] = (), 

762 limit: Optional[Tuple[int, Optional[int]]] = None, 

763 doomed_by: Iterable[str] = (), 

764 ): 

765 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by) 

766 assert not simpleQuery.columns, "Columns should always be set on a copy in .sql" 

767 assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns." 

768 self._simpleQuery = simpleQuery 

769 self._columns = columns 

770 self._uniqueness = uniqueness 

771 self._order_by_columns = order_by_columns 

772 self._limit = limit 

773 self._datasetQueryColumns: Optional[DatasetQueryColumns] = None 

774 self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

775 self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {} 

776 

777 def isUnique(self) -> bool: 

778 # Docstring inherited from Query. 

779 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE 

780 

781 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

782 # Docstring inherited from Query. 

783 column = self._dimensionColumns.get(name) 

784 if column is None: 

785 column = self._columns.getKeyColumn(name).label(name) 

786 self._dimensionColumns[name] = column 

787 return column 

788 

789 @property 

790 def spatial(self) -> Iterator[DimensionElement]: 

791 # Docstring inherited from Query. 

792 return iter(self._columns.regions) 

793 

794 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement: 

795 # Docstring inherited from Query. 

796 column = self._regionColumns.get(name) 

797 if column is None: 

798 column = self._columns.regions[name].column.label(f"{name}_region") 

799 self._regionColumns[name] = column 

800 return column 

801 

802 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]: 

803 # Docstring inherited from Query. 

804 if self._datasetQueryColumns is None: 

805 base = self._columns.datasets 

806 if base is None: 

807 return None 

808 ingestDate = base.ingestDate 

809 if ingestDate is not None: 

810 ingestDate = ingestDate.label("ingest_date") 

811 self._datasetQueryColumns = DatasetQueryColumns( 

812 datasetType=base.datasetType, 

813 id=base.id.label("dataset_id"), 

814 runKey=base.runKey.label(self.managers.collections.getRunForeignKeyName()), 

815 ingestDate=ingestDate, 

816 ) 

817 return self._datasetQueryColumns 

818 

@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Work on a copy so the stored SimpleQuery is left untouched.
    query = self._simpleQuery.copy()

    # SELECT columns are added in a fixed order: dimension keys, then
    # regions, then dataset columns (when present).
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        query.columns.extend(datasetColumns)

    assert not query.order_by, "Input query cannot have ORDER BY"
    if self._order_by_columns:
        # Ordering is expressed through an extra "_orderby" column holding
        # row_number() over the requested ordering; the query is then
        # sorted on that column.
        ordering = [term.column_order for term in self._order_by_columns]
        rowNumber = sqlalchemy.func.row_number().over(order_by=ordering).label("_orderby")
        query.columns.append(rowNumber)
        query.order_by = [rowNumber]

    assert query.limit is None, "Input query cannot have LIMIT"
    query.limit = self._limit

    combined = query.combine()

    needsDistinct = self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT
    return combined.distinct() if needsDistinct else combined

848 

def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to store this query's rows.

    This is a helper for implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        Whether logical foreign keys should be declared as actual foreign
        key constraints.  The default is `False`; some database engines do
        not permit temporary tables to reference normal tables, so pass
        `True` for a temporary table only when the engine is known to
        support that.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    rowsAreUnique = self.isUnique()
    tableSpec = ddl.TableSpec(fields=())

    # One key field per dimension; they are flagged as primary key only
    # when the rows are unique.
    for dimension in self.graph:
        addDimensionForeignKey(tableSpec, dimension, primaryKey=rowsAreUnique, constraint=constraints)

    # Nullable region fields, named "<element>_region".
    for element in self.spatial:
        regionFields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{element.name}_region",
        )
        tableSpec.fields.update(regionFields)

    # Dataset and run keys are only needed when the query has dataset
    # columns.
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(
            tableSpec, primaryKey=rowsAreUnique, constraint=constraints
        )
        self.managers.collections.addRunForeignKey(tableSpec, nullable=False, constraint=constraints)

    # Need a column for ORDER BY if ordering is requested; its name matches
    # the "_orderby" label produced by the `sql` property.
    if self._order_by_columns:
        orderByField = ddl.FieldSpec(
            name="_orderby",
            dtype=sqlalchemy.BigInteger,
            nullable=False,
            doc="Column to use with ORDER BY",
        )
        tableSpec.fields.add(orderByField)

    return tableSpec

895 

@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    tableSpec = self._makeTableSpec()
    with db.temporary_table(tableSpec) as tempTable:
        # The SELECT is skipped entirely for a doomed query, so the
        # temporary table is created but nothing is inserted into it.
        if not self._doomed_by:
            db.insert(tempTable, select=self.sql, names=tableSpec.fields.names)
        materialized = MaterializedQuery(
            table=tempTable,
            spatial=self.spatial,
            datasetType=self.datasetType,
            isUnique=self.isUnique(),
            graph=self.graph,
            whereRegion=self.whereRegion,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )
        yield materialized

913 

def subset(
    self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # A `None` result means this query is returned unchanged.
        return self
    if columns.isEmpty():
        # Forward the doomed_by messages: without them, a query known to
        # return no rows would turn into an EmptyQuery that yields one row
        # and loses its diagnostics.
        return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
    return DirectQuery(
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
        graph=graph,
        # whereRegion is not carried over when unique rows are requested.
        whereRegion=self.whereRegion if not unique else None,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )

932 

def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    # Imported locally rather than at module level (presumably to avoid a
    # circular import -- confirm before moving it).
    from ._builder import QueryBuilder

    if summary is None:
        summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
    if summary.requested.issubset(self.graph):
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # This query's SELECT is joined in as an aliased subquery.
        subquery = self.sql.alias()
        builder.joinTable(subquery, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
    raise NotImplementedError(
        "Query.makeBuilder does not yet support augmenting dimensions "
        f"({summary.requested.dimensions}) beyond those originally included in the query "
        f"({self.graph.dimensions})."
    )

950 

951 

class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Snapshot into a tuple: the argument may be a one-shot iterator,
        # while the `spatial` property must be iterable on every access.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        # The ingest date is always reported as absent here.
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        # An "_orderby" column is present only when the materialized query
        # was ordered; sorting on it restores that ordering.
        if "_orderby" in self._table.columns:
            select = select.order_by(self._table.columns["_orderby"])
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table, so there is nothing more to do.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # A `None` result means this query is returned unchanged.
            return self
        if columns.isEmpty():
            # Forward the doomed_by messages: without them, a query known
            # to return no rows would turn into an EmptyQuery that yields
            # one row and loses its diagnostics.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            # whereRegion is not carried over when unique rows are requested.
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported locally rather than at module level (presumably to avoid
        # a circular import -- confirm before moving it).
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table is joined directly, without a subquery.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder

1082 

1083 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the
    query would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The query always spans the universe's empty graph and has no
        # spatial constraint.
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No dimension {name} in query (no dimensions at all, actually)."
        raise KeyError(message)

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No region for {name} in query (no regions at all, actually)."
        raise KeyError(message)

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # Yields a single `None` row, or nothing at all when the query is
        # doomed; neither `db` nor `region` is consulted.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # No table is created; the query itself is handed back.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any requested graph must be a subset of this query's (empty) graph.
        if graph is not None:
            assert graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported locally rather than at module level (presumably to avoid
        # a circular import -- confirm before moving it).
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        if summary.requested.issubset(self.graph):
            # There is nothing to join: the builder starts from the summary
            # alone.
            return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        raise NotImplementedError(
            "Query.makeBuilder does not yet support augmenting dimensions "
            f"({summary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )