Coverage for python/lsst/daf/butler/registry/queries/_query.py: 22%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

318 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25from abc import ABC, abstractmethod 

26from contextlib import contextmanager 

27import enum 

28import itertools 

29from typing import ( 

30 Dict, 

31 Iterable, 

32 Iterator, 

33 Mapping, 

34 Optional, 

35 Tuple, 

36 TYPE_CHECKING, 

37) 

38 

39import sqlalchemy 

40 

41from lsst.sphgeom import Region 

42 

43from ...core import ( 

44 addDimensionForeignKey, 

45 DataCoordinate, 

46 DatasetRef, 

47 DatasetType, 

48 ddl, 

49 Dimension, 

50 DimensionElement, 

51 DimensionGraph, 

52 DimensionRecord, 

53 DimensionUniverse, 

54 SpatialRegionDatabaseRepresentation, 

55 SimpleQuery, 

56) 

57from ..interfaces import Database 

58from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

59 

60if TYPE_CHECKING: 60 ↛ 61line 60 didn't jump to line 61, because the condition on line 60 was never true

61 from ._builder import QueryBuilder 

62 

63 

64class Query(ABC): 

65 """An abstract base class for queries that return some combination of 

66 `DatasetRef` and `DataCoordinate` objects. 

67 

68 Parameters 

69 ---------- 

70 graph : `DimensionGraph` 

71 Object describing the dimensions included in the query. 

72 whereRegion : `lsst.sphgeom.Region`, optional 

73 Region that all region columns in all returned rows must overlap. 

74 managers : `RegistryManagers` 

75 A struct containing the registry manager instances used by the query 

76 system. 

77 doomed_by : `Iterable` [ `str` ], optional 

78 A list of messages (appropriate for e.g. logging or exceptions) that 

79 explain why the query is known to return no results even before it is 

80 executed. Queries with a non-empty list will never be executed. 

81 

82 Notes 

83 ----- 

84 The `Query` hierarchy abstracts over the database/SQL representation of a 

85 particular set of data IDs or datasets. It is expected to be used as a 

86 backend for other objects that provide more natural interfaces for one or 

87 both of these, not as part of a public interface to query results. 

88 """ 

89 def __init__(self, *, 

90 graph: DimensionGraph, 

91 whereRegion: Optional[Region], 

92 managers: RegistryManagers, 

93 doomed_by: Iterable[str] = (), 

94 ): 

95 self.graph = graph 

96 self.whereRegion = whereRegion 

97 self.managers = managers 

98 self._doomed_by = tuple(doomed_by) 

99 self._filtered_by_join: Optional[int] = None 

100 self._filtered_by_where: Optional[int] = None 

101 

@abstractmethod
def isUnique(self) -> bool:
    """Report whether the rows of this query are guaranteed to be unique.

    Returns
    -------
    unique : `bool`
        `True` if uniqueness is guaranteed, `False` otherwise.

    Notes
    -----
    What "unique" refers to depends on the kind of results.  When the
    query has dataset results (`datasetType` is not `None`) it is the
    `DatasetRef` instances that `extractDatasetRef` builds from the rows
    yielded by `rows`.  Otherwise it is the `DataCoordinate` instances
    built by `extractDataId`.
    """
    raise NotImplementedError()

114 

@abstractmethod
def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    """Look up the query column holding a dimension's primary key value.

    Parameters
    ----------
    name : `str`
        Name of the dimension.

    Returns
    -------
    column : `sqlalchemy.sql.ColumnElement`
        SQLAlchemy object representing a column in the query.

    Notes
    -----
    This is mainly a hook: subclasses implement it and the ABC calls it
    to build higher-level functionality.  Code that merely uses `Query`
    objects (rather than implementing one) should rarely need it.
    """
    raise NotImplementedError()

138 

@property
@abstractmethod
def spatial(self) -> Iterator[DimensionElement]:
    """The dimension elements whose region columns take part in the
    post-query filtering of spatial overlaps
    (`Iterator` [ `DimensionElement` ]).

    Notes
    -----
    This is mainly a hook: subclasses implement it and the ABC reads it
    to build higher-level functionality.  Code that merely uses `Query`
    objects (rather than implementing one) should rarely need it.
    """
    raise NotImplementedError()

153 

@abstractmethod
def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    """Look up the region column of one of the dimension elements that
    `spatial` iterates over.

    Parameters
    ----------
    name : `str`
        Name of the element.

    Returns
    -------
    column : `sqlalchemy.sql.ColumnElement`
        SQLAlchemy object representing a result column in the query.

    Notes
    -----
    This is mainly a hook: subclasses implement it and the ABC calls it
    to build higher-level functionality.  Code that merely uses `Query`
    objects (rather than implementing one) should rarely need it.
    """
    raise NotImplementedError()

177 

@property
def datasetType(self) -> Optional[DatasetType]:
    """The type of the datasets this query returns, or `None` when the
    query has no dataset results (`DatasetType` or `None`).
    """
    datasetColumns = self.getDatasetColumns()
    return datasetColumns.datasetType if datasetColumns is not None else None

187 

def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
    """Count the number of rows this query would return.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        counted.  If not provided, this will be ``self.whereRegion``, if
        that exists.
    exact : `bool`, optional
        If `True`, run the full query and perform post-query filtering if
        needed to account for that filtering in the count.  If `False`, the
        result may be an upper bound.

    Returns
    -------
    count : `int`
        The number of rows the query would return, or an upper bound if
        ``exact=False``.

    Notes
    -----
    This counts the number of rows returned, not the number of unique rows
    returned, so even with ``exact=True`` it may provide only an upper
    bound on the number of *deduplicated* result rows.
    """
    if self._doomed_by:
        return 0
    sql = self.sql
    if sql is None:
        # No columns and exactly one logical row.
        return 1
    if exact:
        # ``self.spatial`` is an iterator, and iterator objects are always
        # truthy, so testing it directly would send every exact count
        # through the row-by-row Python path.  Peek at it instead: only
        # queries that really have region columns need post-query
        # filtering; for the rest SELECT COUNT is already exact.
        noElement = object()
        if next(iter(self.spatial), noElement) is not noElement:
            return sum(1 for _ in self.rows(db, region=region))
    return db.query(
        sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)
    ).scalar()

230 

def any(
    self,
    db: Database, *,
    region: Optional[Region] = None,
    execute: bool = True,
    exact: bool = True,
) -> bool:
    """Test whether this query returns any results.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        considered.  If not provided, this will be ``self.whereRegion``,
        if that exists.
    execute : `bool`, optional
        If `True`, execute at least a ``LIMIT 1`` query if it cannot be
        determined prior to execution that the query would return no rows.
    exact : `bool`, optional
        If `True`, run the full query and perform post-query filtering if
        needed, until at least one result row is found.  If `False`, the
        returned result does not account for post-query filtering, and
        hence may be `True` even when all result rows would be filtered
        out.

    Returns
    -------
    any : `bool`
        `True` if the query would (or might, depending on arguments) yield
        result rows.  `False` if it definitely would not.

    Raises
    ------
    TypeError
        Raised if ``exact=True`` is combined with ``execute=False`` for a
        query whose outcome is not already known before execution.
    """
    if self._doomed_by:
        return False
    sql = self.sql
    if sql is None:
        # No columns and exactly one logical row.
        return True
    if exact and not execute:
        raise TypeError("Cannot obtain exact results without executing the query.")
    if exact:
        # ``self.spatial`` is an iterator, and iterator objects are always
        # truthy, so testing it directly would always take the row-by-row
        # Python path.  Peek at it instead: only queries that really have
        # region columns need post-query filtering; for the rest a
        # ``LIMIT 1`` query is already exact.
        noElement = object()
        if next(iter(self.spatial), noElement) is not noElement:
            for _ in self.rows(db, region=region):
                return True
            return False
    if execute:
        return db.query(sql.limit(1)).one_or_none() is not None
    return True

279 

def explain_no_results(
    self,
    db: Database, *,
    region: Optional[Region] = None,
) -> Iterator[str]:
    """Yield human-readable messages that may help explain why the query
    produces no results.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.  Not used by the current
        implementation.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        yielded.  Not used by the current implementation.

    Returns
    -------
    messages : `Iterator` [ `str` ]
        String messages that describe reasons the query might not yield
        any results.

    Notes
    -----
    Messages about post-query filtering are only available once `rows`,
    `any`, or `count` has been called with the same region (with
    ``exact=True`` for the latter two), because that is when the filter
    counters are filled in.

    Only messages produced while the query was built or filtered are
    returned at present.  In the future this method may run follow-up
    queries of its own, which callers can short-circuit by no longer
    iterating over the results.
    """
    # Reasons known before execution come first.
    yield from self._doomed_by
    droppedByWhere = self._filtered_by_where
    if droppedByWhere:
        yield (
            f"{droppedByWhere} result rows were filtered out because "
            "one or more region did not overlap the WHERE-clause region."
        )
    droppedByJoin = self._filtered_by_join
    if droppedByJoin:
        yield (
            f"{droppedByJoin} result rows were filtered out because "
            "one or more regions did not overlap."
        )

325 

@abstractmethod
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    """Look up the columns describing the datasets this query returns.

    Returns
    -------
    columns : `DatasetQueryColumns` or `None`
        Struct containing SQLAlchemy representations of the result columns
        for a dataset.

    Notes
    -----
    This is mainly a hook: subclasses implement it and the ABC calls it
    to build higher-level functionality.  Code that merely uses `Query`
    objects (rather than implementing one) should rarely need it.
    """
    raise NotImplementedError()

344 

@property
@abstractmethod
def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
    """The full query as a SQLAlchemy object
    (`sqlalchemy.sql.FromClause` or `None`).

    `None` is used for the special case of a query with no columns and
    only one logical row.
    """
    raise NotImplementedError()

355 

def rows(self, db: Database, *, region: Optional[Region] = None
         ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
    """Execute the query and yield result rows, dropping rows whose regions
    do not overlap.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        yielded.  If not provided, this will be ``self.whereRegion``, if
        that exists.

    Yields
    ------
    row : `sqlalchemy.engine.RowProxy` or `None`
        Result row from the query.  `None` may be yielded exactly once
        instead of any real rows to indicate an empty query (see
        `EmptyQuery`).

    Notes
    -----
    A row is dropped when any of its regions is disjoint from the
    effective region, or when any two of its regions are disjoint from
    each other.  The numbers of rows dropped for each reason are recorded
    on ``self`` for `explain_no_results`, and are reset every time
    iteration starts.  A query that is doomed yields nothing and is never
    executed.
    """
    if self._doomed_by:
        return
    whereRegion = region if region is not None else self.whereRegion
    self._filtered_by_where = 0
    self._filtered_by_join = 0
    # The region columns are the same for every row, so resolve them once
    # rather than on each iteration.
    regionColumns = [self.getRegionColumn(element.name) for element in self.spatial]
    for row in db.query(self.sql):
        rowRegions = [row._mapping[column] for column in regionColumns]
        if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
            self._filtered_by_where += 1
            continue
        if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
            self._filtered_by_join += 1
            continue
        yield row

389 

def extractDimensionsTuple(self, row: Optional[sqlalchemy.engine.RowProxy],
                           dimensions: Iterable[Dimension]) -> tuple:
    """Pull the primary key values of the given dimensions out of a row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy` or `None`
        A result row from a SQLAlchemy SELECT query, or `None` to indicate
        the row from an `EmptyQuery`.
    dimensions : `Iterable` [ `Dimension` ]
        The dimensions to include in the returned tuple, in order.

    Returns
    -------
    values : `tuple`
        A tuple of dimension primary key values.
    """
    if row is not None:
        return tuple(
            row._mapping[self.getDimensionColumn(dimension.name)]
            for dimension in dimensions
        )
    # The placeholder row of an empty query carries no values at all.
    assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
    return ()

411 

def extractDataId(self, row: Optional[sqlalchemy.engine.RowProxy], *,
                  graph: Optional[DimensionGraph] = None,
                  records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                  ) -> DataCoordinate:
    """Build a data ID from a result row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy` or `None`
        A result row from a SQLAlchemy SELECT query, or `None` to indicate
        the row from an `EmptyQuery`.
    graph : `DimensionGraph`, optional
        The dimensions the returned data ID should identify.  Defaults to
        ``self.graph``.
    records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
        Nested mapping containing records to attach to the returned
        `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
        return `True`.  If provided, outer keys must include all dimension
        element names in ``graph``, and inner keys should be tuples of
        dimension primary key values in the same order as
        ``element.graph.required``.  If not provided,
        `DataCoordinate.hasRecords` will return `False` on the returned
        object.

    Returns
    -------
    dataId : `DataCoordinate`
        A data ID that identifies all required and implied dimensions.  If
        ``records is not None``, this will have
        `~DataCoordinate.hasRecords()` return `True`.
    """
    target = self.graph if graph is None else graph
    if not target:
        return DataCoordinate.makeEmpty(self.graph.universe)
    values = self.extractDimensionsTuple(row, itertools.chain(target.required, target.implied))
    dataId = DataCoordinate.fromFullValues(target, values)
    if records is None:
        return dataId
    # Pick out, for each element, the record matching this row's key values;
    # elements with no matching record are mapped to `None`.
    recordsForRow = {}
    for element in target.elements:
        elementKey = tuple(dataId.subset(element.graph).values())
        recordsForRow[element.name] = records[element.name].get(elementKey)
    return dataId.expanded(recordsForRow)

459 

def extractDatasetRef(self, row: sqlalchemy.engine.RowProxy,
                      dataId: Optional[DataCoordinate] = None,
                      records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                      ) -> DatasetRef:
    """Build a `DatasetRef` from a result row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy`
        A result row from a SQLAlchemy SELECT query.
    dataId : `DataCoordinate`
        Data ID to attach to the `DatasetRef`.  If `None`, one is built
        from ``row`` via `extractDataId`, using the dimensions of the
        dataset type.
    records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
        Records forwarded to `extractDataId` when ``dataId`` is `None`.
        If provided, outer keys must include all dimension element names
        in ``graph``, and inner keys should be tuples of dimension primary
        key values in the same order as ``element.graph.required``.

    Returns
    -------
    ref : `DatasetRef`
        Reference to the dataset; guaranteed to have `DatasetRef.id` not
        `None`.
    """
    columns = self.getDatasetColumns()
    assert columns is not None
    datasetType = columns.datasetType
    if dataId is None:
        dataId = self.extractDataId(row, graph=datasetType.dimensions, records=records)
    # The row stores the run as a collection key; resolve it to a record
    # to recover the run name.
    runKey = row._mapping[columns.runKey]
    runRecord = self.managers.collections[runKey]
    datasetId = row._mapping[columns.id]
    return DatasetRef(datasetType, dataId, id=datasetId, run=runRecord.name)

492 

def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to hold this query's rows.

    Helper for subclass implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        If `True` (`False` is default), define a specification that
        includes actual foreign key constraints for logical foreign keys.
        Some database engines do not permit temporary tables to reference
        normal tables, so this should be `False` when generating a spec
        for a temporary table unless the database engine is known to
        support them.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    isUnique = self.isUnique()
    tableSpec = ddl.TableSpec(fields=())
    # One key per dimension; the keys form the primary key only when the
    # rows are known to be unique.
    for dimension in self.graph:
        addDimensionForeignKey(tableSpec, dimension, primaryKey=isUnique, constraint=constraints)
    # One nullable region field set per spatial element.
    for element in self.spatial:
        regionFields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{element.name}_region",
        )
        tableSpec.fields.update(regionFields)
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(tableSpec, primaryKey=isUnique, constraint=constraints)
        self.managers.collections.addRunForeignKey(tableSpec, nullable=False, constraint=constraints)
    return tableSpec

527 

528 def _makeSubsetQueryColumns(self, *, graph: Optional[DimensionGraph] = None, 

529 datasets: bool = True, 

530 unique: bool = False) -> Tuple[DimensionGraph, Optional[QueryColumns]]: 

531 """Helper method for subclass implementations of `subset`. 

532 

533 Parameters 

534 ---------- 

535 graph : `DimensionGraph`, optional 

536 Dimensions to include in the new `Query` being constructed. 

537 ``subset`` implementations should generally just forward their 

538 own ``graph`` argument here. 

539 datasets : `bool`, optional 

540 Whether the new `Query` should include dataset results. Defaults 

541 to `True`, but is ignored if ``self`` does not include dataset 

542 results. 

543 unique : `bool`, optional 

544 Whether the new `Query` should guarantee unique results (this may 

545 come with a performance penalty). 

546 

547 Returns 

548 ------- 

549 graph : `DimensionGraph` 

550 The dimensions of the new `Query`. This is exactly the same as 

551 the argument of the same name, with ``self.graph`` used if that 

552 argument is `None`. 

553 columns : `QueryColumns` or `None` 

554 A struct containing the SQLAlchemy column objects to use in the 

555 new query, contructed by delegating to other (mostly abstract) 

556 methods on ``self``. If `None`, `subset` may return ``self``. 

557 """ 

558 if graph is None: 

559 graph = self.graph 

560 if (graph == self.graph and (self.getDatasetColumns() is None or datasets) 

561 and (self.isUnique() or not unique)): 

562 return graph, None 

563 columns = QueryColumns() 

564 for dimension in graph.dimensions: 

565 col = self.getDimensionColumn(dimension.name) 

566 columns.keys[dimension] = [col] 

567 if not unique: 

568 for element in self.spatial: 

569 col = self.getRegionColumn(element.name) 

570 columns.regions[element] = col 

571 if datasets and self.getDatasetColumns() is not None: 

572 columns.datasets = self.getDatasetColumns() 

573 return graph, columns 

574 

@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    """Execute this query and insert its results into a temporary table.

    Parameters
    ----------
    db : `Database`
        Database engine to execute the query against.

    Returns
    -------
    context : `typing.ContextManager` [ `MaterializedQuery` ]
        A context manager that ensures the temporary table is created and
        populated in ``__enter__`` (returning a `MaterializedQuery` object
        backed by that table), and dropped in ``__exit__``.  If ``self``
        is already a `MaterializedQuery`, ``__enter__`` may just return
        ``self`` and ``__exit__`` may do nothing (reflecting the fact that
        an outer context manager should already take care of everything
        else).

    Notes
    -----
    The temporary table is dropped even when populating it, or the body
    of the ``with`` block, raises.  A doomed query creates the table but
    inserts nothing into it.
    """
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        # With @contextmanager, an exception from the ``with`` body is
        # re-raised at the ``yield``; without try/finally the drop below
        # would be skipped and the table left behind.
        try:
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(table=table,
                                    spatial=self.spatial,
                                    datasetType=self.datasetType,
                                    isUnique=self.isUnique(),
                                    graph=self.graph,
                                    whereRegion=self.whereRegion,
                                    managers=self.managers,
                                    doomed_by=self._doomed_by)
        finally:
            session.dropTemporaryTable(table)

609 

@abstractmethod
def subset(self, *, graph: Optional[DimensionGraph] = None,
           datasets: bool = True,
           unique: bool = False) -> Query:
    """Create a new `Query` whose columns and/or rows are (mostly) a subset
    of the columns and rows of this one.

    Parameters
    ----------
    graph : `DimensionGraph`, optional
        Dimensions to include in the new `Query` being constructed.
        If `None` (default), ``self.graph`` is used.
    datasets : `bool`, optional
        Whether the new `Query` should include dataset results.  Defaults
        to `True`, but is ignored if ``self`` does not include dataset
        results.
    unique : `bool`, optional
        Whether the new `Query` should guarantee unique results (this may
        come with a performance penalty).

    Returns
    -------
    query : `Query`
        A query object corresponding to the given inputs.  May be ``self``
        if no changes were requested.

    Notes
    -----
    With the current handling of spatial overlaps it is not possible, in
    general, to guarantee both that the new query's rows are a subset of
    this one's and that they are unique.  The database can only evaluate
    approximate, conservative overlaps through the common skypix system;
    exact region overlap tests are deferred to per-result-row Python
    logic (see `rows`).  That postprocessing needs the region columns in
    the query, and including them makes a SELECT DISTINCT on the
    user-visible dimensions impossible.

    As an example, take a query with dimensions (instrument, skymap,
    visit, tract).  It involves a spatial join between visit and tract,
    and the region columns of both tables are included in the results so
    that only rows whose two regions overlap are actually yielded.  If
    the user then asks for a subset with just (skymap, tract) and unique
    results, there are two unpalatable options:

    - do a SELECT DISTINCT with only the skymap and tract columns in the
      SELECT clause, dropping all detailed overlap information and
      including some tracts that did not actually overlap any of the
      visits in the original query (but were regarded as _possibly_
      overlapping via the coarser, common-skypix relationships);

    - keep the tract and visit region columns in the query and continue
      to filter out the non-overlapping pairs, but completely disregard
      the user's request for unique tracts.

    Implementations of this interface must do the former, because that
    is what makes things efficient in the most important use case
    (``QuantumGraph`` generation in ``pipe_base``).  The situation may
    improve in the future if exact overlap information is put in the
    database, either through built-in (but engine-specific) spatial
    database functionality or (more likely) by switching to a scheme in
    which pairwise dimension spatial relationships are explicitly
    precomputed (for e.g. combinations of instruments and skymaps).
    """
    raise NotImplementedError()

675 

@abstractmethod
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    """Create a `QueryBuilder` for constructing a new `Query` that is
    joined to (and hence constrained by) this one.

    Parameters
    ----------
    summary : `QuerySummary`, optional
        A `QuerySummary` instance that specifies the dimensions and any
        additional constraints to include in the new query being
        constructed, or `None` to use the dimensions of ``self`` with no
        additional constraints.

    Returns
    -------
    builder : `QueryBuilder`
        Builder for the new query.
    """
    raise NotImplementedError()

690 

691 graph: DimensionGraph 

692 """The dimensions identified by this query and included in any data IDs 

693 created from its result rows (`DimensionGraph`). 

694 """ 

695 

696 whereRegion: Optional[Region] 

697 """A spatial region that all regions in all rows returned by this query 

698 must overlap (`lsst.sphgeom.Region` or `None`). 

699 """ 

700 

701 managers: RegistryManagers 

702 """A struct containing `Registry` helper object (`RegistryManagers`). 

703 """ 

704 

705 

@enum.unique
class DirectQueryUniqueness(enum.Enum):
    """The different ways in which the rows of a query may (or may not) be
    unique.
    """

    NOT_UNIQUE = enum.auto()
    """No uniqueness is expected of the query's rows.
    """

    NATURALLY_UNIQUE = enum.auto()
    """The way the query is constructed already guarantees unique result
    rows; neither SELECT DISTINCT nor a GROUP BY clause is needed.
    """

    NEEDS_DISTINCT = enum.auto()
    """Unique result rows are expected, and the query has to use SELECT
    DISTINCT or an equivalent GROUP BY clause to obtain them.
    """

724 

725 

726class DirectQuery(Query): 

727 """A `Query` implementation that represents a direct SELECT query that 

728 usually joins many tables. 

729 

730 `DirectQuery` objects should generally only be constructed by 

731 `QueryBuilder` or the methods of other `Query` objects. 

732 

733 Parameters 

734 ---------- 

735 simpleQuery : `SimpleQuery` 

736 Struct representing the actual SELECT, FROM, and WHERE clauses. 

737 columns : `QueryColumns` 

738 Columns that are referenced in the query in any clause. 

739 uniqueness : `DirectQueryUniqueness` 

740 Enum value indicating whether the query should yield unique result 

741 rows, and if so whether that needs to be explicitly requested of the 

742 database. 

743 graph : `DimensionGraph` 

744 Object describing the dimensions included in the query. 

745 whereRegion : `lsst.sphgeom.Region`, optional 

746 Region that all region columns in all returned rows must overlap. 

747 managers : `RegistryManagers` 

748 Struct containing the `Registry` manager helper objects, to be 

749 forwarded to the `Query` constructor. 

750 doomed_by : `Iterable` [ `str` ], optional 

751 A list of messages (appropriate for e.g. logging or exceptions) that 

752 explain why the query is known to return no results even before it is 

753 executed. Queries with a non-empty list will never be executed. 

754 """ 

def __init__(
    self,
    *,
    simpleQuery: SimpleQuery,
    columns: QueryColumns,
    uniqueness: DirectQueryUniqueness,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    doomed_by: Iterable[str] = (),
):
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    # The SELECT columns are only ever filled in on a copy (see `sql`), so
    # the stored query must arrive without any.
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    # Caches of labeled columns, filled lazily by the getter methods.
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
    self._uniqueness = uniqueness
    self._columns = columns
    self._simpleQuery = simpleQuery

772 

def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Both NATURALLY_UNIQUE and NEEDS_DISTINCT yield unique rows; only
    # NOT_UNIQUE does not.
    notUnique = DirectQueryUniqueness.NOT_UNIQUE
    return self._uniqueness is not notUnique

776 

def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    # Labeled columns are cached so repeated lookups for the same dimension
    # return the same object.
    try:
        return self._dimensionColumns[name]
    except KeyError:
        labeled = self._columns.getKeyColumn(name).label(name)
        self._dimensionColumns[name] = labeled
        return labeled

784 

@property
def spatial(self) -> Iterator[DimensionElement]:
    # Docstring inherited from Query.
    # A fresh iterator over the keys of the region-column mapping is
    # returned on every access.
    regions = self._columns.regions
    return iter(regions)

789 

def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    # Labeled columns are cached so repeated lookups for the same element
    # return the same object.
    try:
        return self._regionColumns[name]
    except KeyError:
        labeled = self._columns.regions[name].column.label(f"{name}_region")
        self._regionColumns[name] = labeled
        return labeled

797 

def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    cached = self._datasetQueryColumns
    if cached is not None:
        return cached
    base = self._columns.datasets
    if base is None:
        # No dataset results in this query; nothing is cached.
        return None
    # The ingest date column is optional and is only labeled when present.
    ingestDate = base.ingestDate
    if ingestDate is not None:
        ingestDate = ingestDate.label("ingest_date")
    runKeyName = self.managers.collections.getRunForeignKeyName()
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=base.datasetType,
        id=base.id.label("dataset_id"),
        runKey=base.runKey.label(runKeyName),
        ingestDate=ingestDate,
    )
    return self._datasetQueryColumns

814 

@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Columns are added to a copy so the stored query keeps an empty SELECT
    # list.  The order is: dimension keys, region columns, dataset columns.
    query = self._simpleQuery.copy()
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        query.columns.extend(datasetColumns)
    combined = query.combine()
    needsDistinct = self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT
    return combined.distinct() if needsDistinct else combined

831 

832 def subset(self, *, graph: Optional[DimensionGraph] = None, 

833 datasets: bool = True, 

834 unique: bool = False) -> Query: 

835 # Docstring inherited from Query. 

836 graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique) 

837 if columns is None: 

838 return self 

839 if columns.isEmpty(): 

840 return EmptyQuery(self.graph.universe, self.managers) 

841 return DirectQuery( 

842 simpleQuery=self._simpleQuery.copy(), 

843 columns=columns, 

844 uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE, 

845 graph=graph, 

846 whereRegion=self.whereRegion if not unique else None, 

847 managers=self.managers, 

848 doomed_by=self._doomed_by, 

849 ) 

850 

851 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder: 

852 # Docstring inherited from Query. 

853 from ._builder import QueryBuilder 

854 if summary is None: 

855 summary = QuerySummary(self.graph, whereRegion=self.whereRegion) 

856 if not summary.requested.issubset(self.graph): 

857 raise NotImplementedError( 

858 f"Query.makeBuilder does not yet support augmenting dimensions " 

859 f"({summary.requested.dimensions}) beyond those originally included in the query " 

860 f"({self.graph.dimensions})." 

861 ) 

862 builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by) 

863 builder.joinTable(self.sql.alias(), dimensions=self.graph.dimensions, 

864 datasets=self.getDatasetColumns()) 

865 return builder 

866 

867 

class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(self, *,
                 table: sqlalchemy.schema.Table,
                 spatial: Iterable[DimensionElement],
                 datasetType: Optional[DatasetType],
                 isUnique: bool,
                 graph: DimensionGraph,
                 whereRegion: Optional[Region],
                 managers: RegistryManagers,
                 doomed_by: Iterable[str] = ()):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Stored as a tuple so the `spatial` property can be iterated more
        # than once even if a one-shot iterable was passed in.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Dimension columns in the table are named after the dimension.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Region columns in the table carry a "_region" suffix.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        # The ingest date is always reported as `None` here.
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        return self._table.select()

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # The results already live in a table, so no new one is created.
        yield self

    def subset(self, *, graph: Optional[DimensionGraph] = None,
               datasets: bool = True,
               unique: bool = False) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # No new column set was produced, so this query is returned as-is.
            return self
        if columns.isEmpty():
            # Forward the doom messages, exactly as the DirectQuery branch
            # below does.  Without them EmptyQuery.rows() would yield a row
            # for a query that is already known to return no results.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        # The subset selects from a fresh SimpleQuery whose only joined table
        # is the temporary table.
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            # The region constraint is only carried over for non-unique
            # subsets.
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here rather than at module scope.
        from ._builder import QueryBuilder
        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table is joined in directly; no subquery is needed.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder

990 

991 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The graph is always the universe's empty graph, with no region
        # constraint.
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No dimension {name} in query (no dimensions at all, actually)."
        raise KeyError(message)

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        nothing: Tuple[DimensionElement, ...] = ()
        return iter(nothing)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No region for {name} in query (no regions at all, actually)."
        raise KeyError(message)

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query produces nothing; otherwise exactly one `None` row
        # is produced.  Neither ``db`` nor ``region`` is used.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # There are no columns to store, so this object is yielded as-is.
        yield self

    def subset(
        self,
        *,
        graph: Optional[DimensionGraph] = None,
        datasets: bool = True,
        unique: bool = False,
    ) -> Query:
        # Docstring inherited from Query.
        # Any requested graph must be a subset of this query's graph, so the
        # subset is this same query.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here rather than at module scope.
        from ._builder import QueryBuilder
        if summary is None:
            summary = QuerySummary(self.graph)
        requested = summary.requested
        if not requested.issubset(self.graph):
            message = (
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
            raise NotImplementedError(message)
        # Nothing is joined in: this query has no table or subquery.
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)