Coverage for python/lsst/daf/butler/registry/queries/_query.py: 21%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

331 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25from abc import ABC, abstractmethod 

26from contextlib import contextmanager 

27import enum 

28import itertools 

29from typing import ( 

30 Dict, 

31 Iterable, 

32 Iterator, 

33 Mapping, 

34 Optional, 

35 Tuple, 

36 TYPE_CHECKING, 

37) 

38 

39import sqlalchemy 

40 

41from lsst.sphgeom import Region 

42 

43from ...core import ( 

44 addDimensionForeignKey, 

45 DataCoordinate, 

46 DatasetRef, 

47 DatasetType, 

48 ddl, 

49 Dimension, 

50 DimensionElement, 

51 DimensionGraph, 

52 DimensionRecord, 

53 DimensionUniverse, 

54 SpatialRegionDatabaseRepresentation, 

55 SimpleQuery, 

56) 

57from ..interfaces import Database 

58from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

59 

60if TYPE_CHECKING: 60 ↛ 61line 60 didn't jump to line 61, because the condition on line 60 was never true

61 from ._builder import QueryBuilder 

62 

63 

64class Query(ABC): 

65 """An abstract base class for queries that return some combination of 

66 `DatasetRef` and `DataCoordinate` objects. 

67 

68 Parameters 

69 ---------- 

70 graph : `DimensionGraph` 

71 Object describing the dimensions included in the query. 

72 whereRegion : `lsst.sphgeom.Region`, optional 

73 Region that all region columns in all returned rows must overlap. 

74 managers : `RegistryManagers` 

75 A struct containing the registry manager instances used by the query 

76 system. 

77 doomed_by : `Iterable` [ `str` ], optional 

78 A list of messages (appropriate for e.g. logging or exceptions) that 

79 explain why the query is known to return no results even before it is 

80 executed. Queries with a non-empty list will never be executed. 

81 

82 Notes 

83 ----- 

84 The `Query` hierarchy abstracts over the database/SQL representation of a 

85 particular set of data IDs or datasets. It is expected to be used as a 

86 backend for other objects that provide more natural interfaces for one or 

87 both of these, not as part of a public interface to query results. 

88 """ 

def __init__(
    self,
    *,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    doomed_by: Iterable[str] = (),
):
    # Public state, documented as class-level attributes.
    self.graph = graph
    self.whereRegion = whereRegion
    self.managers = managers
    # Snapshot the diagnostics into a tuple so that a single-pass iterable
    # can be tested for emptiness and re-yielded any number of times.
    self._doomed_by = tuple(doomed_by)
    # Numbers of rows dropped by the client-side region filtering in
    # `rows`; they stay `None` until `rows` resets them to zero.
    self._filtered_by_where: Optional[int] = None
    self._filtered_by_join: Optional[int] = None

101 

@abstractmethod
def isUnique(self) -> bool:
    """Report whether the rows of this query are guaranteed to be unique.

    Returns
    -------
    unique : `bool`
        `True` if uniqueness is guaranteed, `False` otherwise.

    Notes
    -----
    What "unique" refers to depends on the kind of results.  When the
    query has dataset results (`datasetType` is not `None`) it refers to
    the `DatasetRef` instances that `extractDatasetRef` produces from the
    rows yielded by `rows`; otherwise it refers to the `DataCoordinate`
    instances produced by `extractDataId`.
    """
    raise NotImplementedError()

114 

@abstractmethod
def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    """Look up the query column holding a dimension's primary key value.

    Parameters
    ----------
    name : `str`
        Name of the dimension.

    Returns
    -------
    column : `sqlalchemy.sql.ColumnElement`.
        SQLAlchemy object representing a column in the query.

    Notes
    -----
    This is mainly an implementation hook: subclasses provide it and the
    base class calls it to build higher-level functionality.  Code that
    merely uses `Query` objects should rarely need to call it directly.
    """
    raise NotImplementedError()

138 

@property
@abstractmethod
def spatial(self) -> Iterator[DimensionElement]:
    """The dimension elements whose region columns take part in the
    post-query filtering of spatial overlaps
    (`Iterator` [ `DimensionElement` ]).

    Notes
    -----
    This is mainly an implementation hook: subclasses provide it and the
    base class reads it to build higher-level functionality.  Code that
    merely uses `Query` objects should rarely need to access it directly.
    """
    raise NotImplementedError()

153 

@abstractmethod
def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    """Look up the region column of one of the dimension elements that
    `spatial` iterates over.

    Parameters
    ----------
    name : `str`
        Name of the element.

    Returns
    -------
    column : `sqlalchemy.sql.ColumnElement`
        SQLAlchemy object representing a result column in the query.

    Notes
    -----
    This is mainly an implementation hook: subclasses provide it and the
    base class calls it to build higher-level functionality.  Code that
    merely uses `Query` objects should rarely need to call it directly.
    """
    raise NotImplementedError()

177 

@property
def datasetType(self) -> Optional[DatasetType]:
    """The type of the datasets this query returns (`DatasetType`), or
    `None` when the query has no dataset results.
    """
    datasetColumns = self.getDatasetColumns()
    return None if datasetColumns is None else datasetColumns.datasetType

187 

def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
    """Count the rows this query would return.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        counted.  It is forwarded to `rows`, which falls back to
        ``self.whereRegion`` when this is `None`.
    exact : `bool`, optional
        If `True`, iterate over the filtered rows of `rows` so that
        post-query filtering is reflected in the count.  If `False`, ask
        the database for a row count instead; the result may then be an
        upper bound.

    Returns
    -------
    count : `int`
        The number of rows the query would return, or an upper bound if
        ``exact=False``.

    Notes
    -----
    Rows are counted, not *unique* rows, so even ``exact=True`` may only
    give an upper bound on the number of deduplicated result rows.

    A query that is known to be doomed counts as zero rows, and a query
    without SQL (``self.sql is None``) counts as exactly one row; neither
    touches the database.
    """
    if self._doomed_by:
        return 0
    querySql = self.sql
    if querySql is None:
        return 1
    # NOTE(review): ``self.spatial`` is annotated as an iterator; if that is
    # what subclasses return, this truth test never fails and every exact
    # count takes the row-iteration path.  Confirm that this is intended.
    if not (exact and self.spatial):
        counting = querySql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)
        return db.query(counting).scalar()
    return sum(1 for _ in self.rows(db, region=region))

230 

def any(
    self,
    db: Database, *,
    region: Optional[Region] = None,
    execute: bool = True,
    exact: bool = True,
) -> bool:
    """Test whether this query returns any results.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        yielded.  It is forwarded to `rows`, which falls back to
        ``self.whereRegion`` when this is `None`.
    execute : `bool`, optional
        If `True`, execute at least a ``LIMIT 1`` query if it cannot be
        determined prior to execution that the query would return no rows.
    exact : `bool`, optional
        If `True`, run the full query and perform post-query filtering if
        needed, until at least one result row is found.  If `False`, the
        returned result does not account for post-query filtering, and
        hence may be `True` even when all result rows would be filtered
        out.

    Returns
    -------
    any : `bool`
        `True` if the query would (or might, depending on arguments) yield
        result rows.  `False` if it definitely would not.

    Raises
    ------
    TypeError
        Raised if ``exact=True`` is combined with ``execute=False`` and
        the answer cannot be determined without running the query.

    Notes
    -----
    Post-query filtering only exists when the query has spatial elements,
    so without them a ``LIMIT 1`` query already gives the exact answer and
    `rows` is not called.
    """
    if self._doomed_by:
        return False
    sql = self.sql
    if sql is None:
        return True
    if exact and not execute:
        raise TypeError("Cannot obtain exact results without executing the query.")
    if exact:
        # ``self.spatial`` is an iterator, and iterator objects are always
        # truthy, so look for an actual first element rather than testing
        # the iterator itself.  (The builtin ``any`` is deliberately not
        # used here, to avoid confusion with this method's own name.)
        missing = object()
        needsPostFiltering = next(iter(self.spatial), missing) is not missing
    else:
        needsPostFiltering = False
    if needsPostFiltering:
        # Stop at the first row that survives the client-side filtering.
        for _ in self.rows(db, region=region):
            return True
        return False
    if execute:
        return db.query(sql.limit(1)).one_or_none() is not None
    return True

279 

def explain_no_results(
    self,
    db: Database, *,
    region: Optional[Region] = None,
    followup: bool = True,
) -> Iterator[str]:
    """Yield human-readable messages that may help explain why the query
    produces no results.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        yielded.  Here it is only forwarded to the follow-up queries.
    followup : `bool`, optional
        If `True` (default) perform inexpensive follow-up queries if no
        diagnostics are available from query generation alone.

    Returns
    -------
    messages : `Iterator` [ `str` ]
        String messages that describe reasons the query might not yield any
        results.

    Notes
    -----
    Messages about post-query filtering rely on the counters maintained by
    `rows`, so they are only available once `rows` has run (directly, or
    through `any` or `count` with ``exact=True``) with the same region.
    """
    # Imported here rather than at module scope; the module-level import of
    # this name is guarded by TYPE_CHECKING.
    from ._builder import QueryBuilder

    # Diagnostics gathered while the query was built take precedence.
    if self._doomed_by:
        yield from self._doomed_by
        return
    if self._filtered_by_where:
        yield (
            f"{self._filtered_by_where} result rows were filtered out because "
            "one or more region did not overlap the WHERE-clause region."
        )
    if self._filtered_by_join:
        yield (
            f"{self._filtered_by_join} result rows were filtered out because "
            "one or more regions did not overlap."
        )
    mayFollowUp = followup and not (self._filtered_by_join or self._filtered_by_where)
    if not mayFollowUp:
        return
    # Nothing was filtered on the client side and the caller allows extra
    # queries: check whether each element involved has any dimension
    # records at all, and stop at the first element that has none.
    for element in self.graph.elements:
        probe = QueryBuilder(QuerySummary(element.graph), self.managers).finish()
        if probe.any(db, exact=False):
            continue
        yield f"No dimension records for element '{element.name}' found."
        yield from probe.explain_no_results(db, region=region, followup=False)
        return

341 

@abstractmethod
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    """Return the columns describing the datasets this query returns.

    Returns
    -------
    columns : `DatasetQueryColumns` or `None`
        Struct containing SQLAlchemy representations of the result columns
        for a dataset.  The base class treats `None` as "this query has no
        dataset results".

    Notes
    -----
    This is mainly an implementation hook: subclasses provide it and the
    base class calls it to build higher-level functionality.  Code that
    merely uses `Query` objects should rarely need to call it directly.
    """
    raise NotImplementedError()

360 

@property
@abstractmethod
def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
    """The full query as a SQLAlchemy object
    (`sqlalchemy.sql.FromClause` or `None`).

    `None` is reserved for the special case of a query that has no
    columns and only one logical row.
    """
    raise NotImplementedError()

371 

def rows(self, db: Database, *, region: Optional[Region] = None
         ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
    """Execute the query and yield result rows, dropping rows whose regions
    do not overlap.

    Parameters
    ----------
    db : `Database`
        Object managing the database connection.
    region : `sphgeom.Region`, optional
        A region that any result-row regions must overlap in order to be
        yielded.  If not provided, this will be ``self.whereRegion``, if
        that exists.

    Yields
    ------
    row : `sqlalchemy.engine.RowProxy` or `None`
        Result row from the query.  `None` may be yielded exactly once
        instead of any real rows to indicate an empty query (see
        `EmptyQuery`).

    Notes
    -----
    A row is dropped when any of its region values is disjoint from the
    WHERE-clause region, or when any two of its region values are disjoint
    from each other.  The numbers of rows dropped for each reason are
    recorded on ``self`` (and reset at the start of every execution) so
    that `explain_no_results` can report them.  Nothing is executed, and
    the counters are left untouched, when the query is known to be doomed.
    """
    if self._doomed_by:
        return
    whereRegion = region if region is not None else self.whereRegion
    self._filtered_by_where = 0
    self._filtered_by_join = 0
    sql = self.sql
    # The region columns are the same for every row, so look them up once
    # instead of once per row.
    regionColumns = [self.getRegionColumn(element.name) for element in self.spatial]
    for row in db.query(sql):
        rowRegions = [row._mapping[column] for column in regionColumns]
        if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
            self._filtered_by_where += 1
            continue
        if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
            self._filtered_by_join += 1
            continue
        yield row

405 

def extractDimensionsTuple(self, row: Optional[sqlalchemy.engine.RowProxy],
                           dimensions: Iterable[Dimension]) -> tuple:
    """Pull the primary key values of the given dimensions out of a result
    row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy` or `None`
        A result row from a SQLAlchemy SELECT query, or `None` to indicate
        the row from an `EmptyQuery`.
    dimensions : `Iterable` [ `Dimension` ]
        The dimensions to include in the returned tuple, in order.

    Returns
    -------
    values : `tuple`
        A tuple of dimension primary key values, in the order of
        ``dimensions``.
    """
    if row is not None:
        values = []
        for dimension in dimensions:
            column = self.getDimensionColumn(dimension.name)
            values.append(row._mapping[column])
        return tuple(values)
    # A `None` row carries no values, so it can only satisfy a request for
    # no dimensions at all.
    assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
    return ()

427 

def extractDataId(self, row: Optional[sqlalchemy.engine.RowProxy], *,
                  graph: Optional[DimensionGraph] = None,
                  records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                  ) -> DataCoordinate:
    """Build a data ID from a result row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy` or `None`
        A result row from a SQLAlchemy SELECT query, or `None` to indicate
        the row from an `EmptyQuery`.
    graph : `DimensionGraph`, optional
        The dimensions the returned data ID should identify.  If not
        provided, ``self.graph`` is used.
    records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
        Nested mapping containing records to attach to the returned
        `DataCoordinate`.  If provided, outer keys must include all
        dimension element names in ``graph``, and inner keys should be
        tuples of dimension primary key values in the same order as
        ``element.graph.required``.  An inner key that is missing yields
        `None` for that element rather than an error.

    Returns
    -------
    dataId : `DataCoordinate`
        A data ID built from the values of all required and implied
        dimensions of ``graph``.  If ``records is not None``, this is the
        result of expanding that data ID with the matching records.
    """
    dims = self.graph if graph is None else graph
    if not dims:
        # No dimensions to identify: no need to look at the row at all.
        return DataCoordinate.makeEmpty(self.graph.universe)
    keyDimensions = itertools.chain(dims.required, dims.implied)
    dataId = DataCoordinate.fromFullValues(dims, self.extractDimensionsTuple(row, keyDimensions))
    if records is None:
        return dataId
    recordsForRow = {
        element.name: records[element.name].get(tuple(dataId.subset(element.graph).values()))
        for element in dims.elements
    }
    return dataId.expanded(recordsForRow)

475 

def extractDatasetRef(self, row: sqlalchemy.engine.RowProxy,
                      dataId: Optional[DataCoordinate] = None,
                      records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                      ) -> DatasetRef:
    """Build a `DatasetRef` from a result row.

    Parameters
    ----------
    row : `sqlalchemy.engine.RowProxy`
        A result row from a SQLAlchemy SELECT query.
    dataId : `DataCoordinate`
        Data ID to attach to the `DatasetRef`.  If `None`, one is extracted
        from ``row`` with `extractDataId`, using the dimensions of the
        dataset type.
    records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
        Records forwarded to `extractDataId` when ``dataId`` is `None`; see
        that method for the expected structure.  Unused otherwise.

    Returns
    -------
    ref : `DatasetRef`
        Reference to the dataset, constructed with the dataset ID and run
        name found via ``row``.
    """
    datasetColumns = self.getDatasetColumns()
    # Only meaningful for queries that actually have dataset results.
    assert datasetColumns is not None
    refDatasetType = datasetColumns.datasetType
    if dataId is None:
        dataId = self.extractDataId(row, graph=refDatasetType.dimensions, records=records)
    # The row stores a key for the run; the collection manager maps that
    # key to the record that carries the run's name.
    runKey = row._mapping[datasetColumns.runKey]
    runName = self.managers.collections[runKey].name
    datasetId = row._mapping[datasetColumns.id]
    return DatasetRef(refDatasetType, dataId, id=datasetId, run=runName)

508 

def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to hold this query's rows.

    This is a helper for implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        If `True` (`False` is default), define a specification that
        includes actual foreign key constraints for logical foreign keys.
        Some database engines do not permit temporary tables to reference
        normal tables, so this should be `False` when generating a spec
        for a temporary table unless the database engine is known to
        support them.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    # Key columns are flagged as primary key only when rows are unique.
    isUnique = self.isUnique()
    spec = ddl.TableSpec(fields=())
    # One key per dimension in the query ...
    for dimension in self.graph:
        addDimensionForeignKey(spec, dimension, primaryKey=isUnique, constraint=constraints)
    # ... one nullable region field set per spatial element ...
    for element in self.spatial:
        regionFields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            name=f"{element.name}_region",
            nullable=True,
        )
        spec.fields.update(regionFields)
    # ... and dataset/run keys when the query has dataset results.
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(spec, primaryKey=isUnique, constraint=constraints)
        self.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints)
    return spec

543 

def _makeSubsetQueryColumns(self, *, graph: Optional[DimensionGraph] = None,
                            datasets: bool = True,
                            unique: bool = False) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
    """Work out the dimensions and columns of a subset of this query.

    This is a helper for implementations of `subset`.

    Parameters
    ----------
    graph : `DimensionGraph`, optional
        Dimensions to include in the new `Query` being constructed.
        ``subset`` implementations should generally just forward their
        own ``graph`` argument here.
    datasets : `bool`, optional
        Whether the new `Query` should include dataset results.  Defaults
        to `True`, but is ignored if ``self`` does not include dataset
        results.
    unique : `bool`, optional
        Whether the new `Query` should guarantee unique results (this may
        come with a performance penalty).

    Returns
    -------
    graph : `DimensionGraph`
        The dimensions of the new `Query`.  This is exactly the same as
        the argument of the same name, with ``self.graph`` used if that
        argument is `None`.
    columns : `QueryColumns` or `None`
        A struct containing the SQLAlchemy column objects to use in the
        new query, constructed by delegating to other (mostly abstract)
        methods on ``self``.  If `None`, `subset` may return ``self``.
    """
    if graph is None:
        graph = self.graph
    # Nothing would change if the dimensions are the same, no dataset
    # columns need to be dropped, and uniqueness is already satisfied.
    if graph == self.graph:
        datasetsUnchanged = self.getDatasetColumns() is None or datasets
        if datasetsUnchanged and (self.isUnique() or not unique):
            return graph, None
    columns = QueryColumns()
    for dimension in graph.dimensions:
        columns.keys[dimension] = [self.getDimensionColumn(dimension.name)]
    # Region columns are only carried over when uniqueness is not requested.
    if not unique:
        for element in self.spatial:
            columns.regions[element] = self.getRegionColumn(element.name)
    if datasets and self.getDatasetColumns() is not None:
        columns.datasets = self.getDatasetColumns()
    return graph, columns

590 

@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    """Execute this query and insert its results into a temporary table.

    Parameters
    ----------
    db : `Database`
        Database engine to execute the query against.

    Returns
    -------
    context : `typing.ContextManager` [ `MaterializedQuery` ]
        A context manager that ensures the temporary table is created and
        populated in ``__enter__`` (returning a `MaterializedQuery` object
        backed by that table), and dropped in ``__exit__``.  If ``self``
        is already a `MaterializedQuery`, ``__enter__`` may just return
        ``self`` and ``__exit__`` may do nothing (reflecting the fact that
        an outer context manager should already take care of everything
        else).

    Notes
    -----
    The table is dropped even when populating it fails or when the body of
    the ``with`` block raises.  Nothing is inserted when the query is known
    to be doomed; the resulting table is then simply left empty.
    """
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        # Once the table exists it must be dropped on every exit path,
        # including exceptions raised by the insert or by the caller's
        # ``with`` body (which are re-raised here at the ``yield``).
        try:
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(table=table,
                                    spatial=self.spatial,
                                    datasetType=self.datasetType,
                                    isUnique=self.isUnique(),
                                    graph=self.graph,
                                    whereRegion=self.whereRegion,
                                    managers=self.managers,
                                    doomed_by=self._doomed_by)
        finally:
            session.dropTemporaryTable(table)

625 

@abstractmethod
def subset(self, *, graph: Optional[DimensionGraph] = None,
           datasets: bool = True,
           unique: bool = False) -> Query:
    """Return a new `Query` whose columns and/or rows are (mostly) a
    subset of this one's.

    Parameters
    ----------
    graph : `DimensionGraph`, optional
        Dimensions to include in the new `Query` being constructed.
        If `None` (default), ``self.graph`` is used.
    datasets : `bool`, optional
        Whether the new `Query` should include dataset results.  Defaults
        to `True`, but is ignored if ``self`` does not include dataset
        results.
    unique : `bool`, optional
        Whether the new `Query` should guarantee unique results (this may
        come with a performance penalty).

    Returns
    -------
    query : `Query`
        A query object corresponding to the given inputs.  May be ``self``
        if no changes were requested.

    Notes
    -----
    With spatial overlaps handled the way they are at present, it is not
    possible in general to guarantee both that the new query's rows are a
    subset of this one's and that they are unique.  The database can only
    evaluate approximate, conservative overlaps via the common skypix
    system, so the actual region overlap tests are deferred to
    per-result-row Python logic (see `rows`).  That postprocessing needs
    the region columns in the query, and having them there makes a
    SELECT DISTINCT on the user-visible dimensions impossible.

    As an example, take a query with dimensions (instrument, skymap,
    visit, tract).  It involves a spatial join between visit and tract,
    and the region columns of both tables are included in the results so
    that only rows in which those two regions overlap are yielded.  If the
    user then asks for a subset with just (skymap, tract) and unique
    results, there are two unpalatable options:

    - do a SELECT DISTINCT with only the skymap and tract columns in the
      SELECT clause, dropping all detailed overlap information and
      including some tracts that did not actually overlap any of the
      visits in the original query (but were regarded as _possibly_
      overlapping via the coarser, common-skypix relationships);

    - keep the tract and visit region columns in the query and continue
      to filter out the non-overlapping pairs, but completely disregard
      the user's request for unique tracts.

    This interface requires implementations to do the former, because
    that is what makes things efficient in our most important use case
    (``QuantumGraph`` generation in ``pipe_base``).  The situation may
    improve in the future if exact overlap information is put in the
    database, either via built-in (but engine-specific) spatial database
    functionality or (more likely) by switching to a scheme in which
    pairwise dimension spatial relationships are explicitly precomputed
    (for e.g. combinations of instruments and skymaps).
    """
    raise NotImplementedError()

691 

@abstractmethod
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    """Create a `QueryBuilder` for constructing a new `Query` that is
    joined to (and hence constrained by) this one.

    Parameters
    ----------
    summary : `QuerySummary`, optional
        A `QuerySummary` instance that specifies the dimensions and any
        additional constraints to include in the new query being
        constructed, or `None` to use the dimensions of ``self`` with no
        additional constraints.

    Returns
    -------
    builder : `QueryBuilder`
        Builder for the new, joined query.
    """
    raise NotImplementedError()

706 

707 graph: DimensionGraph 

708 """The dimensions identified by this query and included in any data IDs 

709 created from its result rows (`DimensionGraph`). 

710 """ 

711 

712 whereRegion: Optional[Region] 

713 """A spatial region that all regions in all rows returned by this query 

714 must overlap (`lsst.sphgeom.Region` or `None`). 

715 """ 

716 

717 managers: RegistryManagers 

718 """A struct containing `Registry` helper object (`RegistryManagers`). 

719 """ 

720 

721 

class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways in which the rows of a query can be unique
    (or not).
    """

    NOT_UNIQUE = enum.auto()
    """No uniqueness is expected of the query's rows.
    """

    NATURALLY_UNIQUE = enum.auto()
    """The rows are unique by construction of the query, so neither
    SELECT DISTINCT nor a GROUP BY clause is required.
    """

    NEEDS_DISTINCT = enum.auto()
    """The rows are expected to be unique, and SELECT DISTINCT (or an
    equivalent GROUP BY clause) has to be used to make them so.
    """

740 

741 

742class DirectQuery(Query): 

743 """A `Query` implementation that represents a direct SELECT query that 

744 usually joins many tables. 

745 

746 `DirectQuery` objects should generally only be constructed by 

747 `QueryBuilder` or the methods of other `Query` objects. 

748 

749 Parameters 

750 ---------- 

751 simpleQuery : `SimpleQuery` 

752 Struct representing the actual SELECT, FROM, and WHERE clauses. 

753 columns : `QueryColumns` 

754 Columns that are referenced in the query in any clause. 

755 uniqueness : `DirectQueryUniqueness` 

756 Enum value indicating whether the query should yield unique result 

757 rows, and if so whether that needs to be explicitly requested of the 

758 database. 

759 graph : `DimensionGraph` 

760 Object describing the dimensions included in the query. 

761 whereRegion : `lsst.sphgeom.Region`, optional 

762 Region that all region columns in all returned rows must overlap. 

763 managers : `RegistryManagers` 

764 Struct containing the `Registry` manager helper objects, to be 

765 forwarded to the `Query` constructor. 

766 doomed_by : `Iterable` [ `str` ], optional 

767 A list of messages (appropriate for e.g. logging or exceptions) that 

768 explain why the query is known to return no results even before it is 

769 executed. Queries with a non-empty list will never be executed. 

770 """ 

def __init__(
    self,
    *,
    simpleQuery: SimpleQuery,
    columns: QueryColumns,
    uniqueness: DirectQueryUniqueness,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    doomed_by: Iterable[str] = (),
):
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    # The SELECT columns are added to a copy of the query each time `sql`
    # is evaluated, so the query handed in here must not carry any yet.
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    self._simpleQuery = simpleQuery
    self._columns = columns
    self._uniqueness = uniqueness
    # Caches of the labeled columns, filled in on first request by
    # `getDimensionColumn`, `getRegionColumn` and `getDatasetColumns`.
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None

788 

def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Every uniqueness mode other than NOT_UNIQUE promises unique rows.
    notUnique = DirectQueryUniqueness.NOT_UNIQUE
    return self._uniqueness is not notUnique

792 

def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cache = self._dimensionColumns
    cached = cache.get(name)
    if cached is not None:
        return cached
    # First request for this dimension: label the key column with the
    # dimension name and remember it, so later calls return the same object.
    labeled = self._columns.getKeyColumn(name).label(name)
    cache[name] = labeled
    return labeled

800 

@property
def spatial(self) -> Iterator[DimensionElement]:
    # Docstring inherited from Query.
    # A fresh iterator over the keys of the region-column mapping is
    # produced on every access.
    regions = self._columns.regions
    return iter(regions)

805 

def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cache = self._regionColumns
    cached = cache.get(name)
    if cached is not None:
        return cached
    # First request for this element: label its region column as
    # "<name>_region" and remember it, so later calls return the same object.
    labeled = self._columns.regions[name].column.label(f"{name}_region")
    cache[name] = labeled
    return labeled

813 

def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    if self._datasetQueryColumns is not None:
        # Already built by an earlier call.
        return self._datasetQueryColumns
    source = self._columns.datasets
    if source is None:
        # The query carries no dataset columns; nothing is cached.
        return None
    # The ingest date is optional and is only labeled when present.
    rawIngestDate = source.ingestDate
    labeledIngestDate = rawIngestDate.label("ingest_date") if rawIngestDate is not None else None
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=source.datasetType,
        id=source.id.label("dataset_id"),
        runKey=source.runKey.label(self.managers.collections.getRunForeignKeyName()),
        ingestDate=labeledIngestDate,
    )
    return self._datasetQueryColumns

830 

@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Work on a copy so the stored SimpleQuery never acquires columns.
    query = self._simpleQuery.copy()
    # SELECT list order: dimension keys, then regions, then dataset columns.
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        query.columns.extend(datasetColumns)
    combined = query.combine()
    if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
        combined = combined.distinct()
    return combined

847 

def subset(self, *, graph: Optional[DimensionGraph] = None,
           datasets: bool = True,
           unique: bool = False) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # NOTE(review): presumably signals that the requested subset is
        # identical to this query - confirm against _makeSubsetQueryColumns.
        return self
    if columns.isEmpty():
        # A query already known to return nothing must stay doomed when it
        # is projected onto zero columns; otherwise the EmptyQuery would
        # yield a row and the diagnostic messages would be lost.
        return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
    if unique:
        uniqueness = DirectQueryUniqueness.NEEDS_DISTINCT
        # The region constraint is not carried over to a DISTINCT subset.
        whereRegion = None
    else:
        uniqueness = DirectQueryUniqueness.NOT_UNIQUE
        whereRegion = self.whereRegion
    return DirectQuery(
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=uniqueness,
        graph=graph,
        whereRegion=whereRegion,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )

866 

def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    # Imported here rather than at module scope (presumably to avoid an
    # import cycle with ._builder - confirm).
    from ._builder import QueryBuilder
    if summary is None:
        # Default to a summary covering exactly this query's dimensions and
        # region constraint.
        summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
    requested = summary.requested
    if not requested.issubset(self.graph):
        message = (
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )
        raise NotImplementedError(message)
    builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
    # This query becomes an aliased subquery joined into the new builder.
    subquery = self.sql.alias()
    builder.joinTable(subquery, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
    return builder

882 

883 

class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(self, *,
                 table: sqlalchemy.schema.Table,
                 spatial: Iterable[DimensionElement],
                 datasetType: Optional[DatasetType],
                 isUnique: bool,
                 graph: DimensionGraph,
                 whereRegion: Optional[Region],
                 managers: RegistryManagers,
                 doomed_by: Iterable[str] = ()):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Stored as a tuple so the ``spatial`` property can hand out a fresh
        # iterator each time, even if the caller passed a one-shot iterable.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Table columns are looked up by the dimension name itself.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Region columns are looked up as "<name>_region".
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        # The ingest date is never reported for materialized results.
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        return self._table.select()

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table, so no new table is created.
        yield self

    def subset(self, *, graph: Optional[DimensionGraph] = None,
               datasets: bool = True,
               unique: bool = False) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # NOTE(review): presumably signals that the requested subset is
            # identical to this query - confirm against
            # _makeSubsetQueryColumns.
            return self
        if columns.isEmpty():
            # A query already known to return nothing must stay doomed when
            # it is projected onto zero columns; otherwise the EmptyQuery
            # would yield a row and the diagnostic messages would be lost.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        # The subset selects directly from the temporary table.
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            # The region constraint is not carried over to a DISTINCT subset.
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here rather than at module scope (presumably to avoid an
        # import cycle with ._builder - confirm).
        from ._builder import QueryBuilder
        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table is joined in directly; no subquery is needed.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder

1006 

1007 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the
    query would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The graph is always the universe's empty graph and there is never
        # a region constraint.
        emptyGraph = universe.empty
        super().__init__(graph=emptyGraph, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No dimension {name} in query (no dimensions at all, actually)."
        raise KeyError(message)

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        nothing = ()
        return iter(nothing)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No region for {name} in query (no regions at all, actually)."
        raise KeyError(message)

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # The database is never touched: a doomed query yields nothing, and
        # any other empty query yields exactly one `None` row.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # No table is created; the query stands in for itself.
        yield self

    def subset(
        self,
        *,
        graph: Optional[DimensionGraph] = None,
        datasets: bool = True,
        unique: bool = False,
    ) -> Query:
        # Docstring inherited from Query.
        # Only a subset of this query's own (empty) graph may be requested,
        # so the result is always this query itself.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here rather than at module scope (presumably to avoid an
        # import cycle with ._builder - confirm).
        from ._builder import QueryBuilder
        if summary is None:
            summary = QuerySummary(self.graph)
        requested = summary.requested
        if not requested.issubset(self.graph):
            message = (
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
            raise NotImplementedError(message)
        # There is nothing to join in: the builder starts from the summary
        # alone, keeping any doomed-by messages.
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)