Coverage for python/lsst/daf/butler/registry/queries/_query.py: 24%

363 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-08-26 09:24 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("Query",) 

24 

25import dataclasses 

26import enum 

27import itertools 

28from abc import ABC, abstractmethod 

29from contextlib import contextmanager 

30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple 

31 

32import sqlalchemy 

33from lsst.sphgeom import Region 

34 

35from ...core import ( 

36 DataCoordinate, 

37 DatasetRef, 

38 DatasetType, 

39 Dimension, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 SimpleQuery, 

45 SpatialRegionDatabaseRepresentation, 

46 addDimensionForeignKey, 

47 ddl, 

48) 

49from ..interfaces import Database 

50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers 

51 

52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true

53 from ._builder import QueryBuilder 

54 

55 

@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Information about a single column in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort by (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element for use in ORDER BY clause
        (`sqlalchemy.sql.ColumnElement`).

        This is ``column.asc()`` when `ordering` is true and
        ``column.desc()`` otherwise.
        """
        return self.column.asc() if self.ordering else self.column.desc()

72 

73 

class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.managers = managers
        # Materialize so the messages can be tested for emptiness and
        # iterated more than once.
        self._doomed_by = tuple(doomed_by)
        # Counters of rows dropped by post-query filtering; `None` until
        # `rows` has been run at least once.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # No columns and a single logical row (see `sql`).
            return 1
        # NOTE(review): ``spatial`` is annotated as an `Iterator`, and
        # iterator objects are truthy even when they yield nothing, so this
        # branch is probably taken whenever ``exact`` is true.  Left unchanged
        # because the SQL COUNT below may not agree with row iteration in all
        # cases (e.g. DISTINCT or LIMIT) - confirm before tightening.
        if exact and self.spatial:
            return sum(1 for _ in self.rows(db, region=region))
        else:
            return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar()

    def any(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.

        Raises
        ------
        TypeError
            Raised if ``exact=True`` is combined with ``execute=False`` for a
            query that is not already known to be empty or trivially
            non-empty.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        # NOTE(review): see `count`; ``self.spatial`` is an iterator and so
        # is probably always truthy here.
        if exact and self.spatial:
            # Stop at the first row that survives post-query filtering.
            for _ in self.rows(db, region=region):
                return True
            return False
        elif execute:
            return db.query(sql.limit(1)).one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Yields
        ------
        message : `str`
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        # Imported here rather than at module scope (the module only imports
        # it under TYPE_CHECKING), and only once it is actually needed.
        from ._builder import QueryBuilder

        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.managers)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, region=region, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        """Execute the query and yield result rows, applying post-query
        spatial filtering.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).

        Notes
        -----
        Each call resets the counters of filtered rows that are reported by
        `explain_no_results`.
        """
        if self._doomed_by:
            return
        whereRegion = region if region is not None else self.whereRegion
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        # The set of region columns is the same for every row, so resolve it
        # once instead of once per result row.
        regionColumns = [self.getRegionColumn(element.name) for element in self.spatial]
        for row in db.query(self.sql):
            rowRegions = [row._mapping[column] for column in regionColumns]
            # Drop rows with any region that misses the WHERE-clause region.
            if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Drop rows in which any pair of regions does not overlap.
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.RowProxy],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, ``self.graph`` is used.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is None:
            return dataId
        recordsForRow = {}
        for element in graph.elements:
            key = tuple(dataId.subset(element.graph).values())
            # A missing record is stored as `None` rather than raising.
            recordsForRow[element.name] = records[element.name].get(key)
        return dataId.expanded(recordsForRow)

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.RowProxy,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  A minimal (i.e. base class)
            `DataCoordinate` is constructed from ``row`` if `None`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records to use to return an `ExpandedDataCoordinate`.  If provided,
            outer keys must include all dimension element names in ``graph``,
            and inner keys should be tuples of dimension primary key values
            in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        datasetColumns = self.getDatasetColumns()
        if (
            graph == self.graph
            and (datasetColumns is None or datasets)
            and (self.isUnique() or not unique)
        ):
            # Nothing would change relative to ``self``.
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        if not unique:
            # Region columns are only carried along when uniqueness is not
            # requested (see the notes on `subset`).
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and datasetColumns is not None:
            columns.datasets = datasetColumns
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `rows`) where the regions in those two columns overlap.  If the
        user then wants to subset to just (skymap, tract) with unique results,
        we have two unpalatable options:

         - we can do a SELECT DISTINCT with just the skymap and tract columns
           in the SELECT clause, dropping all detailed overlap information and
           including some tracts that did not actually overlap any of the
           visits in the original query (but were regarded as _possibly_
           overlapping via the coarser, common-skypix relationships);

         - we can include the tract and visit region columns in the query, and
           continue to filter out the non-overlapping pairs, but completely
           disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    managers: RegistryManagers
    """A struct containing `Registry` helper object (`RegistryManagers`).
    """

696 

697 

class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways a query can relate to row uniqueness."""

    # Values are spelled out; they match what ``enum.auto()`` would assign.
    NOT_UNIQUE = 1
    """Rows of the query are not expected to be unique.
    """

    NATURALLY_UNIQUE = 2
    """Rows are unique by construction of the query, with no need for
    SELECT DISTINCT or a GROUP BY clause.
    """

    NEEDS_DISTINCT = 3
    """Rows are expected to be unique, but only if the query uses
    SELECT DISTINCT or an equivalent GROUP BY clause.
    """

716 

717 

718class DirectQuery(Query): 

719 """A `Query` implementation that represents a direct SELECT query that 

720 usually joins many tables. 

721 

722 `DirectQuery` objects should generally only be constructed by 

723 `QueryBuilder` or the methods of other `Query` objects. 

724 

725 Parameters 

726 ---------- 

727 simpleQuery : `SimpleQuery` 

728 Struct representing the actual SELECT, FROM, and WHERE clauses. 

729 columns : `QueryColumns` 

730 Columns that are referenced in the query in any clause. 

731 uniqueness : `DirectQueryUniqueness` 

732 Enum value indicating whether the query should yield unique result 

733 rows, and if so whether that needs to be explicitly requested of the 

734 database. 

735 graph : `DimensionGraph` 

736 Object describing the dimensions included in the query. 

737 whereRegion : `lsst.sphgeom.Region`, optional 

738 Region that all region columns in all returned rows must overlap. 

739 managers : `RegistryManagers` 

740 Struct containing the `Registry` manager helper objects, to be 

741 forwarded to the `Query` constructor. 

742 doomed_by : `Iterable` [ `str` ], optional 

743 A list of messages (appropriate for e.g. logging or exceptions) that 

744 explain why the query is known to return no results even before it is 

745 executed. Queries with a non-empty list will never be executed. 

746 """ 

747 

def __init__(
    self,
    *,
    simpleQuery: SimpleQuery,
    columns: QueryColumns,
    uniqueness: DirectQueryUniqueness,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    order_by_columns: Iterable[OrderByColumn] = (),
    limit: Optional[Tuple[int, Optional[int]]] = None,
    doomed_by: Iterable[str] = (),
):
    # ``order_by_columns`` lists the columns used to build the ORDER BY
    # clause in `sql`; ``limit`` is forwarded unchanged to the copy of
    # ``simpleQuery`` built there.
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    self._simpleQuery = simpleQuery
    self._columns = columns
    self._uniqueness = uniqueness
    # Materialize the iterable: `sql` tests it for truth and iterates it on
    # every access, which a one-shot iterator (e.g. a generator) would not
    # survive.
    self._order_by_columns = tuple(order_by_columns)
    self._limit = limit
    # Lazily filled caches of labeled columns, so repeated lookups return
    # the same SQLAlchemy objects.
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}

772 

def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Every uniqueness state other than NOT_UNIQUE counts as unique.
    notUnique = DirectQueryUniqueness.NOT_UNIQUE
    return not (self._uniqueness is notUnique)

776 

def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._dimensionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this dimension: label the key column with the
    # dimension name and remember it for later calls.
    labeled = self._columns.getKeyColumn(name).label(name)
    self._dimensionColumns[name] = labeled
    return labeled

784 

@property
def spatial(self) -> Iterator[DimensionElement]:
    # Docstring inherited from Query.
    # A fresh iterator over the region mapping is produced on each access.
    regions = self._columns.regions
    return iter(regions)

789 

def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._regionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this element: label its region column as
    # "<name>_region" and remember it for later calls.
    labeled = self._columns.regions[name].column.label(f"{name}_region")
    self._regionColumns[name] = labeled
    return labeled

797 

def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    if self._datasetQueryColumns is not None:
        # Already built on an earlier call.
        return self._datasetQueryColumns
    source = self._columns.datasets
    if source is None:
        # This query has no dataset results; nothing is cached.
        return None
    # The ingest date column is optional; label it only when present.
    labeledIngestDate = source.ingestDate
    if labeledIngestDate is not None:
        labeledIngestDate = labeledIngestDate.label("ingest_date")
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=source.datasetType,
        id=source.id.label("dataset_id"),
        runKey=source.runKey.label(self.managers.collections.getRunForeignKeyName()),
        ingestDate=labeledIngestDate,
    )
    return self._datasetQueryColumns

814 

@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Work on a copy so the stored SimpleQuery never gains columns.
    query = self._simpleQuery.copy()

    # SELECT list, in order: dimension keys, region columns, dataset columns.
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        query.columns.extend(datasetColumns)

    assert not query.order_by, "Input query cannot have ORDER BY"
    if self._order_by_columns:
        # Ordering is expressed through one extra "_orderby" column holding
        # a row number computed over the requested ordering; the query is
        # then sorted on that column.
        ordering = [spec.column_order for spec in self._order_by_columns]
        rank = sqlalchemy.func.row_number().over(order_by=ordering).label("_orderby")
        query.columns.append(rank)
        query.order_by = [rank]

    assert query.limit is None, "Input query cannot have LIMIT"
    query.limit = self._limit

    combined = query.combine()
    if self._uniqueness is not DirectQueryUniqueness.NEEDS_DISTINCT:
        return combined
    return combined.distinct()

844 

def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build a specification for a table able to hold this query's rows.

    This is a helper for subclass implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        Whether logical foreign keys should become actual foreign key
        constraints in the specification (`False` is the default).
        Temporary tables may not reference normal tables in some database
        engines, so leave this `False` for a temporary table unless the
        engine is known to support such references.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result
        rows, including an ``_orderby`` field when ordering was requested.
    """
    is_unique = self.isUnique()
    table_spec = ddl.TableSpec(fields=())
    for dim in self.graph:
        addDimensionForeignKey(table_spec, dim, primaryKey=is_unique, constraint=constraints)
    for spatial_element in self.spatial:
        region_fields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{spatial_element.name}_region",
        )
        table_spec.fields.update(region_fields)
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(table_spec, primaryKey=is_unique, constraint=constraints)
        self.managers.collections.addRunForeignKey(table_spec, nullable=False, constraint=constraints)

    # Ordered queries also select an "_orderby" column (see `sql`), so the
    # table needs a matching field.
    if self._order_by_columns:
        orderby_field = ddl.FieldSpec(
            name="_orderby",
            dtype=sqlalchemy.BigInteger,
            nullable=False,
            doc="Column to use with ORDER BY",
        )
        table_spec.fields.add(orderby_field)

    return table_spec

891 

@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        try:
            # A query already known to return nothing is never executed;
            # its temporary table is simply left empty.
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                managers=self.managers,
                doomed_by=self._doomed_by,
            )
        finally:
            # Drop the temporary table even when the insert above or the
            # caller's ``with`` body raises; otherwise the table would be
            # left behind in the session.
            session.dropTemporaryTable(table)

911 

def subset(
    self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
) -> Query:
    # Docstring inherited from Query.
    subset_graph, subset_columns = self._makeSubsetQueryColumns(
        graph=graph, datasets=datasets, unique=unique
    )
    if subset_columns is None:
        # No new column set was produced, so this query is returned as-is.
        return self
    if subset_columns.isEmpty():
        return EmptyQuery(self.graph.universe, self.managers)
    query_copy = self._simpleQuery.copy()
    if unique:
        # A unique subset is marked for DISTINCT and carries no whereRegion.
        uniqueness = DirectQueryUniqueness.NEEDS_DISTINCT
        region = None
    else:
        uniqueness = DirectQueryUniqueness.NOT_UNIQUE
        region = self.whereRegion
    return DirectQuery(
        simpleQuery=query_copy,
        columns=subset_columns,
        uniqueness=uniqueness,
        graph=subset_graph,
        whereRegion=region,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )

930 

def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    # Imported inside the method rather than at module scope
    # (presumably to avoid an import cycle with ._builder -- TODO confirm).
    from ._builder import QueryBuilder

    if summary is None:
        summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
    if not summary.requested.issubset(self.graph):
        message = (
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({summary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )
        raise NotImplementedError(message)
    new_builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
    # This query's own SQL, aliased, is joined into the new builder.
    new_builder.joinTable(
        self.sql.alias(),
        dimensions=self.graph.dimensions,
        datasets=self.getDatasetColumns(),
    )
    return new_builder

948 

949 

class MaterializedQuery(Query):
    """A `Query` implementation whose result rows have been saved in a
    temporary table.

    Instances are not meant to be constructed directly; obtain them from
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy representation of the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap in every valid
        result row (so some rows present in the table may be rejected).
    datasetType : `DatasetType`
        `DatasetType` of the datasets this query returns, or `None` when
        there are no dataset results.
    isUnique : `bool`
        `True` if the table's rows are already unique, so that no
        ``SELECT DISTINCT`` is needed to guarantee unique results.
    graph : `DimensionGraph`
        Dimensions present in this table's columns.
    whereRegion : `Region` or `None`
        Spatial region that every result row's regions must overlap to be
        valid (so some rows present in the table may be rejected).
    managers : `RegistryManagers`
        Struct of `Registry` manager helper objects, passed on to the
        `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        Messages (suitable for e.g. logging or exceptions) explaining why
        the query is known to return no results before it is even executed.
        A query with a non-empty list is never executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Stored as a tuple so `spatial` can hand out a new iterator each time.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        table_columns = self._table.columns
        return table_columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        region_column_name = f"{name}_region"
        return self._table.columns[region_column_name]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        # Only the dataset ID and run key are read back from the table; no
        # ingest date column is provided.
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        statement = self._table.select()
        if "_orderby" not in self._table.columns:
            return statement
        # The table carries an "_orderby" column; sort the rows by it.
        return statement.order_by(self._table.columns["_orderby"])

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table, so the query simply yields itself.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        subset_graph, subset_columns = self._makeSubsetQueryColumns(
            graph=graph, datasets=datasets, unique=unique
        )
        if subset_columns is None:
            # No new column set was produced, so this query is returned as-is.
            return self
        if subset_columns.isEmpty():
            return EmptyQuery(self.graph.universe, managers=self.managers)
        # The subset is a new direct query that selects from this table.
        table_query = SimpleQuery()
        table_query.join(self._table)
        if unique:
            uniqueness = DirectQueryUniqueness.NEEDS_DISTINCT
            region = None
        else:
            uniqueness = DirectQueryUniqueness.NOT_UNIQUE
            region = self.whereRegion
        return DirectQuery(
            simpleQuery=table_query,
            columns=subset_columns,
            uniqueness=uniqueness,
            graph=subset_graph,
            whereRegion=region,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            message = (
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
            raise NotImplementedError(message)
        new_builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table itself is joined into the new builder.
        new_builder.joinTable(
            self._table,
            dimensions=self.graph.dimensions,
            datasets=self.getDatasetColumns(),
        )
        return new_builder

1080 

1081 

class EmptyQuery(Query):
    """A `Query` implementation that handles the special case of a query
    that would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions, from which the empty dimension set is taken.
    managers : `RegistryManagers`
        Struct holding the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        Messages (suitable for e.g. logging or exceptions) explaining why
        the query is known to return no results before it is even executed.
        A query with a non-empty list is never executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No dimension {name} in query (no dimensions at all, actually)."
        raise KeyError(message)

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        no_elements = ()
        return iter(no_elements)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No region for {name} in query (no regions at all, actually)."
        raise KeyError(message)

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query produces no rows; otherwise exactly one `None` row
        # is produced.  Neither ``db`` nor ``region`` is consulted.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # No temporary table is created; the query yields itself.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        if graph is not None:
            assert graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        if not summary.requested.issubset(self.graph):
            message = (
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
            raise NotImplementedError(message)
        # There is no table or subquery to join; the builder starts from the
        # summary alone.
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)