Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "ChainedDatasetQueryResults", 

25 "DataCoordinateQueryResults", 

26 "DatasetQueryResults", 

27 "ParentDatasetQueryResults", 

28) 

29 

30from abc import abstractmethod 

31from contextlib import contextmanager, ExitStack 

32import itertools 

33from typing import ( 

34 Any, 

35 Callable, 

36 ContextManager, 

37 Iterable, 

38 Iterator, 

39 Mapping, 

40 Optional, 

41 Sequence, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ...core import ( 

48 DataCoordinate, 

49 DataCoordinateIterable, 

50 DatasetRef, 

51 DatasetType, 

52 DimensionGraph, 

53 DimensionRecord, 

54 SimpleQuery, 

55) 

56from ..interfaces import Database 

57from ._query import Query 

58 

59 

class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
        Must not have a dataset type (``query.datasetType`` must be `None`).
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `hasRecords` will return `False` unless the query has no
        dimensions at all.  The outer mapping has `str` keys (the names of
        dimension elements).  The inner mapping has `tuple` keys representing
        data IDs (tuple conversions of `DataCoordinate.values()`) and
        `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """
    def __init__(self, db: Database, query: Query, *,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._records = records
        # Internal invariant rather than user-input validation: dataset
        # queries are represented by `ParentDatasetQueryResults` instead.
        assert query.datasetType is None, \
            "Query used to initialize data coordinate results should not have any datasets."

    __slots__ = ("_db", "_query", "_records")

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Lazy: the query is only executed when the generator is consumed.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # An empty graph has no dimension elements, so there are no records
        # that could be missing even when none were provided.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up queries.
        It may also be less efficient, depending on how well the database
        engine's query optimizer can simplify those particular follow-up
        queries and how efficiently it caches query results even when they are
        not explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if records were already attached.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is None:
            records = {}
            for element in self.graph.elements:
                # One follow-up fetch per dimension element, restricted to the
                # unique data IDs of that element present in this query.
                subset = self.subset(graph=element.graph, unique=True)
                records[element.name] = {
                    tuple(record.dataId.values()): record
                    for record in self._query.managers.dimensions[element].fetch(subset)
                }
            return DataCoordinateQueryResults(self._db, self._query, records=records)
        else:
            return self

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised if ``graph`` is not a subset of ``self.graph``.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        # Nothing to do if the dimensions are unchanged and uniqueness is
        # either not requested or already guaranteed by the query.
        if graph == self.graph and (not unique or self._query.isUnique()):
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Carry over only the records for elements in the smaller graph.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        # When the backing query has no SQL representation there is nothing
        # to join against, so ``query`` is left unmodified.
        if sql is not None:
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        RuntimeError
            Raised if the dataset type cannot be joined into the query (i.e.
            no such datasets can exist in the given collections).
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # NOTE(review): the comment originally here was a truncated fragment
        # ("moving component handling down into managers"); presumably
        # component handling is meant to migrate into the managers - confirm.
        if not datasetType.dimensions.issubset(self.graph):
            raise ValueError(f"findDatasets requires that the dataset type's dimensions be a subset of "
                             f"those of the DataCoordinateQueryResults used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        builder = self._query.makeBuilder()
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # Query for the parent and produce component refs on iteration.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        if not builder.joinDataset(datasetType, collections=collections, findFirst=findFirst):
            raise RuntimeError(
                f"Error finding datasets of type {datasetType.name} in collections {collections}; "
                "it is impossible for any such datasets to be found in any of those collections, "
                "most likely because the dataset type is not registered.  "
                "This error may become a successful query that returns no results in the future, "
                "because queries with no results are not usually considered an error."
            )
        query = builder.finish(joinMissing=False)
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records)

318 

319 

class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.

    Iterating over an instance yields `DatasetRef` objects.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `ParentDatasetQueryResults` instances that are
            each responsible for a single parent dataset type (either just
            that dataset type, one or more of its component dataset types, or
            both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

372 

373 

class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
        Its dimensions must equal ``query.graph``.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping: the outer keys are the `str` names of
        dimension elements, and each outer value is an inner mapping from
        ``tuple(record.dataId.values())`` to the corresponding
        `DimensionRecord` instance ``record``.
    """
    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        # Internal invariants rather than user-input validation.
        assert query.datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert query.datasetType.dimensions == query.graph

    # Only attributes actually assigned in ``__init__`` are declared; the
    # previously listed "_dimensions" slot was never set or read.
    __slots__ = ("_db", "_query", "_components", "_records")

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            # Each result row fans out into one ref per requested component;
            # `None` stands for the parent dataset itself.
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        assert self._query.datasetType is not None
        return self._query.datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object backed by the same query and records as
            ``self``, iterating over the given components.
        """
        return ParentDatasetQueryResults(self._db, self._query, records=self._records,
                                         components=components)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is None:
            # Reuse the data ID expansion logic to fetch the records, then
            # attach them to a new results object for the same query.
            records = self.dataIds.expanded()._records
            return ParentDatasetQueryResults(self._db, self._query, records=records,
                                             components=self._components)
        else:
            return self

473 

474 

class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that concatenates several
    other results objects, each responsible for a different parent dataset
    type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The per-parent-dataset-type results objects to chain together, in
        iteration order.
    """

    __slots__ = ("_chain",)

    def __init__(self, chain: Sequence[ParentDatasetQueryResults]):
        self._chain = chain

    def __iter__(self) -> Iterator[DatasetRef]:
        # Lazily walk each nested results object in turn.
        flattened = itertools.chain.from_iterable(self._chain)
        return flattened

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        links = iter(self._chain)
        return links

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as stack:
            # Enter every nested context on a single stack so that all of
            # them are exited together when this context manager exits.
            materializedLinks = []
            for link in self._chain:
                materializedLinks.append(stack.enter_context(link.materialize()))
            yield ChainedDatasetQueryResults(materializedLinks)

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        expandedLinks = [link.expanded() for link in self._chain]
        return ChainedDatasetQueryResults(expandedLinks)