Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "ChainedDatasetQueryResults", 

25 "DataCoordinateQueryResults", 

26 "DatasetQueryResults", 

27 "ParentDatasetQueryResults", 

28) 

29 

30from abc import abstractmethod 

31from contextlib import contextmanager, ExitStack 

32import itertools 

33from typing import ( 

34 Any, 

35 Callable, 

36 ContextManager, 

37 Iterable, 

38 Iterator, 

39 Mapping, 

40 Optional, 

41 Sequence, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ...core import ( 

48 DataCoordinate, 

49 DataCoordinateIterable, 

50 DatasetRef, 

51 DatasetType, 

52 DimensionGraph, 

53 DimensionRecord, 

54 SimpleQuery, 

55) 

56from ..interfaces import Database 

57from ._query import Query 

58 

59 

class DataCoordinateQueryResults(DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.

    Parameters
    ----------
    db : `Database`
        Database engine used to execute queries.
    query : `Query`
        Low-level representation of the query that backs this result object.
        ``query.datasetType`` must be `None`.
    records : `Mapping`, optional
        A nested mapping containing `DimensionRecord` objects for all
        dimensions and all data IDs this query will yield.  If `None`
        (default), `hasRecords` will return `False` unless the query's
        dimension graph is empty.
        The outer mapping has `str` keys (the names of dimension elements).
        The inner mapping has `tuple` keys representing data IDs (tuple
        conversions of `DataCoordinate.values()`) and `DimensionRecord` values.

    Notes
    -----
    Constructing an instance of this does nothing; the query is not executed
    until it is iterated over (or some other operation is performed that
    involves iteration).

    Instances should generally only be constructed by `Registry` methods or the
    methods of other query result objects.
    """

    __slots__ = ("_db", "_query", "_records")

    def __init__(self, db: Database, query: Query, *,
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._records = records
        # Internal invariant (not user-input validation): dataset queries are
        # represented by `ParentDatasetQueryResults` instead.
        assert query.datasetType is None, \
            "Query used to initialize data coordinate results should not have any datasets."

    def __iter__(self) -> Iterator[DataCoordinate]:
        # Rows are fetched lazily; each one is converted to a data ID, with
        # any pre-fetched dimension records attached.
        return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

    @property
    def graph(self) -> DimensionGraph:
        # Docstring inherited from DataCoordinateIterable.
        return self._query.graph

    def hasFull(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        return True

    def hasRecords(self) -> bool:
        # Docstring inherited from DataCoordinateIterable.
        # An empty graph has no records to attach, so it trivially qualifies.
        return self._records is not None or not self._query.graph

    @contextmanager
    def materialize(self) -> Iterator[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow-up queries.
        It may also be less efficient, depending on how well the database
        engine's query optimizer can simplify those particular follow-up
        queries and how efficiently it caches query results even when they are
        not explicitly inserted into a temporary table.  See `expanded` and
        `subset` for examples.
        """
        with self._query.materialize(self._db) as materialized:
            yield DataCoordinateQueryResults(self._db, materialized, records=self._records)

    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if records are already attached.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        if self._records is not None:
            return self
        records = {}
        for element in self.graph.elements:
            # Fetch the records for each element using only the unique data
            # IDs restricted to that element's own dimensions.
            subset = self.subset(graph=element.graph, unique=True)
            records[element.name] = {
                tuple(record.dataId.values()): record
                for record in self._query.managers.dimensions[element].fetch(subset)
            }
        return DataCoordinateQueryResults(self._db, self._query, records=records)

    def subset(self, graph: Optional[DimensionGraph] = None, *,
               unique: bool = False) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new results object.  If `None`,
            ``self.graph`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs.  This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised if ``graph`` is not a subset of ``self.graph``.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.subset` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGraph(...)
            dimensions2 = DimensionGraph(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(
                        graph=dimensions1,
                        unique=True):
                    ...
                for dataId2 in tempDataIds.subset(
                        graph=dimensions2,
                        unique=True):
                    ...
        """
        if graph is None:
            graph = self.graph
        if not graph.issubset(self.graph):
            raise ValueError(f"{graph} is not a subset of {self.graph}")
        if graph == self.graph and (not unique or self._query.isUnique()):
            # Nothing would change, so avoid building a new query.
            return self
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]]
        if self._records is not None:
            # Keep only the records for elements that survive the subset.
            records = {element.name: self._records[element.name] for element in graph.elements}
        else:
            records = None
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=graph, datasets=False, unique=unique),
            records=records,
        )

    def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.ColumnElement]) -> None:
        # Docstring inherited from DataCoordinateIterable.
        sql = self._query.sql
        if sql is not None:
            # Join this query in as an aliased subquery, matching on every
            # required dimension.  When there is no SQL, no constraint is
            # added.
            fromClause = sql.alias("c")
            query.join(
                fromClause,
                onclause=sqlalchemy.sql.and_(*[
                    columns(dimension.name) == fromClause.columns[dimension.name]
                    for dimension in self.graph.required
                ])
            )

    def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *,
                     findFirst: bool = True) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in).  If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        ValueError
            Raised if ``datasetType.dimensions.issubset(self.graph) is False``.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self._query.managers.datasets[datasetType].datasetType
        # TODO: consider moving component handling down into managers.
        if not datasetType.dimensions.issubset(self.graph):
            # The requirement is "subset of", not "equal to"; the message must
            # say so to match both the check above and the documented contract.
            raise ValueError(f"findDatasets requires that the dataset type's dimensions be a subset of "
                             f"those of the DataCoordinateQueryResults used as input to the search, but "
                             f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
                             f"dimensions are {self.graph}.")
        builder = self._query.makeBuilder()
        if datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # Query for the parent and produce component refs on iteration.
            parentName, componentName = datasetType.nameAndComponent()
            storage = self._query.managers.datasets[parentName]
            datasetType = storage.datasetType
            components = [componentName]
        else:
            components = [None]
        builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
        query = builder.finish(joinMissing=False)
        # NOTE(review): ParentDatasetQueryResults asserts that the query's
        # graph equals the dataset type's dimensions; confirm the builder
        # guarantees that when the dataset type's dimensions are a strict
        # subset of self.graph.
        return ParentDatasetQueryResults(db=self._db, query=query, components=components,
                                         records=self._records)

311 

312 

class DatasetQueryResults(Iterable[DatasetRef]):
    """Abstract interface shared by all objects that represent the results of
    a query for datasets.

    Iterating over an instance yields `DatasetRef` objects.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Split these results up by parent dataset type.

        Returns
        -------
        iter : `Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances, each of which
            handles exactly one parent dataset type (the parent dataset type
            itself, one or more of its component dataset types, or both).
        """
        raise NotImplementedError

    @abstractmethod
    def materialize(self) -> ContextManager[DatasetQueryResults]:
        """Store the results of this query in a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager whose ``__enter__`` creates and populates the
            temporary table and returns a results object backed by it, and
            whose ``__exit__`` drops the table.  When ``self`` is already
            materialized the context manager may do nothing, because an outer
            context manager is then expected to handle everything else.
        """
        raise NotImplementedError

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            ``self`` if it is already expanded, otherwise a new
            `DatasetQueryResults` instance.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, calling `materialize`
        first may be more efficient when expanding the data IDs of very large
        result sets.
        """
        raise NotImplementedError

365 

366 

class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.

    Parameters
    ----------
    db : `Database`
        Database engine to execute queries against.
    query : `Query`
        Low-level query object that backs these results.  ``query.datasetType``
        will be the parent dataset type for this object, and may not be `None`.
        Its dimensions must equal ``query.graph``.
    components : `Sequence` [ `str` or `None` ]
        Names of components to include in iteration.  `None` may be included
        (at most once) to include the parent dataset type.
    records : `Mapping`, optional
        Mapping containing `DimensionRecord` objects for all dimensions and
        all data IDs this query will yield.  If `None` (default),
        `DataCoordinate.hasRecords` will return `False` for all nested data
        IDs.  This is a nested mapping: the outer keys are the `str` names of
        dimension elements, and each inner mapping goes from
        ``tuple(record.dataId.values())`` to the corresponding
        `DimensionRecord` instance ``record``.
    """

    # Only attributes that __init__ actually assigns are listed here.
    __slots__ = ("_db", "_query", "_components", "_records")

    def __init__(self, db: Database, query: Query, *,
                 components: Sequence[Optional[str]],
                 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None):
        self._db = db
        self._query = query
        self._components = components
        self._records = records
        # Internal invariants (not user-input validation).
        assert query.datasetType is not None, \
            "Query used to initialize dataset results must have a dataset."
        assert query.datasetType.dimensions == query.graph

    def __iter__(self) -> Iterator[DatasetRef]:
        for row in self._query.rows(self._db):
            parentRef = self._query.extractDatasetRef(row, records=self._records)
            # Each row yields one ref per requested component, in the order
            # given; `None` stands for the parent dataset itself.
            for component in self._components:
                if component is None:
                    yield parentRef
                else:
                    yield parentRef.makeComponentRef(component)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self

    @contextmanager
    def materialize(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with self._query.materialize(self._db) as materialized:
            yield ParentDatasetQueryResults(self._db, materialized,
                                            components=self._components,
                                            records=self._records)

    @property
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        assert self._query.datasetType is not None
        return self._query.datasetType

    @property
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        return DataCoordinateQueryResults(
            self._db,
            self._query.subset(graph=self.parentDatasetType.dimensions, datasets=False, unique=False),
            records=self._records,
        )

    def withComponents(self, components: Sequence[Optional[str]]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `Sequence` [ `str` or `None` ]
            Names of components to include in iteration.  `None` may be
            included (at most once) to include the parent dataset type.

        Returns
        -------
        results : `ParentDatasetQueryResults`
            A new results object backed by the same database, query, and
            records as ``self``, iterating over the given components.
        """
        return ParentDatasetQueryResults(self._db, self._query, records=self._records,
                                         components=components)

    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        if self._records is not None:
            return self
        # Reuse the data ID expansion logic to obtain the records mapping.
        records = self.dataIds.expanded()._records
        return ParentDatasetQueryResults(self._db, self._query, records=records,
                                         components=self._components)

466 

467 

class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that concatenates several other
    results objects, each one covering a different parent dataset type.

    Parameters
    ----------
    chain : `Sequence` [ `ParentDatasetQueryResults` ]
        The per-parent-dataset-type results objects to chain together.
    """

    __slots__ = ("_chain",)

    def __init__(self, chain: Sequence[ParentDatasetQueryResults]):
        self._chain = chain

    def __iter__(self) -> Iterator[DatasetRef]:
        # Iterate over each nested results object in turn.
        links = self._chain
        return itertools.chain.from_iterable(links)

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        links = self._chain
        return iter(links)

    @contextmanager
    def materialize(self) -> Iterator[ChainedDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        with ExitStack() as cleanup:
            # Enter every nested materialize() context on one stack so they
            # are all exited together when this context ends.
            materialized = []
            for link in self._chain:
                materialized.append(cleanup.enter_context(link.materialize()))
            yield ChainedDatasetQueryResults(materialized)

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        withRecords = [link.expanded() for link in self._chain]
        return ChainedDatasetQueryResults(withRecords)