# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query",)

from collections.abc import Iterable, Mapping, Set
from typing import Any, final

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError
from ._base import QueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import DatasetRefQueryResults
from ._dimension_record_query_results import DimensionRecordQueryResults
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree


@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `QueryTree`
        Description of the query as a tree of joins and column expressions.
        The instance returned directly by the `Butler._query` entry point
        should be constructed via `make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler._query` instead.

    A `Query` object represents the first stage of query construction, in
    which constraints and joins are defined (roughly corresponding to the
    WHERE and FROM clauses in SQL). The various "results" objects represent
    the second (and final) stage, where the columns returned are specified
    and any sorting or integer slicing can be applied. Result objects are
    obtained from the `data_ids`, `datasets`, and `dimension_records`
    methods.

    `Query` and query-result objects are always immutable (except for
    caching information fetched from the database or server), so modifier
    methods always return a new object without modifying the current one.
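
    A minimal sketch of this two-stage pattern (the dataset type and
    dimension names here are only illustrative)::

        with butler._query() as query:
            query = query.join_dataset_search("raw")
            data_ids = query.data_ids(["exposure", "detector"])
            refs = query.datasets("raw")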

    """

    def __init__(self, driver: QueryDriver, tree: QueryTree):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        super().__init__(driver, tree)

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of
        any type of result. Fields for these dataset types are also usable
        in 'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class
        # override specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery
        (such as a dataset search, materialization, or data ID upload) or
        `where` argument, as well as any required or implied dependency of
        those dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == 'u', x.band == 'y'),
                )

        As shown above, the returned object also has an `any` method to
        combine expressions with logical OR (as well as `not_` and `all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with dataset types (``dataset_id``,
        ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
        `~CollectionType.CALIBRATION` collection searches) can be obtained
        with dict-like access instead::

            with butler._query() as query:
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload
        the standard comparison operators (``==``, ``!=``, ``<``, ``>``,
        ``<=``, ``>=``) and provide `~ScalarExpressionProxy.in_range`,
        `~ScalarExpressionProxy.in_iterable`, and
        `~ScalarExpressionProxy.in_query` methods for membership tests. For
        `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should
        be reversed.

        Proxy objects for region and timespan fields have an `overlaps`
        method, and timespans also have `~TimespanProxy.begin` and
        `~TimespanProxy.end` properties to access scalar expression proxies
        for the bounds.

        All proxy objects also have a `~ExpressionProxy.is_null` property.

        Literal values can be created by calling `ExpressionFactory.literal`,
        but can almost always be created implicitly via overloaded operators
        instead.
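
        A rough sketch combining these proxies (the band values and
        dimension names are arbitrary, and the ``order_by`` call assumes the
        result object supports it)::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(x.band.in_iterable(["u", "y"]))
                data_ids = query.data_ids(["visit"]).order_by(x.visit.desc)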

        """
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `DataCoordinate` iterable.

        Parameters
        ----------
        dimensions : `DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either
            `DimensionGroup` instances or `str` names. Will be automatically
            expanded to a complete `DimensionGroup`. These dimensions do not
            need to match the query's current `dimensions`. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are
            guaranteed to identify all dimensions (`DataCoordinate.hasFull`
            returns `True`), but will not contain `DimensionRecord` objects
            (`DataCoordinate.hasRecords` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
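
        Notes
        -----
        A short illustrative loop (the dimension names are arbitrary)::

            with butler._query() as query:
                for data_id in query.data_ids(["exposure", "detector"]):
                    ...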

        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
            if not dimensions <= self._tree.dimensions:
                tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetRefQueryResults:
        """Return a result object that is a `DatasetRef` iterable.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            The dataset type to search for.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not
            provided or `None`, and the dataset has not already been joined
            into the query, the default collection search path for this
            butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef` of each `DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `.queries.DatasetRefQueryResults`
            Dataset references matching the given query criteria. Nested
            data IDs are guaranteed to include values for all implied
            dimensions (i.e. `DataCoordinate.hasFull` will return `True`),
            but will not include dimension records
            (`DataCoordinate.hasRecords` will be `False`) unless
            `~.queries.DatasetRefQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each
        dataset type separately in turn, and no information about the
        relationships between datasets of different types is included.
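
        A minimal illustrative search (the dataset type and collection names
        here are made up)::

            with butler._query() as query:
                refs = query.datasets("raw", collections=["LSSTCam/raw/all"])
                for ref in refs:
                    ...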

        """
        dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
            dataset_type, collections
        )
        dataset_search = query._tree.datasets[dataset_type_name]
        spec = DatasetRefResultSpec.model_construct(
            dataset_type_name=dataset_type_name,
            dimensions=dataset_search.dimensions,
            storage_class_name=storage_class_name,
            include_dimension_records=False,
            find_first=find_first,
        )
        return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `DimensionRecord` iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
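
        Notes
        -----
        A short illustrative loop (the element name is arbitrary)::

            with butler._query() as query:
                for record in query.dimension_records("detector"):
                    ...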

        """
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined
        back in on the materialized dimension keys. But all constraints on
        those dimension keys (including dataset existence) are applied to
        the materialized rows.
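
        One illustrative pattern (the dataset type and dimension names are
        made up) is to materialize a common constraint once and then request
        several kinds of results from it::

            with butler._query() as query:
                query = query.join_dataset_search("raw").materialize()
                data_ids = query.data_ids(["exposure"])
                refs = query.datasets("raw")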

        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(self._tree, dimensions, datasets)
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
                    "Expand the dimensions or drop this dataset type in the arguments to materialize to "
                    "avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results.
            If not present or `None`, the default collection search path
            will be used.

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the given dataset type is inconsistent with the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dataset type has not been registered and only a
            `str` dataset type name was given.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the
        same query context.
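
        For example, constraining data IDs to those with an existing dataset
        (the dataset type and collection names here are made up)::

            with butler._query() as query:
                query = query.join_dataset_search(
                    "raw", collections=["LSSTCam/raw/all"]
                )
                data_ids = query.data_ids(["exposure", "detector"])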

        """
        _, _, query = self._join_dataset_search_impl(
            dataset_type, collections, allow_storage_class_overrides=False
        )
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Iterable of `DataCoordinate`. All items must have the same
            dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
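
        Notes
        -----
        A minimal sketch, assuming ``data_ids`` is a non-empty list of
        `DataCoordinate` objects that all have the same dimensions (the
        dataset type name is illustrative)::

            with butler._query() as query:
                query = query.join_data_coordinates(data_ids)
                refs = query.datasets("raw")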

        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key),
            driver=self._driver,
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this
        method should rarely need to be called directly.
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may
            be `str` expressions to parse, `Predicate` objects (these are
            typically constructed via `expression_factory`) or data IDs.
        bind : `~collections.abc.Mapping`
            Mapping from string identifier appearing in a string expression
            to a literal value that should be substituted for it. This is
            recommended instead of embedding literals directly into the
            expression, especially for strings, timespans, or other types
            where quoting or formatting is nontrivial.
        **kwargs
            Data ID key value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query, it will be joined in, but dataset
        searches must already be joined into a query in order to reference
        their fields in expressions.

        Data ID values are not checked for consistency; they are extracted
        from ``args`` and then ``kwargs`` and combined, with later values
        overriding earlier ones.
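
        For example, combining a parsed string expression, a ``bind`` value,
        and a data ID keyword argument (the values are illustrative)::

            with butler._query() as query:
                query = query.where(
                    "visit.day_obs > day_cut",
                    bind={"day_cut": 20240701},
                    instrument="LSSTCam",
                )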

        """
        return Query(
            tree=self._tree.where(
                convert_where_args(
                    self.constraint_dimensions, self.constraint_dataset_types, *args, bind=bind, **kwargs
                )
            ),
            driver=self._driver,
        )

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        allow_storage_class_overrides: bool = True,
    ) -> tuple[str, str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name and storage class, in addition to the modified Query.
        """
        # In this method we need the dimensions of the dataset type, but we
        # might not need the storage class, since the dataset may only be
        # used as an existence constraint. It depends on whether
        # `join_dataset_search` or `datasets` is calling this method.
        dimensions: DimensionGroup | None = None
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            dimensions = dataset_type.dimensions.as_group()
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # Look up the data repository definition of the dataset type to check
        # for consistency, or get dimensions and storage class if we don't
        # have them.
        resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
        resolved_dimensions = resolved_dataset_type.dimensions.as_group()
        if dimensions is not None and dimensions != resolved_dimensions:
            raise DatasetTypeError(
                f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match "
                f"the registered dimensions {resolved_dimensions}."
            )
        if storage_class_name is not None:
            if storage_class_name != resolved_dataset_type.storageClass_name:
                if not allow_storage_class_overrides:
                    raise InvalidQueryError(
                        f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} "
                        f"differs from repository definition "
                        f"{resolved_dataset_type.storageClass_name!r}, but join_dataset_search does "
                        "not care about storage classes and cannot record this override. Pass the "
                        "override to `Query.datasets` instead."
                    )
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
                        f"compatible with repository storage class "
                        f"{resolved_dataset_type.storageClass_name}."
                    )
        else:
            storage_class_name = resolved_dataset_type.storageClass_name
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
        )
        return (
            dataset_type_name,
            storage_class_name,
            Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
        )

577 )