Coverage for python/lsst/daf/butler/queries/_query.py: 24%

126 statements

coverage.py v7.5.0, created at 2024-04-30 02:53 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query",)

from collections.abc import Iterable, Mapping, Set
from typing import Any, final

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._exceptions import InvalidQueryError
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError
from ._base import QueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import DatasetRefQueryResults
from ._dimension_record_query_results import DimensionRecordQueryResults
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
from .tree import DatasetSearch, Predicate, QueryTree, make_identity_query_tree


@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `QueryTree`, optional
        Description of the query as a tree of joins and column expressions.
        Defaults to the result of a call to `tree.make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler._query` instead.

    A `Query` object represents the first stage of query construction, in which
    constraints and joins are defined (roughly corresponding to the WHERE and
    FROM clauses in SQL). The various "results" objects represent the second
    (and final) stage, where the columns returned are specified and any sorting
    or integer slicing can be applied. Result objects are obtained from the
    `data_ids`, `datasets`, and `dimension_records` methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
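
    For example, a query might be constrained with `where` and then iterated
    over as data IDs (the instrument and data ID values below are purely
    illustrative)::

        with butler._query() as query:
            query = query.where(instrument="LSSTCam", detector=101)
            for data_id in query.data_ids(["exposure", "detector"]):
                ...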

    """

    def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        if tree is None:
            tree = make_identity_query_tree(driver.universe)
        super().__init__(driver, tree)

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of any
        type of result. Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class override
        # specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery (such
        as a dataset search, materialization, or data ID upload) or `where`
        argument, as well as any required or implied dependency of those
        dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == 'u', x.band == 'y'),
                )

        As shown above, the returned object also has an `any` method to
        combine expressions with logical OR (as well as `not_` and `all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with dataset types (``dataset_id``,
        ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
        `~CollectionType.CALIBRATION` collection searches) can be obtained with
        dict-like access instead::

            with butler._query() as query:
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload the
        standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
        ``>=``) and provide `~ScalarExpressionProxy.in_range`,
        `~ScalarExpressionProxy.in_iterable`, and
        `~ScalarExpressionProxy.in_query` methods for membership tests. For
        `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should be
        reversed.

        Proxy objects for region and timespan fields have an `overlaps` method,
        and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
        properties to access scalar expression proxies for the bounds.

        All proxy objects also have a `~ExpressionProxy.is_null` property.

        Literal values can be created by calling `ExpressionFactory.literal`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `DataCoordinate` iterable.

        Parameters
        ----------
        dimensions : `DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either `DimensionGroup`
            instances or `str` names. Will be automatically expanded to a
            complete `DimensionGroup`. These dimensions do not need to match
            the query's current `dimensions`. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are guaranteed
            to identify all dimensions (`DataCoordinate.hasFull` returns
            `True`), but will not contain `DimensionRecord` objects
            (`DataCoordinate.hasRecords` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
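
        A minimal sketch of typical use (the dimension names and data ID
        value below are illustrative)::

            with butler._query() as query:
                results = query.where(instrument="LSSTCam").data_ids(
                    ["exposure", "detector"]
                )
                for data_id in results.with_dimension_records():
                    ...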

        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        if not dimensions <= self._tree.dimensions:
            tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetRefQueryResults:
        """Return a result object that is a `DatasetRef` iterable.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            The dataset type to search for.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not provided
            or `None`, and the dataset has not already been joined into the
            query, the default collection search path for this butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef` of each `DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `.queries.DatasetRefQueryResults`
            Dataset references matching the given query criteria. Nested data
            IDs are guaranteed to include values for all implied dimensions
            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
            include dimension records (`DataCoordinate.hasRecords` will be
            `False`) unless
            `~.queries.DatasetRefQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.

        Notes
        -----
        When multiple dataset types are queried (via separate calls to this
        method), the results are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included.
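
        For example, a find-first search over an explicit collection path
        might look like this (the dataset type and collection names are
        illustrative)::

            with butler._query() as query:
                refs = query.datasets(
                    "calexp", collections=["HSC/runs/RC2", "HSC/raw/all"]
                )
                for ref in refs.with_dimension_records():
                    ...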

        """
        dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
            dataset_type, collections
        )
        dataset_search = query._tree.datasets[dataset_type_name]
        spec = DatasetRefResultSpec.model_construct(
            dataset_type_name=dataset_type_name,
            dimensions=dataset_search.dimensions,
            storage_class_name=storage_class_name,
            include_dimension_records=False,
            find_first=find_first,
        )
        return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `DimensionRecord` iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
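
        A minimal sketch of typical use (the element name and constraint
        below are illustrative)::

            with butler._query() as query:
                records = query.where(instrument="LSSTCam").dimension_records(
                    "detector"
                )
                for record in records:
                    ...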

        """
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined back
        in on the materialized dimension keys. But all constraints on those
        dimension keys (including dataset existence) are applied to the
        materialized rows.
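
        A sketch of how this is typically combined with the result methods
        (the dataset type name and constraint below are illustrative)::

            with butler._query() as query:
                query = query.join_dataset_search("raw").where(instrument="LSSTCam")
                query = query.materialize()
                data_ids = query.data_ids(["exposure", "detector"])
                refs = query.datasets("raw")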

        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(self._tree, dimensions, datasets)
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
                    "Expand the dimensions or drop this dataset type in the arguments to materialize to "
                    "avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results. If
            not present or `None`, the default collection search path will be
            used.

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the given dataset type is inconsistent with the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dataset type has not been registered and only a
            `str` dataset type name was given.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the same
        query context.
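
        A sketch of using a dataset existence search purely as a constraint
        on data IDs (the dataset type and collection names are illustrative)::

            with butler._query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                for data_id in query.data_ids(["exposure", "detector"]):
                    ...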

        """
        _, _, query = self._join_dataset_search_impl(
            dataset_type, collections, allow_storage_class_overrides=False
        )
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Iterable of `DataCoordinate`. All items must have the same
            dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
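
        A minimal sketch, joining data IDs obtained from an earlier query
        (the dimension and dataset type names are illustrative)::

            with butler._query() as query:
                upload = list(query.where(instrument="LSSTCam").data_ids(["exposure"]))

            with butler._query() as query:
                query = query.join_data_coordinates(upload)
                refs = query.datasets("raw")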

        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this method
        should rarely need to be called directly.
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may be
            `str` expressions to parse, `Predicate` objects (these are
            typically constructed via `expression_factory`) or data IDs.
        bind : `~collections.abc.Mapping`
            Mapping from string identifiers appearing in a string expression
            to the literal values that should be substituted for them. This is
            recommended instead of embedding literals directly into the
            expression, especially for strings, timespans, or other types where
            quoting or formatting is nontrivial.
        **kwargs
            Data ID key-value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query, it will be joined in, but dataset
        searches must already be joined into a query in order to reference
        their fields in expressions.

        Data ID values are not checked for consistency; they are extracted from
        ``args`` and then ``kwargs`` and combined, with later values overriding
        earlier ones.
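
        A sketch combining a parsed string expression with ``bind`` and a data
        ID keyword argument (the expression and values below are
        illustrative)::

            with butler._query() as query:
                query = query.where(
                    "visit.day_obs > min_day_obs",
                    bind={"min_day_obs": 20240701},
                    instrument="LSSTCam",
                )
                for data_id in query.data_ids(["visit"]):
                    ...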

        """
        return Query(
            tree=self._tree.where(
                convert_where_args(
                    self.constraint_dimensions,
                    self.constraint_dataset_types,
                    *args,
                    bind=bind,
                    **kwargs,
                )
            ),
            driver=self._driver,
        )

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        allow_storage_class_overrides: bool = True,
    ) -> tuple[str, str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name and storage class, in addition to the modified Query.
        """
        # In this method we need the dimensions of the dataset type, but we
        # might not need the storage class, since the dataset may only be used
        # as an existence constraint. It depends on whether
        # `join_dataset_search` or `datasets` is calling this method.
        dimensions: DimensionGroup | None = None
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            dimensions = dataset_type.dimensions.as_group()
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # Look up the data repository definition of the dataset type to check
        # for consistency, or get dimensions and storage class if we don't have
        # them.
        resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
        resolved_dimensions = resolved_dataset_type.dimensions.as_group()
        if dimensions is not None and dimensions != resolved_dimensions:
            raise DatasetTypeError(
                f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
                f"registered dimensions {resolved_dimensions}."
            )
        if storage_class_name is not None:
            if storage_class_name != resolved_dataset_type.storageClass_name:
                if not allow_storage_class_overrides:
                    raise InvalidQueryError(
                        f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
                        f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
                        "join_dataset_search does not care about storage classes and cannot record this "
                        "override. Pass the override to `Query.datasets` instead."
                    )
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
                        f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
                    )
        else:
            storage_class_name = resolved_dataset_type.storageClass_name
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
        )
        return (
            dataset_type_name,
            storage_class_name,
            Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
        )