Coverage for python/lsst/daf/butler/queries/_query.py: 24%

125 statements  

coverage.py v7.4.4, created at 2024-04-18 09:55 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query",)

from collections.abc import Iterable, Mapping, Set
from typing import Any, final

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError
from ._base import QueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import DatasetRefQueryResults
from ._dimension_record_query_results import DimensionRecordQueryResults
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree


@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `QueryTree`, optional
        Description of the query as a tree of joins and column expressions.
        Defaults to the result of a call to `tree.make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler._query` instead.

    A `Query` object represents the first stage of query construction, in which
    constraints and joins are defined (roughly corresponding to the WHERE and
    FROM clauses in SQL). The various "results" objects represent the second
    (and final) stage, where the columns returned are specified and any sorting
    or integer slicing can be applied. Result objects are obtained from the
    `data_ids`, `datasets`, and `dimension_records` methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
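
    As an illustrative sketch only (``butler`` and the ``LSSTCam`` instrument
    below are assumptions used for the example, not part of this class)::

        with butler._query() as query:
            query = query.where(instrument="LSSTCam", detector=42)
            for data_id in query.data_ids(["exposure"]):
                ...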

    """

    def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        if tree is None:
            tree = make_identity_query_tree(driver.universe)
        super().__init__(driver, tree)

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of any
        type of result. Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class override
        # specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery (such
        as a dataset search, materialization, or data ID upload) or `where`
        argument, as well as any required or implied dependency of those
        dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == 'u', x.band == 'y'),
                )

        As shown above, the returned object also has an `any` method to
        combine expressions with logical OR (as well as `not_` and `all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with dataset types (``dataset_id``,
        ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
        `~CollectionType.CALIBRATION` collection searches) can be obtained with
        dict-like access instead::

            with butler._query() as query:
                x = query.expression_factory
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload the
        standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
        ``>=``) and provide `~ScalarExpressionProxy.in_range`,
        `~ScalarExpressionProxy.in_iterable`, and
        `~ScalarExpressionProxy.in_query` methods for membership tests. For
        `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should be
        reversed.

        Proxy objects for region and timespan fields have an `overlaps` method,
        and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
        properties to access scalar expression proxies for the bounds.

        All proxy objects also have a `~ExpressionProxy.is_null` property.

        Literal values can be created by calling `ExpressionFactory.literal`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `DataCoordinate` iterable.

        Parameters
        ----------
        dimensions : `DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either `DimensionGroup`
            instances or `str` names. Will be automatically expanded to a
            complete `DimensionGroup`. These dimensions do not need to match
            the query's current `dimensions`. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are guaranteed
            to identify all dimensions (`DataCoordinate.hasFull` returns
            `True`), but will not contain `DimensionRecord` objects
            (`DataCoordinate.hasRecords` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
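
        Notes
        -----
        A minimal sketch of typical use (``butler`` and the dimension and
        instrument names below are assumptions for illustration)::

            with butler._query() as query:
                query = query.where(instrument="LSSTCam", skymap="lsst_cells_v1")
                data_ids = query.data_ids(["tract", "patch"])
                for data_id in data_ids.with_dimension_records():
                    ...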

        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
            if not dimensions <= self._tree.dimensions:
                tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetRefQueryResults:
        """Return a result object that is a `DatasetRef` iterable.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            The dataset type to search for.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not provided
            or `None`, and the dataset has not already been joined into the
            query, the default collection search path for this butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef` of each `DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `.queries.DatasetRefQueryResults`
            Dataset references matching the given query criteria. Nested data
            IDs are guaranteed to include values for all implied dimensions
            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
            include dimension records (`DataCoordinate.hasRecords` will be
            `False`) unless
            `~.queries.DatasetRefQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included.
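
        For example, a sketch of a find-first search (the ``raw`` dataset type
        and the ``LSSTCam/raw/all`` collection are assumptions, not defaults
        of this method)::

            with butler._query() as query:
                refs = query.datasets("raw", collections=["LSSTCam/raw/all"])
                for ref in refs.with_dimension_records():
                    ...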

        """
        dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
            dataset_type, collections
        )
        dataset_search = query._tree.datasets[dataset_type_name]
        spec = DatasetRefResultSpec.model_construct(
            dataset_type_name=dataset_type_name,
            dimensions=dataset_search.dimensions,
            storage_class_name=storage_class_name,
            include_dimension_records=False,
            find_first=find_first,
        )
        return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `DimensionRecord` iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
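
        Notes
        -----
        A minimal sketch (``butler`` and the instrument name are assumptions
        used only for illustration)::

            with butler._query() as query:
                query = query.where(instrument="LSSTCam")
                for record in query.dimension_records("detector"):
                    ...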

        """
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined back
        in on the materialized dimension keys. But all constraints on those
        dimension keys (including dataset existence) are applied to the
        materialized rows.
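
        A sketch of typical use, materializing a common constraint once and
        then requesting more than one kind of result from it (``butler``, the
        dataset type, and the collection name are assumptions)::

            with butler._query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                query = query.where(instrument="LSSTCam", exposure=2024042700123)
                query = query.materialize()
                data_ids = query.data_ids(["detector"])
                refs = query.datasets("raw")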

        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(self._tree, dimensions, datasets)
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
                    "Expand the dimensions or drop this dataset type in the arguments to materialize to "
                    "avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results. If
            not present or `None`, the default collection search path will be
            used.

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the given dataset type is inconsistent with the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dataset type has not been registered and only a
            `str` dataset type name was given.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the same
        query context.
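
        For illustration only (the ``calexp`` dataset type and the collection
        name are assumptions, not defaults of this method)::

            with butler._query() as query:
                query = query.join_dataset_search("calexp", collections=["HSC/runs/RC2"])
                # Rows are now constrained to data IDs for which a 'calexp'
                # exists, and calexp fields may be used in 'where' expressions.
                data_ids = query.data_ids(["visit", "detector"])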

        """
        _, _, query = self._join_dataset_search_impl(
            dataset_type, collections, allow_storage_class_overrides=False
        )
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Iterable of `DataCoordinate`. All items must have the same
            dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
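
        Notes
        -----
        A minimal sketch (`DataCoordinate.standardize` and the names below are
        used only for illustration; any `DataCoordinate` instances with
        identical dimensions will do)::

            data_ids = [
                DataCoordinate.standardize(
                    instrument="LSSTCam", detector=d, universe=butler.dimensions
                )
                for d in (10, 11, 12)
            ]
            with butler._query() as query:
                query = query.join_data_coordinates(data_ids)
                records = query.dimension_records("detector")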

        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this method
        should rarely need to be called directly.
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may be
            `str` expressions to parse, `Predicate` objects (these are
            typically constructed via `expression_factory`) or data IDs.
        bind : `~collections.abc.Mapping`
            Mapping from string identifiers appearing in a string expression to
            the literal values that should be substituted for them. This is
            recommended instead of embedding literals directly into the
            expression, especially for strings, timespans, or other types where
            quoting or formatting is nontrivial.
        **kwargs
            Data ID key-value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query, it will be joined in, but dataset
        searches must already be joined into a query in order to reference
        their fields in expressions.

        Data ID values are not checked for consistency; they are extracted from
        ``args`` and then ``kwargs`` and combined, with later values overriding
        earlier ones.
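
        For example (a sketch; the dimension values and the ``day_obs_cut``
        bind identifier are illustrative assumptions)::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    "instrument = 'LSSTCam' AND visit.day_obs > day_obs_cut",
                    x.band.in_iterable(["g", "r", "i"]),
                    bind={"day_obs_cut": 20240701},
                    skymap="lsst_cells_v1",
                )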

        """
        return Query(
            tree=self._tree.where(
                convert_where_args(
                    self.constraint_dimensions,
                    self.constraint_dataset_types,
                    *args,
                    bind=bind,
                    **kwargs,
                )
            ),
            driver=self._driver,
        )

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        allow_storage_class_overrides: bool = True,
    ) -> tuple[str, str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name and storage class, in addition to the modified Query.
        """
        # In this method we need the dimensions of the dataset type, but we
        # might not need the storage class, since the dataset may only be used
        # as an existence constraint. It depends on whether
        # `join_dataset_search` or `datasets` is calling this method.
        dimensions: DimensionGroup | None = None
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            dimensions = dataset_type.dimensions.as_group()
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # Look up the data repository definition of the dataset type to check
        # for consistency, or get dimensions and storage class if we don't have
        # them.
        resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
        resolved_dimensions = resolved_dataset_type.dimensions.as_group()
        if dimensions is not None and dimensions != resolved_dimensions:
            raise DatasetTypeError(
                f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
                f"registered dimensions {resolved_dimensions}."
            )
        if storage_class_name is not None:
            if storage_class_name != resolved_dataset_type.storageClass_name:
                if not allow_storage_class_overrides:
                    raise InvalidQueryError(
                        f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
                        f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
                        "join_dataset_search does not care about storage classes and cannot record this "
                        "override. Pass the override to `Query.datasets` instead."
                    )
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
                        f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
                    )
        else:
            storage_class_name = resolved_dataset_type.storageClass_name
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
        )
        return (
            dataset_type_name,
            storage_class_name,
            Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
        )