Coverage for python/lsst/daf/butler/queries/_query.py: 16% (211 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query", "QueryFactoryFunction")

from collections.abc import Callable, Iterable, Mapping, Set
from contextlib import AbstractContextManager
from types import EllipsisType
from typing import Any, TypeAlias, final

import astropy.table

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._exceptions import DimensionNameError, InvalidQueryError
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError
from ._base import QueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import DatasetRefQueryResults
from ._dimension_record_query_results import DimensionRecordQueryResults
from ._general_query_results import GeneralQueryResults
from ._identifiers import IdentifierContext, interpret_identifier
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .predicate_constraints_summary import PredicateConstraintsSummary
from .result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
)
from .tree import (
    ANY_DATASET,
    DatasetFieldName,
    DatasetFieldReference,
    DatasetSearch,
    DimensionFieldReference,
    DimensionKeyReference,
    Predicate,
    QueryTree,
    make_identity_query_tree,
)


@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `~.queries.driver.QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `~.queries.tree.QueryTree`, optional
        Description of the query as a tree of joins and column expressions.
        Defaults to the result of a call to
        `~.queries.tree.make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler.query <lsst.daf.butler.Butler.query>` instead.

    A `Query` object represents the first stage of query construction, in
    which constraints and joins are defined (roughly corresponding to the
    WHERE and FROM clauses in SQL). The various "results" objects represent
    the second (and final) stage, where the columns returned are specified
    and any sorting or integer slicing can be applied. Result objects are
    obtained from the `data_ids`, `datasets`, and `dimension_records`
    methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
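
    For example (an illustrative sketch; ``butler`` is assumed to be a
    `Butler` instance, and the dimension values are hypothetical)::

        with butler.query() as query:
            # First stage: constrain the rows.
            query = query.where(instrument="LSSTCam", skymap="lsst_cells_v1")
            # Second stage: choose the result columns and iterate.
            for data_id in query.data_ids(["tract", "patch"]):
                print(data_id)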

    """

    def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        if tree is None:
            tree = make_identity_query_tree(driver.universe)
        super().__init__(driver, tree)

        # If ``_allow_duplicate_overlaps`` is set to `True`, the query is
        # allowed to generate non-distinct rows for spatial overlaps. This is
        # not part of the public API for now; it is used by the graph builder
        # as an optimization.
        self._allow_duplicate_overlaps: bool = False

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of
        any type of result. Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class
        # override specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery
        (such as a dataset search, materialization, or data ID upload) or
        `where` argument, as well as any required or implied dependency of
        those dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators
        (`~lsst.daf.butler.queries.expression_factory.ExpressionFactory`).

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler.query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == "u", x.band == "y"),
                )

        As shown above, the returned object also has an
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.any`
        method to combine expressions with logical OR (as well as
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.not_`
        and
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with individual datasets but not
        dimension records (``dataset_id``, ``ingest_date``, ``run``,
        ``collection``, as well as ``timespan`` for
        `~lsst.daf.butler.CollectionType.CALIBRATION` collection searches)
        can be obtained with dict-like access instead::

            with butler.query() as query:
                x = query.expression_factory
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload
        the standard comparison operators (``==``, ``!=``, ``<``, ``>``,
        ``<=``, ``>=``) and provide
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_range`,
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_iterable`,
        and
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_query`
        methods for membership tests. In ``order_by`` contexts, they also
        have a
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should
        be reversed.

        Proxy objects for
        `region <lsst.daf.butler.queries.expression_factory.RegionProxy>` and
        `timespan <lsst.daf.butler.queries.expression_factory.TimespanProxy>`
        fields have an ``overlaps`` method, and timespans also have
        `~lsst.daf.butler.queries.expression_factory.TimespanProxy.begin` and
        `~lsst.daf.butler.queries.expression_factory.TimespanProxy.end`
        properties to access scalar expression proxies for their bounds.

        All proxy objects also have an
        `~lsst.daf.butler.queries.expression_factory.ExpressionProxy.is_null`
        property.

        Literal values can be created by calling
        `ExpressionFactory.literal <lsst.daf.butler.queries.expression_factory.ExpressionFactory.literal>`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """  # noqa: W505, long docstrings
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `~lsst.daf.butler.DataCoordinate`
        iterable.

        Parameters
        ----------
        dimensions : `~lsst.daf.butler.DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either
            `~lsst.daf.butler.DimensionGroup` instances or `str` names. Will
            be automatically expanded to a complete
            `~lsst.daf.butler.DimensionGroup`. These dimensions do not need
            to match the query's current dimensions. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `~lsst.daf.butler.queries.DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are
            guaranteed to identify all dimensions (``DataCoordinate.hasFull``
            returns `True`), but will not contain
            `~lsst.daf.butler.DimensionRecord` objects
            (``DataCoordinate.hasRecords`` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
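
        Notes
        -----
        A minimal usage sketch (``butler`` and the dimension and instrument
        names here are illustrative)::

            with butler.query() as query:
                query = query.where(instrument="LSSTCam")
                for data_id in query.data_ids(["visit", "detector"]):
                    print(data_id["visit"], data_id["detector"])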

        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        if not dimensions <= self._tree.dimensions:
            tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(
            dimensions=dimensions,
            include_dimension_records=False,
            allow_duplicate_overlaps=self._allow_duplicate_overlaps,
        )
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetRefQueryResults:
        """Return a result object that is a `~lsst.daf.butler.DatasetRef`
        iterable.

        Parameters
        ----------
        dataset_type : `str` or `~lsst.daf.butler.DatasetType`
            The dataset type to search for.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not
            provided or `None`, and the dataset has not already been joined
            into the query, the default collection search path for this
            butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `~lsst.daf.butler.DatasetRef` of each
            `~lsst.daf.butler.DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `lsst.daf.butler.queries.DatasetRefQueryResults`
            Dataset references matching the given query criteria. Nested
            data IDs are guaranteed to include values for all implied
            dimensions (i.e. ``DataCoordinate.hasFull`` will return `True`),
            but will not include dimension records
            (``DataCoordinate.hasRecords`` will be `False`) unless
            `~.queries.DatasetRefQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.
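
        Notes
        -----
        A minimal usage sketch (the dataset type and collection names are
        illustrative)::

            with butler.query() as query:
                refs = query.datasets("raw", collections=["LSSTCam/raw/all"])
                for ref in refs:
                    print(ref.dataId, ref.run)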

        """
        dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
            dataset_type, collections
        )
        dataset_search = query._tree.datasets[dataset_type_name]
        spec = DatasetRefResultSpec.model_construct(
            dataset_type_name=dataset_type_name,
            dimensions=dataset_search.dimensions,
            storage_class_name=storage_class_name,
            include_dimension_records=False,
            find_first=find_first,
            allow_duplicate_overlaps=self._allow_duplicate_overlaps,
        )
        return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `~lsst.daf.butler.DimensionRecord`
        iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `lsst.daf.butler.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
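
        Notes
        -----
        A minimal usage sketch (the element name and record fields shown are
        illustrative; the available fields depend on the dimension
        universe)::

            with butler.query() as query:
                for record in query.dimension_records("detector"):
                    print(record.id, record.full_name)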

        """
        if element not in self._driver.universe:
            # Prefer an explicit exception over a KeyError below.
            raise DimensionNameError(
                f"No such dimension '{element}', available dimensions: "
                + str(self._driver.universe.elements)
            )
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(
            element=self._driver.universe[element],
            allow_duplicate_overlaps=self._allow_duplicate_overlaps,
        )
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def general(
        self,
        dimensions: DimensionGroup | Iterable[str],
        *names: str,
        dimension_fields: Mapping[str, Set[str]] | None = None,
        dataset_fields: Mapping[str, Set[DatasetFieldName] | EllipsisType] | None = None,
        find_first: bool | None = None,
    ) -> GeneralQueryResults:
        """Execute the query, returning a general result.

        **This is an experimental interface and may change at any time.**

        Parameters
        ----------
        dimensions : `~lsst.daf.butler.DimensionGroup` or \
                `~collections.abc.Iterable` [ `str` ]
            The dimensions that span all fields returned by this query.
        *names : `str`
            Names of dimension fields (in ``dimension.field`` format) or
            dataset fields (in ``dataset_type.field`` format) to include in
            this query.
        dimension_fields : `~collections.abc.Mapping` [`str`, \
                `~collections.abc.Set` [`str`]], optional
            Dimension record fields included in this query, keyed by
            dimension element name.
        dataset_fields : `~collections.abc.Mapping` [`str`, \
                `~collections.abc.Set` | ``...`` ], optional
            Dataset fields included in this query, keyed by dataset type
            name. Ellipsis (``...``) can be used as a value to include all
            dataset fields needed to extract `~lsst.daf.butler.DatasetRef`
            instances later.
        find_first : `bool`, optional
            Whether this query requires find-first resolution for a dataset.
            This is ignored and can be omitted if the query has no dataset
            fields. It must be explicitly set to `False` if there are
            multiple dataset types with fields, or if any dataset's
            ``collections`` or ``timespan`` fields are included in the
            results.

        Returns
        -------
        result : `~lsst.daf.butler.queries.GeneralQueryResults`
            Query result that can be iterated over.

        Notes
        -----
        The dimensions of the returned query are automatically expanded to
        include those associated with all dimension and dataset fields; the
        ``dimensions`` argument is just the minimal set of dimensions to
        return.
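
        A minimal usage sketch (the dimension, field, dataset type, and
        collection names are illustrative)::

            with butler.query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                rows = query.general(
                    ["exposure"],
                    "exposure.day_obs",
                    dataset_fields={"raw": ...},
                    find_first=False,
                )
                for row in rows:
                    print(row)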

        """
        if dimension_fields is None:
            dimension_fields = {}
        if dataset_fields is None:
            dataset_fields = {}
        # Process the fields from the mapping args, handling the special
        # `...` wildcard and dropping keys with empty values.
        dataset_fields_dict: dict[str, set[DatasetFieldName]] = {}
        for dataset_type_name, fields_for_dataset_type in dataset_fields.items():
            if fields_for_dataset_type is ...:
                new_fields_for_dataset_type: set[DatasetFieldName] = {
                    "run",
                    "dataset_id",
                }  # all we need for DatasetRefs.
            else:
                new_fields_for_dataset_type = set(fields_for_dataset_type)
            if new_fields_for_dataset_type:
                dataset_fields_dict[dataset_type_name] = new_fields_for_dataset_type
        dimension_fields_dict = {
            element_name: new_fields_for_element
            for element_name, fields_for_element in dimension_fields.items()
            if (new_fields_for_element := set(fields_for_element))
        }
        # Parse all names passed as positional arguments, and start to
        # accumulate additional dimension names we'll need in the results.
        dimensions = self._driver.universe.conform(dimensions)
        context = IdentifierContext(dimensions, set(self._tree.datasets))
        extra_dimension_names: set[str] = set()
        for name in names:
            identifier = interpret_identifier(context, name)
            match identifier:
                case DimensionKeyReference(dimension=dimension):
                    # Could be because someone asked for the key field.
                    extra_dimension_names.add(dimension.name)
                case DimensionFieldReference(element=element, field=field):
                    dimension_fields_dict.setdefault(element.name, set()).add(field)
                case DatasetFieldReference(dataset_type=dataset_type, field=dataset_field):
                    if dataset_type is ANY_DATASET:
                        raise InvalidQueryError(
                            "Dataset wildcard fields are not supported by Query.general."
                        )
                    dataset_fields_dict.setdefault(dataset_type, set()).add(dataset_field)
                case _:
                    raise TypeError(f"Unexpected type of identifier ({name}): {identifier}")
        # Add more dimension names from the field mappings (including those
        # we just populated from args). Also check that the dataset fields
        # are consistent with find_first.
        for element_name in dimension_fields_dict:
            extra_dimension_names.update(self._driver.universe[element_name].minimal_group.names)
        for dataset_type_name, fields_for_dataset_type in dataset_fields_dict.items():
            if "collections" in fields_for_dataset_type and find_first is not False:
                raise InvalidQueryError(
                    f"find_first=False must be passed explicitly if {dataset_type_name}.collections "
                    "is included in query results."
                )
            if "timespan" in fields_for_dataset_type and find_first is not False:
                raise InvalidQueryError(
                    f"find_first=False must be passed explicitly if {dataset_type_name}.timespan "
                    "is included in query results."
                )
            try:
                dataset_search = self._tree.datasets[dataset_type_name]
            except KeyError:
                raise InvalidQueryError(
                    f"A search for dataset type {dataset_type_name!r} must be explicitly joined "
                    "into the query before including its fields in query results."
                ) from None
            extra_dimension_names.update(dataset_search.dimensions.names)
        if find_first is None:
            if dataset_fields_dict:
                raise InvalidQueryError(
                    "find_first must be passed if dataset fields are included in query results."
                )
            else:
                find_first = False
        if find_first and len(dataset_fields_dict) != 1:
            raise InvalidQueryError(
                "find_first=True is not valid unless exactly one dataset type's fields are requested."
            )
        # Combine the extra dimensions with the original ones.
        dimensions = self._driver.universe.conform(dimensions.names | extra_dimension_names)
        # Merge missing dimensions into the tree.
        tree = self._tree
        if not dimensions <= tree.dimensions:
            tree = tree.join_dimensions(dimensions)
        result_spec = GeneralResultSpec(
            dimensions=dimensions,
            dimension_fields=dimension_fields_dict,
            dataset_fields=dataset_fields_dict,
            find_first=find_first,
            allow_duplicate_overlaps=self._allow_duplicate_overlaps,
        )
        return GeneralQueryResults(self._driver, tree=tree, spec=result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `~lsst.daf.butler.DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined
        back in on the materialized dimension keys. But all constraints on
        those dimension keys (including dataset existence) are applied to the
        materialized rows.
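
        A minimal usage sketch (the constraint is illustrative); the
        materialized query can serve multiple result requests without
        re-running the original joins::

            with butler.query() as query:
                query = query.where(instrument="LSSTCam").materialize()
                data_ids = query.data_ids(["visit"])
                records = query.dimension_records("visit")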

        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present "
                    "in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(
            self._tree, dimensions, datasets, allow_duplicate_overlaps=self._allow_duplicate_overlaps
        )
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset "
                    f"{dataset_type_name!r}. Expand the dimensions or drop this dataset type in "
                    "the arguments to materialize to avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, dataset_search)
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `~lsst.daf.butler.DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results.
            If not present or `None`, the default collection search path
            will be used.

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the given dataset type is inconsistent with the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dataset type has not been registered and only a
            `str` dataset type name was given.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the
        same query context.
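
        A minimal usage sketch (the dataset type and collection names are
        illustrative); joining a dataset search restricts the query to data
        IDs for which such a dataset exists::

            with butler.query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                data_ids = query.data_ids(["exposure", "detector"])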

        """
        _, _, query = self._join_dataset_search_impl(
            dataset_type, collections, allow_storage_class_overrides=False
        )
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit iterable of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` \
                [`~lsst.daf.butler.DataCoordinate`]
            Iterable of `~lsst.daf.butler.DataCoordinate`. All items must
            have the same dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
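
        Notes
        -----
        A minimal usage sketch (the dimension names and values are
        illustrative; `DataCoordinate.standardize
        <lsst.daf.butler.DataCoordinate.standardize>` is one way to build
        the data IDs)::

            from lsst.daf.butler import DataCoordinate

            with butler.query() as query:
                data_ids = [
                    DataCoordinate.standardize(
                        instrument="LSSTCam", exposure=e, universe=butler.dimensions
                    )
                    for e in (100, 101)
                ]
                query = query.join_data_coordinates(data_ids)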

        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key),
            driver=self._driver,
        )

    def join_data_coordinate_table(self, table: astropy.table.Table) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        table : `astropy.table.Table`
            A table of data IDs to join. Columns must be dimension names,
            and columns for dimensions whose values are implied by others
            are ignored. If a column for a required dimension is missing but
            that dimension is fully constrained to a literal by a previous
            `where` call, a constant-valued column will be added.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
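
        Notes
        -----
        A minimal usage sketch (the dimension names and values are
        illustrative)::

            import astropy.table

            with butler.query() as query:
                table = astropy.table.Table(
                    {"instrument": ["LSSTCam", "LSSTCam"], "exposure": [100, 101]}
                )
                query = query.join_data_coordinate_table(table)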

        """
        if not len(table):
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        column_names = set(table.colnames)
        dimensions = self._driver.universe.conform(column_names)
        # To avoid numpy scalar types that will upset SQLAlchemy, we turn the
        # columns we care about into lists of regular Python scalars. We do
        # this in dimensions.required order so we can zip the values of this
        # dict later to make data ID 'required_values' tuples.
        column_lists = {
            d: table[d].data.tolist() if d in column_names else None for d in dimensions.required
        }
        if not column_names.issuperset(dimensions.required):
            # If columns are missing, see if they're fixed by a previous
            # `where` call or equivalent.
            predicate_summary = PredicateConstraintsSummary(self._tree.predicate)
            missing = dimensions.required - column_names
            provided_by_predicate = predicate_summary.constraint_data_id.keys() & missing
            missing -= provided_by_predicate
            if missing:
                raise InvalidQueryError(
                    f"Data coordinate table is missing required dimension(s) {missing}."
                )
            for k in provided_by_predicate:
                column_lists[k] = [predicate_summary.constraint_data_id[k]] * len(table)
        key = self._driver.upload_data_coordinates(dimensions, zip(*column_lists.values(), strict=True))
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key),
            driver=self._driver,
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `~lsst.daf.butler.DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this
        method should rarely need to be called directly.
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may
            be `str` expressions to parse,
            `~lsst.daf.butler.queries.tree.Predicate` objects (these are
            typically constructed via `expression_factory`), or data IDs.
        bind : `~collections.abc.Mapping`, optional
            Mapping from a string identifier appearing in a string
            expression to a literal value that should be substituted for it.
            This is recommended instead of embedding literals directly into
            the expression, especially for strings, timespans, or other
            types where quoting or formatting is nontrivial.
        **kwargs
            Data ID key-value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        Expressions referring to dimensions or dimension elements are
        resolved automatically. References to dataset fields (see
        `expression_factory` for the distinction) cannot be resolved by
        default; they must either be preceded by a call to
        `join_dataset_search` or must be passed to
        `DatasetRefQueryResults.where <lsst.daf.butler.queries.DatasetRefQueryResults.where>`
        instead.

        Data ID values are not checked for consistency; they are extracted
        from ``args`` and then ``kwargs`` and combined, with later values
        overriding earlier ones.
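
        A minimal usage sketch; string expressions, ``bind`` substitutions,
        and data ID keyword arguments (all illustrative here) can be mixed
        freely::

            with butler.query() as query:
                query = query.where(
                    "visit.day_obs > min_day",
                    bind={"min_day": 20240701},
                    instrument="LSSTCam",
                )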

        """  # noqa: W505, long docstrings
        return Query(
            tree=self._tree.where(
                convert_where_args(
                    self.constraint_dimensions,
                    self.constraint_dataset_types,
                    *args,
                    bind=bind,
                    **kwargs,
                )
            ),
            driver=self._driver,
        )

    def _skip_governor_validation(self) -> Query:
        tree = self._tree.model_copy(update={"validateGovernorConstraints": False})
        return Query(tree=tree, driver=self._driver)

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        allow_storage_class_overrides: bool = True,
    ) -> tuple[str, str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name and storage class name, in addition to the modified `Query`.
        """
        # In this method we need the dimensions of the dataset type, but we
        # might not need the storage class, since the dataset may only be
        # used as an existence constraint. It depends on whether
        # `join_dataset_search` or `datasets` is calling this method.
        dimensions: DimensionGroup | None = None
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            dimensions = dataset_type.dimensions
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # Look up the data repository definition of the dataset type to check
        # for consistency, or to get the dimensions and storage class if we
        # don't have them.
        resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
        resolved_dimensions = resolved_dataset_type.dimensions
        if dimensions is not None and dimensions != resolved_dimensions:
            raise DatasetTypeError(
                f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not "
                f"match the registered dimensions {resolved_dimensions}."
            )
        if storage_class_name is not None:
            if storage_class_name != resolved_dataset_type.storageClass_name:
                if not allow_storage_class_overrides:
                    raise InvalidQueryError(
                        f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} "
                        "differs from repository definition "
                        f"{resolved_dataset_type.storageClass_name!r}, but join_dataset_search "
                        "does not care about storage classes and cannot record this override. "
                        "Pass the override to `Query.datasets` instead."
                    )
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is "
                        "not compatible with repository storage class "
                        f"{resolved_dataset_type.storageClass_name}."
                    )
        else:
            storage_class_name = resolved_dataset_type.storageClass_name
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
        )
        return (
            dataset_type_name,
            storage_class_name,
            Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
        )

QueryFactoryFunction: TypeAlias = Callable[[], AbstractContextManager[Query]]
"""
Type signature for a function returning a context manager that sets up a
`Query` object (that is, a function equivalent to ``Butler.query()``).
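
For example, ``butler.query`` itself satisfies this signature, as would a
wrapper like the following (an illustrative sketch; ``butler`` is assumed to
be a `Butler` instance)::

    def make_query() -> AbstractContextManager[Query]:
        return butler.query()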

"""