Coverage for python/lsst/daf/butler/queries/_query.py: 20%

152 statements  

coverage.py v7.4.3, created at 2024-03-05 11:36 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("Query",) 

31 

32from collections.abc import Iterable, Mapping, Set 

33from types import EllipsisType 

34from typing import Any, final, overload 

35 

36from lsst.utils.iteration import ensure_iterable 

37 

38from .._dataset_type import DatasetType 

39from .._storage_class import StorageClassFactory 

40from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup 

41from ..registry import DatasetTypeError, MissingDatasetTypeError 

42from ._base import HomogeneousQueryBase 

43from ._data_coordinate_query_results import DataCoordinateQueryResults 

44from ._dataset_query_results import ( 

45 ChainedDatasetQueryResults, 

46 DatasetQueryResults, 

47 SingleTypeDatasetQueryResults, 

48) 

49from ._dimension_record_query_results import DimensionRecordQueryResults 

50from .convert_args import convert_where_args 

51from .driver import QueryDriver 

52from .expression_factory import ExpressionFactory 

53from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec 

54from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree 

55 

56 

57@final 

58class Query(HomogeneousQueryBase): 

59 """A method-chaining builder for butler queries. 

60 

61 Parameters 

62 ---------- 

63 driver : `QueryDriver` 

64 Implementation object that knows how to actually execute queries. 

65 tree : `QueryTree` 

66 Description of the query as a tree of joins and column expressions. For

67 the instance returned directly by the `Butler._query` entry point, this

68 should be constructed via `make_identity_query_tree`.

69 

70 Notes 

71 ----- 

72 `Query` objects should never be constructed directly by users; use 

73 `Butler._query` instead. 

74 

75 A `Query` object represents the first stage of query construction, in which 

76 constraints and joins are defined (roughly corresponding to the WHERE and 

77 FROM clauses in SQL). The various "results" objects represent the second 

78 (and final) stage, where the columns returned are specified and any sorting 

79 or integer slicing can be applied. Result objects are obtained from the 

80 `data_ids`, `datasets`, and `dimension_records` methods. 

81 

82 `Query` and query-result objects are always immutable (except for caching 

83 information fetched from the database or server), so modifier methods 

84 always return a new object without modifying the current one. 

85 """ 

86 

87 def __init__(self, driver: QueryDriver, tree: QueryTree): 

88 # __init__ defined here because there are multiple base classes and 

89 # not all define __init__ (and hence inherit object.__init__, which 

90 # just ignores its args). Even if we just delegate to super(), it 

91 # seems less fragile to make it explicit here. 

92 super().__init__(driver, tree) 

93 

94 @property 

95 def constraint_dataset_types(self) -> Set[str]: 

96 """The names of all dataset types joined into the query. 

97 

98 The existence of datasets of these types constrains the data IDs of any 

99 type of result. Fields for these dataset types are also usable in 

100 'where' expressions. 

101 """ 

102 # Note that this includes only dataset type names, not `DatasetType` 

103 # instances; the `DatasetQueryResults` adapter returned by the 

104 # `datasets` method does include `DatasetType` instances, since it is 

105 # in a better position to track and respect any storage class override 

106 # specified. 

107 return self._tree.datasets.keys() 

108 

109 @property 

110 def constraint_dimensions(self) -> DimensionGroup: 

111 """Dimensions currently present in the query, either directly or 

112 indirectly. 

113 

114 This includes dimensions that are present in any joined subquery (such 

115 as a dataset search, materialization, or data ID upload) or `where` 

116 argument, as well as any required or implied dependency of those 

117 dimensions. 

118 """ 

119 return self._tree.dimensions 
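
A minimal illustration (not from the source) of how this group grows as constraints are added; it assumes an existing ``butler`` and the standard ``instrument``/``visit`` dimensions:

    with butler._query() as query:
        query = query.where(instrument="LSSTCam", visit=42)
        # 'visit' plus its required and implied dependencies are now part of
        # the constraint dimensions, even though only data ID values were given.
        assert "visit" in query.constraint_dimensions.names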

120 

121 @property 

122 def expression_factory(self) -> ExpressionFactory: 

123 """A factory for column expressions using overloaded operators. 

124 

125 Notes 

126 ----- 

127 Typically this attribute will be assigned to a single-character local 

128 variable, and then its (dynamic) attributes can be used to obtain 

129 references to columns that can be included in a query:: 

130 

131 with butler._query() as query: 

132 x = query.expression_factory 

133 query = query.where( 

134 x.instrument == "LSSTCam", 

135 x.visit.day_obs > 20240701, 

136 x.any(x.band == 'u', x.band == 'y'), 

137 ) 

138 

139 As shown above, the returned object also has an `any` method to 

140 combine expressions with logical OR (as well as `not_` and `all`, 

141 though the latter is rarely necessary since `where` already combines 

142 its arguments with AND). 

143 

144 Proxies for fields associated with dataset types (``dataset_id``, 

145 ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for 

146 `~CollectionType.CALIBRATION` collection searches) can be obtained with 

147 dict-like access instead (reusing ``x`` from the snippet above):: 

148 

149 with butler._query() as query: 

150 query = query.order_by(x["raw"].ingest_date) 

151 

152 Expression proxy objects that correspond to scalar columns overload the 

153 standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``, 

154 ``>=``) and provide `~ScalarExpressionProxy.in_range`, 

155 `~ScalarExpressionProxy.in_iterable`, and 

156 `~ScalarExpressionProxy.in_query` methods for membership tests. For 

157 `order_by` contexts, they also have a `~ScalarExpressionProxy.desc` 

158 property to indicate that the sort order for that expression should be 

159 reversed. 

160 

161 Proxy objects for region and timespan fields have an `overlaps` method, 

162 and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end` 

163 properties to access scalar expression proxies for the bounds. 

164 

165 All proxy objects also have a `~ExpressionProxy.is_null` property. 

166 

167 Literal values can be created by calling `ExpressionFactory.literal`, 

168 but can almost always be created implicitly via overloaded operators 

169 instead. 

170 """ 

171 return ExpressionFactory(self._driver.universe) 
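
A further sketch (not part of the source) combining the helpers named in the Notes above; the dimension names and the exact ``in_range`` semantics are assumptions based on that description:

    with butler._query() as query:
        x = query.expression_factory
        query = query.where(
            x.detector.in_range(1, 10),           # membership in an integer range
            x.band.in_iterable(["g", "r", "i"]),  # membership in an explicit set
            instrument="LSSTCam",
        )
        records = query.dimension_records("detector")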

172 

173 def data_ids( 

174 self, dimensions: DimensionGroup | Iterable[str] | str | None = None 

175 ) -> DataCoordinateQueryResults: 

176 """Return a result object that is a `DataCoordinate` iterable. 

177 

178 Parameters 

179 ---------- 

180 dimensions : `DimensionGroup`, `str`, or \ 

181 `~collections.abc.Iterable` [`str`], optional 

182 The dimensions of the data IDs to yield, as either `DimensionGroup` 

183 instances or `str` names. Will be automatically expanded to a 

184 complete `DimensionGroup`. These dimensions do not need to match 

185 the query's current `dimensions`. Default is 

186 `constraint_dimensions`. 

187 

188 Returns 

189 ------- 

190 data_ids : `DataCoordinateQueryResults` 

191 Data IDs matching the given query parameters. These are guaranteed 

192 to identify all dimensions (`DataCoordinate.hasFull` returns 

193 `True`), but will not contain `DimensionRecord` objects 

194 (`DataCoordinate.hasRecords` returns `False`). Call 

195 `~DataCoordinateQueryResults.with_dimension_records` on the 

196 returned object to include dimension records as well. 

197 """ 

198 tree = self._tree 

199 if dimensions is None: 

200 dimensions = self._tree.dimensions 

201 else: 

202 dimensions = self._driver.universe.conform(dimensions) 

203 if not dimensions <= self._tree.dimensions: 

204 tree = tree.join_dimensions(dimensions) 

205 result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False) 

206 return DataCoordinateQueryResults(self._driver, tree, result_spec) 
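
A brief usage sketch (assuming an existing ``butler`` and the standard ``instrument``/``detector`` dimensions):

    with butler._query() as query:
        results = query.where(instrument="LSSTCam").data_ids(["detector"])
        results = results.with_dimension_records()  # attach DimensionRecords too
        for data_id in results:
            print(data_id["instrument"], data_id["detector"])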

207 

208 @overload 

209 def datasets( 

210 self, 

211 dataset_type: str | DatasetType, 

212 collections: str | Iterable[str] | None = None, 

213 *, 

214 find_first: bool = True, 

215 ) -> SingleTypeDatasetQueryResults: ... # pragma: no cover 

216 

217 @overload 

218 def datasets( 

219 self, 

220 dataset_type: Iterable[str | DatasetType] | EllipsisType, 

221 collections: str | Iterable[str] | None = None, 

222 *, 

223 find_first: bool = True, 

224 ) -> DatasetQueryResults: ... # pragma: no cover 

225 

226 def datasets( 

227 self, 

228 dataset_type: str | DatasetType | Iterable[str | DatasetType] | EllipsisType, 

229 collections: str | Iterable[str] | None = None, 

230 *, 

231 find_first: bool = True, 

232 ) -> DatasetQueryResults: 

233 """Return a result object that is a `DatasetRef` iterable. 

234 

235 Parameters 

236 ---------- 

237 dataset_type : `str`, `DatasetType`, \ 

238 `~collections.abc.Iterable` [ `str` or `DatasetType` ], \ 

239 or ``...`` 

240 The dataset type or types to search for. Passing ``...`` searches 

241 for all datasets in the given collections. 

242 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

243 The collection or collections to search, in order. If not provided 

244 or `None`, and the dataset has not already been joined into the 

245 query, the default collection search path for this butler is used. 

246 find_first : `bool`, optional 

247 If `True` (default), for each result data ID, only yield one 

248 `DatasetRef` of each `DatasetType`, from the first collection in 

249 which a dataset of that dataset type appears (according to the 

250 order of ``collections`` passed in). If `True`, ``collections`` 

251 must not be ``...``. 

252 

253 Returns 

254 ------- 

255 refs : `.queries.DatasetQueryResults` 

256 Dataset references matching the given query criteria. Nested data 

257 IDs are guaranteed to include values for all implied dimensions 

258 (i.e. `DataCoordinate.hasFull` will return `True`), but will not 

259 include dimension records (`DataCoordinate.hasRecords` will be 

260 `False`) unless 

261 `~.queries.DatasetQueryResults.with_dimension_records` is 

262 called on the result object (which returns a new one). 

263 

264 Raises 

265 ------ 

266 lsst.daf.butler.registry.DatasetTypeExpressionError 

267 Raised when the ``dataset_type`` expression is invalid. 

268 lsst.daf.butler.registry.NoDefaultCollectionError 

269 Raised when ``collections`` is `None` and default butler 

270 collections are not defined. 

271 TypeError 

272 Raised when the arguments are incompatible, such as when a 

273 collection wildcard is passed when ``find_first`` is `True`. 

274 

275 Notes 

276 ----- 

277 When multiple dataset types are queried in a single call, the 

278 results of this operation are equivalent to querying for each dataset 

279 type separately in turn, and no information about the relationships 

280 between datasets of different types is included. 

281 """ 

282 queries: dict[str, Query] = {} 

283 if dataset_type is ...: 

284 if collections is None: 

285 collections = self._driver.get_default_collections() 

286 else: 

287 collections = tuple(ensure_iterable(collections)) 

288 for _, summary in self._driver.resolve_collection_path(collections): 

289 for dataset_type_name in summary.dataset_types.names: 

290 queries[dataset_type_name] = self.join_dataset_search(dataset_type_name, collections) 

291 else: 

292 for arg in ensure_iterable(dataset_type): 

293 dataset_type_name, query = self._join_dataset_search_impl(arg, collections) 

294 queries[dataset_type_name] = query 

295 

296 single_type_results: list[SingleTypeDatasetQueryResults] = [] 

297 for dataset_type_name in sorted(queries): 

298 query = queries[dataset_type_name] 

299 dataset_search = query._tree.datasets[dataset_type_name] 

300 if dataset_search.storage_class_name is None: 

301 raise MissingDatasetTypeError( 

302 f"No storage class provided for unregistered dataset type {dataset_type_name!r}. " 

303 "Provide a complete DatasetType object instead of a string name to turn this error " 

304 "into an empty result set." 

305 ) 

306 spec = DatasetRefResultSpec.model_construct( 

307 dataset_type_name=dataset_type_name, 

308 dimensions=dataset_search.dimensions, 

309 storage_class_name=dataset_search.storage_class_name, 

310 include_dimension_records=False, 

311 find_first=find_first, 

312 ) 

313 single_type_results.append( 

314 SingleTypeDatasetQueryResults(self._driver, tree=query._tree, spec=spec) 

315 ) 

316 if len(single_type_results) == 1: 

317 return single_type_results[0] 

318 else: 

319 return ChainedDatasetQueryResults(tuple(single_type_results)) 
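
A hedged usage sketch of the find-first search described above; the 'raw' dataset type and the 'LSSTCam/defaults' collection are illustrative names, not taken from the source:

    with butler._query() as query:
        refs = query.where(instrument="LSSTCam").datasets(
            "raw", collections=["LSSTCam/defaults"], find_first=True
        )
        for ref in refs:
            print(ref.datasetType.name, ref.dataId, ref.run)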

320 

321 def dimension_records(self, element: str) -> DimensionRecordQueryResults: 

322 """Return a result object that is a `DimensionRecord` iterable. 

323 

324 Parameters 

325 ---------- 

326 element : `str` 

327 The name of a dimension element to obtain records for. 

328 

329 Returns 

330 ------- 

331 records : `.queries.DimensionRecordQueryResults` 

332 Data IDs matching the given query parameters. 

333 """ 

334 tree = self._tree 

335 if element not in tree.dimensions.elements: 

336 tree = tree.join_dimensions(self._driver.universe[element].minimal_group) 

337 result_spec = DimensionRecordResultSpec(element=self._driver.universe[element]) 

338 return DimensionRecordQueryResults(self._driver, tree, result_spec) 
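
A short sketch, assuming the default dimension universe (where 'visit' records carry an ``exposure_time`` field):

    with butler._query() as query:
        records = query.where(instrument="LSSTCam", visit=42).dimension_records("visit")
        for record in records:
            print(record.id, record.exposure_time)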

339 

340 def materialize( 

341 self, 

342 *, 

343 dimensions: Iterable[str] | DimensionGroup | None = None, 

344 datasets: Iterable[str] | None = None, 

345 ) -> Query: 

346 """Execute the query, save its results to a temporary location, and 

347 return a new query that represents fetching or joining against those 

348 saved results. 

349 

350 Parameters 

351 ---------- 

352 dimensions : `~collections.abc.Iterable` [ `str` ] or \ 

353 `DimensionGroup`, optional 

354 Dimensions to include in the temporary results. Default is to 

355 include all dimensions in the query. 

356 datasets : `~collections.abc.Iterable` [ `str` ], optional 

357 Names of dataset types that should be included in the new query; 

358 default is to include `constraint_dataset_types`. 

359 

360 Returns 

361 ------- 

362 query : `Query` 

363 A new query object that represents the materialized rows. 

364 

365 Notes 

366 ----- 

367 Only dimension key columns and (at the discretion of the 

368 implementation) certain dataset columns are actually materialized, 

369 since at this stage we do not know which dataset or dimension record 

370 fields are actually needed in result rows, and these can be joined back 

371 in on the materialized dimension keys. But all constraints on those 

372 dimension keys (including dataset existence) are applied to the 

373 materialized rows. 

374 """ 

375 if datasets is None: 

376 datasets = frozenset(self.constraint_dataset_types) 

377 else: 

378 datasets = frozenset(datasets) 

379 if not (datasets <= self.constraint_dataset_types): 

380 raise InvalidQueryError( 

381 f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query." 

382 ) 

383 if dimensions is None: 

384 dimensions = self._tree.dimensions 

385 else: 

386 dimensions = self._driver.universe.conform(dimensions) 

387 key = self._driver.materialize(self._tree, dimensions, datasets) 

388 tree = make_identity_query_tree(self._driver.universe).join_materialization( 

389 key, dimensions=dimensions 

390 ) 

391 for dataset_type_name in datasets: 

392 dataset_search = self._tree.datasets[dataset_type_name] 

393 if not (dataset_search.dimensions <= tree.dimensions): 

394 raise InvalidQueryError( 

395 f"Materialization-backed query has dimensions {tree.dimensions}, which do not " 

396 f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. " 

397 "Expand the dimensions or drop this dataset type in the arguments to materialize to " 

398 "avoid this error." 

399 ) 

400 tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name]) 

401 return Query(self._driver, tree) 
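
An illustrative sketch (collection and dataset type names are assumptions) of materializing once and then deriving several result objects from the saved rows:

    with butler._query() as query:
        query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
        query = query.where(instrument="LSSTCam")
        materialized = query.materialize()          # executes and stores dimension keys
        data_ids = materialized.data_ids(["exposure", "detector"])
        raws = materialized.datasets("raw")         # joined back against the saved rows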

402 

403 def join_dataset_search( 

404 self, 

405 dataset_type: str | DatasetType, 

406 collections: Iterable[str] | None = None, 

407 dimensions: DimensionGroup | None = None, 

408 ) -> Query: 

409 """Return a new query with a search for a dataset joined in. 

410 

411 Parameters 

412 ---------- 

413 dataset_type : `str` or `DatasetType` 

414 Dataset type or name. May not refer to a dataset component. 

415 collections : `~collections.abc.Iterable` [ `str` ], optional 

416 Iterable of collections to search. Order is preserved, but will 

417 not matter if the dataset search is only used as a constraint on 

418 dimensions or if ``find_first=False`` when requesting results. If 

419 not present or `None`, the default collection search path will be 

420 used. 

421 dimensions : `DimensionGroup`, optional 

422 The dimensions to assume for the dataset type if it is not 

423 registered, or to check against if it is registered. When the 

424 dataset is not registered and this is not provided, 

425 `MissingDatasetTypeError` is raised, since we cannot construct a 

426 query without knowing the dataset's dimensions. Providing this 

427 argument causes the returned query to instead return no rows (as it 

428 does when the dataset type is registered but no matching datasets 

429 are found). 

430 

431 Returns 

432 ------- 

433 query : `Query` 

434 A new query object with dataset columns available and rows 

435 restricted to those consistent with the found data IDs. 

436 

437 Raises 

438 ------ 

439 DatasetTypeError 

440 Raised if the dimensions were provided but they do not match the 

441 registered dataset type. 

442 MissingDatasetTypeError 

443 Raised if the dimensions were not provided and the dataset type was 

444 not registered. 

445 

446 Notes 

447 ----- 

448 This method may require communication with the server unless the 

449 dataset type and collections have already been referenced by the same 

450 query context. 

451 """ 

452 _, query = self._join_dataset_search_impl(dataset_type, collections, dimensions) 

453 return query 
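
A sketch (dataset type and collection names are illustrative) of using a dataset search purely as an existence constraint, without returning the datasets themselves:

    with butler._query() as query:
        query = query.join_dataset_search("bias", collections=["LSSTCam/calib"])
        # Only data IDs for which a 'bias' dataset was found are returned.
        data_ids = query.data_ids(["detector"])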

454 

455 def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query: 

456 """Return a new query that joins in an explicit table of data IDs. 

457 

458 Parameters 

459 ---------- 

460 iterable : `~collections.abc.Iterable` [ `DataCoordinate` ] 

461 Iterable of `DataCoordinate`. All items must have the same 

462 dimensions. Must have at least one item. 

463 

464 Returns 

465 ------- 

466 query : `Query` 

467 A new query object with the data IDs joined in. 

468 """ 

469 rows: set[tuple[DataIdValue, ...]] = set() 

470 dimensions: DimensionGroup | None = None 

471 for data_coordinate in iterable: 

472 if dimensions is None: 

473 dimensions = data_coordinate.dimensions 

474 elif dimensions != data_coordinate.dimensions: 

475 raise InvalidQueryError( 

476 f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}." 

477 ) 

478 rows.add(data_coordinate.required_values) 

479 if dimensions is None: 

480 raise InvalidQueryError("Cannot upload an empty data coordinate set.") 

481 key = self._driver.upload_data_coordinates(dimensions, rows) 

482 return Query( 

483 tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver 

484 ) 
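
A sketch of uploading a small set of data IDs; the detector values and collection name are made up, while ``DataCoordinate.standardize`` and ``butler.dimensions`` are the usual ways to build the coordinates:

    from lsst.daf.butler import DataCoordinate

    with butler._query() as query:
        upload = [
            DataCoordinate.standardize(
                instrument="LSSTCam", detector=d, universe=butler.dimensions
            )
            for d in (10, 11, 12)
        ]
        query = query.join_data_coordinates(upload)
        refs = query.datasets("bias", collections=["LSSTCam/calib"])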

485 

486 def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query: 

487 """Return a new query that joins the logical tables for additional 

488 dimensions. 

489 

490 Parameters 

491 ---------- 

492 dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup` 

493 Names of dimensions to join in. 

494 

495 Returns 

496 ------- 

497 query : `Query` 

498 A new query object with the dimensions joined in. 

499 

500 Notes 

501 ----- 

502 Dimensions are automatically joined in whenever needed, so this method 

503 should rarely need to be called directly. 

504 """ 

505 dimensions = self._driver.universe.conform(dimensions) 

506 return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver) 
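
For completeness, a one-line sketch (rarely needed in practice, per the Notes above):

    with butler._query() as query:
        # Make 'exposure' and its dependencies usable before any constraint
        # or dataset search that would have joined them automatically.
        query = query.join_dimensions(["exposure"])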

507 

508 def where( 

509 self, 

510 *args: str | Predicate | DataId, 

511 bind: Mapping[str, Any] | None = None, 

512 **kwargs: Any, 

513 ) -> Query: 

514 """Return a query with a boolean-expression filter on its rows. 

515 

516 Parameters 

517 ---------- 

518 *args 

519 Constraints to apply, combined with logical AND. Arguments may be 

520 `str` expressions to parse, `Predicate` objects (these are 

521 typically constructed via `expression_factory`) or data IDs. 

522 bind : `~collections.abc.Mapping` 

523 Mapping from string identifier appearing in a string expression to 

524 a literal value that should be substituted for it. This is 

525 recommended instead of embedding literals directly into the 

526 expression, especially for strings, timespans, or other types where 

527 quoting or formatting is nontrivial. 

528 **kwargs 

529 Data ID key value pairs that extend and override any present in 

530 ``*args``. 

531 

532 Returns 

533 ------- 

534 query : `Query` 

535 A new query object with the given row filters (as well as any 

536 already present in ``self``). All row filters are combined with 

537 logical AND. 

538 

539 Notes 

540 ----- 

541 If an expression references a dimension or dimension element that is 

542 not already present in the query, it will be joined in, but dataset 

543 searches must already be joined into a query in order to reference 

544 their fields in expressions. 

545 

546 Data ID values are not checked for consistency; they are extracted from 

547 ``args`` and then ``kwargs`` and combined, with later values overriding 

548 earlier ones. 

549 """ 

550 return Query( 

551 tree=self._tree.where( 

552 convert_where_args(self.dimensions, self.constraint_dataset_types, *args, bind=bind, **kwargs) 

553 ), 

554 driver=self._driver, 

555 ) 
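
A sketch mixing the three argument forms; the bind identifiers and values are illustrative:

    with butler._query() as query:
        x = query.expression_factory
        query = query.where(
            "instrument = inst AND visit.day_obs > min_day",  # string expression with bind names
            x.band == "r",                                     # Predicate from expression_factory
            bind={"inst": "LSSTCam", "min_day": 20240101},
            detector=10,                                       # data ID keyword argument
        )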

556 

557 def _join_dataset_search_impl( 

558 self, 

559 dataset_type: str | DatasetType, 

560 collections: Iterable[str] | None = None, 

561 dimensions: DimensionGroup | None = None, 

562 ) -> tuple[str, Query]: 

563 """Implement `join_dataset_search`, and also return the dataset type 

564 name. 

565 """ 

566 # In this method we need the dimensions of the dataset type, but we 

567 # don't necessarily need the storage class, since the dataset may only 

568 # be used as an existence constraint. But we also want to remember the 

569 # storage class if it's passed in, so users don't get frustrated having 

570 # to pass it twice if they do want DatasetRefs back. 

571 storage_class_name: str | None = None 

572 # Handle DatasetType vs. str arg. 

573 if isinstance(dataset_type, DatasetType): 

574 dataset_type_name = dataset_type.name 

575 if dimensions is not None: 

576 raise TypeError("Cannot provide a full DatasetType object and separate dimensions.") 

577 dimensions = dataset_type.dimensions.as_group() 

578 storage_class_name = dataset_type.storageClass_name 

579 elif isinstance(dataset_type, str): 

580 dataset_type_name = dataset_type 

581 else: 

582 raise TypeError(f"Invalid dataset type argument {dataset_type!r}.") 

583 # See if this dataset has already been joined into the query. 

584 if existing_search := self._tree.datasets.get(dataset_type_name): 

585 if collections is None: 

586 collections = existing_search.collections 

587 else: 

588 collections = tuple(ensure_iterable(collections)) 

589 if collections != existing_search.collections: 

590 raise InvalidQueryError( 

591 f"Dataset type {dataset_type_name!r} was already joined into this " 

592 "query with a different collection search path (previously " 

593 f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])." 

594 ) 

595 if dimensions is None: 

596 dimensions = existing_search.dimensions 

597 elif dimensions != existing_search.dimensions: 

598 raise DatasetTypeError( 

599 f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the " 

600 f"previously-joined dimensions {existing_search.dimensions}." 

601 ) 

602 if storage_class_name is None or storage_class_name == existing_search.storage_class_name: 

603 # Nothing to do; this dataset has already been joined in with 

604 # the parameters we want. We don't need to check against the 

605 # registered dataset type since that will have been done the 

606 # first time we joined this dataset type in. 

607 return dataset_type_name, self 

608 else: 

609 if collections is None: 

610 collections = self._driver.get_default_collections() 

611 collections = tuple(ensure_iterable(collections)) 

612 # See if the dataset type is registered, to look up and/or check 

613 # dimensions, and get a storage class if there isn't one already. 

614 try: 

615 resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name) 

616 resolved_dimensions = resolved_dataset_type.dimensions.as_group() 

617 if storage_class_name is None: 

618 storage_class_name = resolved_dataset_type.storageClass_name 

619 except MissingDatasetTypeError: 

620 if dimensions is None: 

621 raise 

622 resolved_dimensions = dimensions 

623 else: 

624 if dimensions is not None and dimensions != resolved_dimensions: 

625 raise DatasetTypeError( 

626 f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the " 

627 f"registered dimensions {resolved_dimensions}." 

628 ) 

629 if ( 

630 storage_class_name is not None 

631 and storage_class_name != resolved_dataset_type.storageClass_name 

632 ): 

633 if not ( 

634 StorageClassFactory() 

635 .getStorageClass(storage_class_name) 

636 .can_convert(resolved_dataset_type.storageClass) 

637 ): 

638 raise DatasetTypeError( 

639 f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not " 

640 f"compatible with repository storage class {resolved_dataset_type.storageClass_name}." 

641 ) 

642 # We do not check the storage class for consistency with the registered 

643 # storage class at this point, because it's not going to be used for 

644 # anything yet other than a default that can still be overridden. 

645 dataset_search = DatasetSearch.model_construct( 

646 collections=collections, 

647 dimensions=resolved_dimensions, 

648 storage_class_name=storage_class_name, 

649 ) 

650 return dataset_type_name, Query( 

651 self._driver, self._tree.join_dataset(dataset_type_name, dataset_search) 

652 )