Coverage for python/lsst/daf/butler/registry/queries/_query_backend.py: 36%

106 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBackend",)

from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, Generic, TypeVar

from lsst.daf.relation import (
    BinaryOperationRelation,
    ColumnExpression,
    ColumnTag,
    LeafRelation,
    MarkerRelation,
    Predicate,
    Relation,
    UnaryOperationRelation,
)

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetType,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
    DimensionUniverse,
    timespan,
)
from .._collectionType import CollectionType
from .._exceptions import DatasetTypeError, MissingDatasetTypeError
from ..wildcards import CollectionWildcard
from ._query_context import QueryContext
from .find_first_dataset import FindFirstDataset

if TYPE_CHECKING:
    from ..interfaces import CollectionRecord


_C = TypeVar("_C", bound=QueryContext)


class QueryBackend(Generic[_C]):
    """An interface for constructing and evaluating the
    `~lsst.daf.relation.Relation` objects that comprise registry queries.

    This ABC is expected to have a concrete subclass for each concrete registry
    type, and most subclasses will be paired with a `QueryContext` subclass.
    See `QueryContext` for the division of responsibilities between these two
    interfaces.
    """

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Definition of all dimensions and dimension elements for this
        registry (`DimensionUniverse`).
        """
        raise NotImplementedError()

    def context(self) -> _C:
        """Return a context manager that can be used to execute queries with
        this backend.

        Returns
        -------
        context : `QueryContext`
            Context manager that manages state and connections needed to
            execute queries.
        """
        raise NotImplementedError()
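
    # A minimal sketch of how a concrete pairing is expected to look, assuming
    # hypothetical ``SqlQueryBackend`` and ``SqlQueryContext`` classes (names
    # used here only for illustration):
    #
    #     class SqlQueryBackend(QueryBackend[SqlQueryContext]):
    #         def context(self) -> SqlQueryContext:
    #             return SqlQueryContext(...)
    #
    #     backend = SqlQueryBackend(...)
    #     with backend.context() as ctx:
    #         relation = backend.make_dimension_relation(..., context=ctx)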

    @abstractmethod
    def get_collection_name(self, key: Any) -> str:
        """Return the collection name associated with a collection primary key
        value.

        Parameters
        ----------
        key
            Collection primary key value.

        Returns
        -------
        name : `str`
            Collection name.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_collection_wildcard(
        self,
        expression: Any,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for collections; will be passed to
            `CollectionWildcard.from_expression`.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set` [ `str` ], optional
            A set of collection names that should be skipped, updated to
            include all processed collection names on return.
        flatten_chains : `bool`, optional
            If `True` (default) recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves. The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_dataset_type_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Return the dataset types that match a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for dataset types; will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to this
            list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str` names,
            with `re.Pattern` instances deprecated and ``...`` prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` | `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates the
            parent composite dataset type was matched.
        """
        raise NotImplementedError()

    def resolve_single_dataset_type_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> tuple[DatasetType, list[str | None]]:
        """Return a single dataset type that matches a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for the dataset type; will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str` names,
            with `re.Pattern` instances deprecated and ``...`` prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        single_parent : `DatasetType`
            The matched parent dataset type.
        single_components : `list` [ `str` | `None` ]
            The matched components that correspond to this parent, or `None` if
            the parent dataset type itself was matched.

        Notes
        -----
        This method really finds a single parent dataset type and any number of
        components, because it's only the parent dataset type that's known to
        registry at all; many callers are expected to discard the
        ``single_components`` return value.
        """
        missing: list[str] = []
        matching = self.resolve_dataset_type_wildcard(
            expression,
            components=components,
            missing=missing,
            explicit_only=explicit_only,
            components_deprecated=components_deprecated,
        )
        if not matching:
            if missing:
                raise MissingDatasetTypeError(
                    "\n".join(
                        f"Dataset type {t!r} is not registered, so no instances of it can exist."
                        for t in missing
                    )
                )
            else:
                raise MissingDatasetTypeError(
                    f"No registered dataset types matched expression {expression!r}, "
                    "so no datasets will be found."
                )
        if len(matching) > 1:
            raise DatasetTypeError(
                f"Expression {expression!r} matched multiple parent dataset types: "
                f"{[t.name for t in matching]}, but only one is allowed."
            )
        ((single_parent, single_components),) = matching.items()
        if missing:
            raise DatasetTypeError(
                f"Expression {expression!r} appears to involve multiple dataset types, even though only "
                f"one ({single_parent.name}) is registered, and only one is allowed here."
            )
        return single_parent, single_components
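
    # A minimal usage sketch, assuming a hypothetical concrete ``backend``
    # instance and a registered dataset type named "flat":
    #
    #     parent, components = backend.resolve_single_dataset_type_wildcard("flat")
    #     # ``parent`` is the "flat" DatasetType; ``components`` is ``[None]``.
    #
    #     parent, components = backend.resolve_single_dataset_type_wildcard("flat.image")
    #     # ``parent`` is still the parent "flat" DatasetType; ``components``
    #     # is ``["image"]``.  MissingDatasetTypeError is raised if nothing
    #     # matches, DatasetTypeError if more than one parent type matches.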

    @abstractmethod
    def filter_dataset_collections(
        self,
        dataset_types: Iterable[DatasetType],
        collections: Sequence[CollectionRecord],
        *,
        governor_constraints: Mapping[str, Set[str]],
        rejections: list[str] | None = None,
    ) -> dict[DatasetType, list[CollectionRecord]]:
        """Filter a sequence of collections to those for which a dataset query
        might succeed.

        Parameters
        ----------
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ]
            Dataset types that are being queried. Must include only parent
            or standalone dataset types, not components.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Sequence of collections that will be searched.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ], optional
            Constraints imposed by other aspects of the query on governor
            dimensions; collections inconsistent with these constraints will be
            skipped.
        rejections : `list` [ `str` ], optional
            If not `None`, a `list` that diagnostic messages will be appended
            to, for any collection that matches ``collections`` that is not
            returned. At least one message is guaranteed whenever the result
            is empty.

        Returns
        -------
        dataset_collections : `dict` [ `DatasetType`, \
                `list` [ `CollectionRecord` ] ]
            The collections to search for each dataset. The dictionary's keys
            are always exactly ``dataset_types`` (in the same order), and each
            nested `list` of collections is ordered consistently with the
            given ``collections``.

        Notes
        -----
        This method accepts multiple dataset types and multiple collections at
        once to enable implementations to batch up the fetching of summary
        information needed to relate them.
        """
        raise NotImplementedError()

    def resolve_dataset_collections(
        self,
        dataset_type: DatasetType,
        collections: CollectionWildcard,
        *,
        governor_constraints: Mapping[str, Set[str]],
        rejections: list[str] | None = None,
        collection_types: Set[CollectionType] = CollectionType.all(),
        allow_calibration_collections: bool = False,
    ) -> list[CollectionRecord]:
        """Resolve the sequence of collections to query for a dataset type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to be queried in the returned collections.
        collections : `CollectionWildcard`
            Expression for the collections to be queried.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` ], optional
            Constraints imposed by other aspects of the query on governor
            dimensions; collections inconsistent with these constraints will be
            skipped.
        rejections : `list` [ `str` ], optional
            If not `None`, a `list` that diagnostic messages will be appended
            to, for any collection that matches ``collections`` that is not
            returned. At least one message is guaranteed whenever the result
            is empty.
        collection_types : `~collections.abc.Set` [ `CollectionType` ], \
                optional
            Collection types to consider when resolving the collection
            expression.
        allow_calibration_collections : `bool`, optional
            If `False`, skip (with a ``rejections`` message) any calibration
            collections that match ``collections`` but are not given explicitly
            by name, and raise `NotImplementedError` for any calibration
            collection that is given explicitly. This is a temporary option
            that will be removed when the query system can handle temporal
            joins involving calibration collections.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            A new list of `CollectionRecord` instances, for collections that
            both match ``collections`` and may have datasets of the given type.

        Notes
        -----
        This is a higher-level driver for `resolve_collection_wildcard` and
        `filter_dataset_collections` that is mostly concerned with handling
        queries against `~CollectionType.CALIBRATION` collections that aren't
        fully supported yet. Once that support improves, this method may be
        removed.
        """
        if collections == CollectionWildcard() and collection_types == CollectionType.all():
            collection_types = {CollectionType.RUN}
        explicit_collections = frozenset(collections.strings)
        matching_collection_records = self.resolve_collection_wildcard(
            collections, collection_types=collection_types
        )
        ((_, filtered_collection_records),) = self.filter_dataset_collections(
            [dataset_type],
            matching_collection_records,
            governor_constraints=governor_constraints,
            rejections=rejections,
        ).items()
        if not allow_calibration_collections:
            supported_collection_records: list[CollectionRecord] = []
            for record in filtered_collection_records:
                if record.type is CollectionType.CALIBRATION:
                    # If collection name was provided explicitly then raise,
                    # since this is a kind of query we don't support yet;
                    # otherwise collection is a part of a chained one or regex
                    # match, and we skip it to not break queries of other
                    # included collections.
                    if record.name in explicit_collections:
                        raise NotImplementedError(
                            f"Query for dataset type {dataset_type.name!r} in CALIBRATION-type "
                            f"collection {record.name!r} is not yet supported."
                        )
                    else:
                        if rejections is not None:
                            rejections.append(
                                f"Not searching for dataset {dataset_type.name!r} in CALIBRATION "
                                f"collection {record.name!r} because calibration queries aren't fully "
                                "implemented; this is not an error only because the query structure "
                                "implies that searching this collection may be incidental."
                            )
                            supported_collection_records.append(record)
                else:
                    supported_collection_records.append(record)
        else:
            supported_collection_records = filtered_collection_records
        if not supported_collection_records and rejections is not None and not rejections:
            rejections.append(f"No collections to search matching expression {collections!r}.")
        return supported_collection_records
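
    # A minimal usage sketch, assuming a hypothetical ``backend`` instance, a
    # ``bias`` DatasetType, and collection names that exist in the repository;
    # the governor-constraint values are likewise placeholders:
    #
    #     rejections: list[str] = []
    #     records = backend.resolve_dataset_collections(
    #         bias,
    #         CollectionWildcard.from_expression(["imported_g", "imported_r"]),
    #         governor_constraints={"instrument": {"Cam1"}},
    #         rejections=rejections,
    #     )
    #     # ``records`` holds only collections that may contain "bias"
    #     # datasets; ``rejections`` explains why the others were dropped.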

    @abstractmethod
    def _make_dataset_query_relation_impl(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
    ) -> Relation:
        """Construct a relation that represents an unordered query for datasets
        that returns matching results from all given collections.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being queried.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to query. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        context : `QueryContext`
            Context that manages per-query state.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            details.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a dataset query.

        Notes
        -----
        This method must be implemented by derived classes but is not
        responsible for joining the resulting relation to an existing relation.
        """
        raise NotImplementedError()

    def make_dataset_query_relation(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
        *,
        join_to: Relation | None = None,
        temporal_join_on: Set[ColumnTag] = frozenset(),
    ) -> Relation:
        """Construct a relation that represents an unordered query for datasets
        that returns matching results from all given collections.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being queried.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to query. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        context : `QueryContext`
            Context that manages per-query state.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            details.
        join_to : `Relation`, optional
            Another relation to join with the query for datasets in all
            collections.
        temporal_join_on : `~collections.abc.Set` [ `ColumnTag` ], optional
            Timespan columns in ``join_to`` that calibration dataset timespans
            must overlap. Must already be present in ``join_to``. Ignored if
            ``join_to`` is `None` or if there are no calibration collections.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a dataset query.
        """
        # If we need to do a temporal join to a calibration collection, we need
        # to include the timespan column in the base query and prepare the join
        # predicate.
        join_predicates: list[Predicate] = []
        base_timespan_tag: ColumnTag | None = None
        full_columns: set[str] = set(columns)
        if (
            temporal_join_on
            and join_to is not None
            and any(r.type is CollectionType.CALIBRATION for r in collections)
        ):
            base_timespan_tag = DatasetColumnTag(dataset_type.name, "timespan")
            rhs = ColumnExpression.reference(base_timespan_tag, dtype=timespan.Timespan)
            full_columns.add("timespan")
            for timespan_tag in temporal_join_on:
                lhs = ColumnExpression.reference(timespan_tag, dtype=timespan.Timespan)
                join_predicates.append(lhs.predicate_method("overlaps", rhs))
        # Delegate to the concrete QueryBackend subclass to do most of the
        # work.
        result = self._make_dataset_query_relation_impl(
            dataset_type,
            collections,
            full_columns,
            context=context,
        )
        if join_to is not None:
            result = join_to.join(
                result, predicate=Predicate.logical_and(*join_predicates) if join_predicates else None
            )
            if join_predicates and "timespan" not in columns:
                # Drop the timespan column we added for the join only if the
                # timespan wasn't requested in its own right.
                result = result.with_only_columns(result.columns - {base_timespan_tag})
        return result
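
    # A minimal sketch of the temporal-join path, assuming hypothetical
    # ``backend``, ``ctx``, ``flat`` (a calibration DatasetType),
    # ``calib_records`` (records for CALIBRATION collections), and a
    # ``data_ids`` relation that already carries a timespan column tagged by
    # ``timespan_tag``:
    #
    #     relation = backend.make_dataset_query_relation(
    #         flat,
    #         calib_records,
    #         columns={"dataset_id", "run"},
    #         context=ctx,
    #         join_to=data_ids,
    #         temporal_join_on={timespan_tag},
    #     )
    #     # Calibration timespans are matched with an "overlaps" predicate,
    #     # and the dataset timespan column is dropped again unless "timespan"
    #     # was explicitly requested in ``columns``.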

    def make_dataset_search_relation(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
        *,
        join_to: Relation | None = None,
        temporal_join_on: Set[ColumnTag] = frozenset(),
    ) -> Relation:
        """Construct a relation that represents an ordered query for datasets
        that returns results from the first matching collection for each data
        ID.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being searched.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to search. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the ``relation``. See
            `make_dataset_query_relation` for options.
        context : `QueryContext`
            Context that manages per-query state.
        join_to : `Relation`, optional
            Another relation to join with the query for datasets in all
            collections before filtering out shadowed datasets.
        temporal_join_on : `~collections.abc.Set` [ `ColumnTag` ], optional
            Timespan columns in ``join_to`` that calibration dataset timespans
            must overlap. Must already be present in ``join_to``. Ignored if
            ``join_to`` is `None` or if there are no calibration collections.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a find-first dataset search.
        """
        base = self.make_dataset_query_relation(
            dataset_type,
            collections,
            columns | {"rank"},
            context=context,
            join_to=join_to,
            temporal_join_on=temporal_join_on,
        )
        # Query-simplification shortcut: if there is only one collection, a
        # find-first search is just a regular result subquery. Same if there
        # are no collections.
        if len(collections) <= 1:
            return base
        # We filter the dimension keys in the given relation through
        # DimensionGraph.required.names to minimize the set we partition on
        # and order it in a more index-friendly way. More precisely, any
        # index we define on dimensions will be consistent with this order, but
        # any particular index may not have the same dimension columns.
        dimensions = self.universe.extract(
            [tag.dimension for tag in DimensionKeyColumnTag.filter_from(base.columns)]
        )
        find_first = FindFirstDataset(
            dimensions=DimensionKeyColumnTag.generate(dimensions.required.names),
            rank=DatasetColumnTag(dataset_type.name, "rank"),
        )
        return find_first.apply(
            base, preferred_engine=context.preferred_engine, require_preferred_engine=True
        ).with_only_columns(base.columns - {find_first.rank})
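
    # A minimal sketch of a find-first search, assuming hypothetical
    # ``backend``, ``ctx``, ``bias`` (a DatasetType), and ``records`` from
    # ``resolve_dataset_collections``.  The "rank" column added internally
    # encodes each collection's position in the search path; ``FindFirstDataset``
    # uses it to keep only the first match for each data ID, and the column is
    # then dropped from the returned relation:
    #
    #     relation = backend.make_dataset_search_relation(
    #         bias,
    #         records,
    #         columns={"dataset_id", "run"},
    #         context=ctx,
    #     )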

    def make_doomed_dataset_relation(
        self,
        dataset_type: DatasetType,
        columns: Set[str],
        messages: Iterable[str],
        context: _C,
    ) -> Relation:
        """Construct a relation that represents a doomed query for datasets.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type being queried.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include (dimension key columns are always
            included). See `make_dataset_query_relation` for allowed values.
        messages : `~collections.abc.Iterable` [ `str` ]
            Diagnostic messages that explain why the query is doomed to yield
            no rows.
        context : `QueryContext`
            Context that manages per-query state.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation with the requested columns and no rows.
        """
        column_tags: set[ColumnTag] = set(
            DimensionKeyColumnTag.generate(dataset_type.dimensions.required.names)
        )
        column_tags.update(DatasetColumnTag.generate(dataset_type.name, columns))
        return context.preferred_engine.make_doomed_relation(columns=column_tags, messages=list(messages))

    @abstractmethod
    def make_dimension_relation(
        self,
        dimensions: DimensionGraph,
        columns: Set[ColumnTag],
        context: _C,
        *,
        initial_relation: Relation | None = None,
        initial_join_max_columns: frozenset[ColumnTag] | None = None,
        initial_dimension_relationships: Set[frozenset[str]] | None = None,
        spatial_joins: Iterable[tuple[str, str]] = (),
        governor_constraints: Mapping[str, Set[str]],
    ) -> Relation:
        """Construct a relation that provides columns and constraints from
        dimension records.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            Dimensions to include. The key columns for all dimensions (both
            required and implied) will be included in the returned relation.
        columns : `~collections.abc.Set` [ `ColumnTag` ]
            Dimension record columns to include. This set may include key
            column tags as well, though these may be ignored; the set of key
            columns to include is determined by the ``dimensions`` argument
            instead.
        context : `QueryContext`
            Context that manages per-query state.
        initial_relation : `~lsst.daf.relation.Relation`, optional
            Initial relation to join to the dimension relations. If this
            relation provides record columns, key columns, and relationships
            between key columns (see ``initial_dimension_relationships`` below)
            that would otherwise have been added by joining in a dimension
            element's relation, that relation may not be joined in at all.
        initial_join_max_columns : `frozenset` [ `ColumnTag` ], optional
            Maximum superset of common columns for joins to
            ``initial_relation`` (i.e. columns in the ``ON`` expression of SQL
            ``JOIN`` clauses). If provided, this is a subset of the dimension
            key columns in ``initial_relation``, which are otherwise all
            considered as potential common columns for joins. Ignored if
            ``initial_relation`` is not provided.
        initial_dimension_relationships : `~collections.abc.Set` \
                [ `frozenset` [ `str` ] ], optional
            A set of sets of dimension names representing relationships between
            dimensions encoded in the rows of ``initial_relation``. If not
            provided (and ``initial_relation`` is),
            `extract_dimension_relationships` will be called on
            ``initial_relation``.
        spatial_joins : `collections.abc.Iterable` [ `tuple` [ `str`, `str` ] ]
            Iterable of dimension element name pairs that should be spatially
            joined.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ], optional
            Constraints on governor dimensions that are provided by other parts
            of the query that either have been included in ``initial_relation``
            or are guaranteed to be added in the future. This is a mapping from
            governor dimension name to sets of values that dimension may take.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation containing the given dimension columns and constraints.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_governor_constraints(
        self, dimensions: DimensionGraph, constraints: Mapping[str, Set[str]], context: _C
    ) -> Mapping[str, Set[str]]:
        """Resolve governor dimension constraints provided by user input to
        a query against the content in the `Registry`.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            Dimensions that bound the governor dimensions to consider (via
            ``dimensions.governors``, more specifically).
        constraints : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ]
            Constraints from user input to the query (e.g. from data IDs and
            string expression predicates).
        context : `QueryContext`
            Object that manages state for the query; used here to fetch the
            governor dimension record cache if it has not already been loaded.

        Returns
        -------
        resolved : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Set` [ `str` ] ]
            A shallow copy of ``constraints`` with keys equal to
            ``dimensions.governors.names`` and value sets constrained by the
            Registry content if they were not already in ``constraints``.

        Raises
        ------
        DataIdValueError
            Raised if ``constraints`` includes governor dimension values that
            are not present in the `Registry`.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dimension_record_cache(
        self, element_name: str, context: _C
    ) -> Mapping[DataCoordinate, DimensionRecord] | None:
        """Return a local cache of all `DimensionRecord` objects for a
        dimension element, fetching it if necessary.

        Parameters
        ----------
        element_name : `str`
            Name of the dimension element.
        context : `.queries.SqlQueryContext`
            Context to be used to execute queries when no cached result is
            available.

        Returns
        -------
        cache : `~collections.abc.Mapping` [ `DataCoordinate`, \
                `DimensionRecord` ] or `None`
            Mapping from data ID to dimension record, or `None` if this
            element's records are never cached.
        """
        raise NotImplementedError()

    def extract_dimension_relationships(self, relation: Relation) -> set[frozenset[str]]:
        """Extract the dimension key relationships encoded in a relation tree.

        Parameters
        ----------
        relation : `Relation`
            Relation tree to process.

        Returns
        -------
        relationships : `set` [ `frozenset` [ `str` ] ]
            Set of sets of dimension names, where each inner set represents a
            relationship between dimensions.

        Notes
        -----
        Dimension relationships include both many-to-one implied dependencies
        and many-to-many joins backed by "always-join" dimension elements, and
        it's important to join in the dimension table that defines a
        relationship in any query involving dimensions that are a superset of
        that relationship. For example, let's consider a relation tree that
        joins dataset existence-check relations for two dataset types, with
        dimensions ``{instrument, exposure, detector}`` and ``{instrument,
        physical_filter}``. The joined relation appears to have all dimension
        keys in its expanded graph present except ``band``, and the system
        could easily correct this by joining that dimension in directly. But
        it's also missing the ``{instrument, exposure, physical_filter}``
        relationship we'd get from the ``exposure`` dimension's own relation
        (``exposure`` implies ``physical_filter``) and the similar
        ``{instrument, physical_filter, band}`` relationship from the
        ``physical_filter`` dimension relation; we need the relationship logic
        to recognize that those dimensions need to be joined in as well in
        order for the full relation to have rows that represent valid data IDs.

        The implementation of this method relies on the assumption that
        `LeafRelation` objects always have rows that are consistent with all
        defined relationships (i.e. are valid data IDs). This is true for not
        just dimension relations themselves, but anything created from queries
        based on them, including datasets and query results. It is possible to
        construct `LeafRelation` objects that don't satisfy this criterion
        (e.g. when accepting user-provided data IDs), and in this case
        higher-level guards or warnings must be provided.
        """
        return {
            frozenset(
                tag.dimension
                for tag in DimensionKeyColumnTag.filter_from(leaf_relation.columns & relation.columns)
            )
            for leaf_relation in self._extract_leaf_relations(relation).values()
        }
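
    # For the example in the Notes above, a sketch of the relationships this
    # method would report for the joined existence-check relations (the exact
    # output naturally depends on the actual relation tree):
    #
    #     {
    #         frozenset({"instrument", "exposure", "detector"}),
    #         frozenset({"instrument", "physical_filter"}),
    #     }
    #
    # Joining in the ``exposure`` and ``physical_filter`` dimension relations
    # would add the missing ``{"instrument", "exposure", "physical_filter"}``
    # and ``{"instrument", "physical_filter", "band"}`` relationships.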

    def _extract_leaf_relations(self, relation: Relation) -> dict[str, LeafRelation]:
        """Recursively extract leaf relations from a relation tree.

        Parameters
        ----------
        relation : `Relation`
            Tree to process.

        Returns
        -------
        leaves : `dict` [ `str`, `LeafRelation` ]
            Leaf relations, keyed and deduplicated by name.
        """
        match relation:
            case LeafRelation() as leaf:
                return {leaf.name: leaf}
            case UnaryOperationRelation(target=target):
                return self._extract_leaf_relations(target)
            case BinaryOperationRelation(lhs=lhs, rhs=rhs):
                return self._extract_leaf_relations(lhs) | self._extract_leaf_relations(rhs)
            case MarkerRelation(target=target):
                return self._extract_leaf_relations(target)
        raise AssertionError("Match should be exhaustive and all branches should return.")