Coverage for python/lsst/daf/butler/registry/queries/_query_backend.py: 36%
106 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBackend",)

from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, Generic, TypeVar

from lsst.daf.relation import (
    BinaryOperationRelation,
    ColumnExpression,
    ColumnTag,
    LeafRelation,
    MarkerRelation,
    Predicate,
    Relation,
    UnaryOperationRelation,
)

from ...core import (
    DataCoordinate,
    DatasetColumnTag,
    DatasetType,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
    DimensionUniverse,
    timespan,
)
from .._collectionType import CollectionType
from .._exceptions import DatasetTypeError, MissingDatasetTypeError
from ..wildcards import CollectionWildcard
from ._query_context import QueryContext
from .find_first_dataset import FindFirstDataset

if TYPE_CHECKING:
    from ..interfaces import CollectionRecord


_C = TypeVar("_C", bound=QueryContext)


class QueryBackend(Generic[_C]):
    """An interface for constructing and evaluating the
    `~lsst.daf.relation.Relation` objects that comprise registry queries.

    This ABC is expected to have a concrete subclass for each concrete registry
    type, and most subclasses will be paired with a `QueryContext` subclass.
    See `QueryContext` for the division of responsibilities between these two
    interfaces.
    """

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Definition of all dimensions and dimension elements for this
        registry (`DimensionUniverse`).
        """
        raise NotImplementedError()

    def context(self) -> _C:
        """Return a context manager that can be used to execute queries with
        this backend.

        Returns
        -------
        context : `QueryContext`
            Context manager that manages state and connections needed to
            execute queries.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_collection_name(self, key: Any) -> str:
        """Return the collection name associated with a collection primary key
        value.

        Parameters
        ----------
        key
            Collection primary key value.

        Returns
        -------
        name : `str`
            Collection name.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_collection_wildcard(
        self,
        expression: Any,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        """Return the collection records that match a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for collections; will be passed to
            `CollectionWildcard.from_expression`.
        collection_types : `collections.abc.Set` [ `CollectionType` ], optional
            If provided, only yield collections of these types.
        done : `set` [ `str` ], optional
            A set of collection names that should be skipped, updated to
            include all processed collection names on return.
        flatten_chains : `bool`, optional
            If `True` (default), recursively yield the child collections of
            `~CollectionType.CHAINED` collections.
        include_chains : `bool`, optional
            If `True`, return records for `~CollectionType.CHAINED`
            collections themselves. The default is the opposite of
            ``flatten_chains``: either return records for CHAINED collections
            or their children, but not both.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            Matching collection records.
        """
        raise NotImplementedError()
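
    # Illustrative usage sketch (not part of the original module): resolving a
    # regular-expression wildcard to RUN-type collection records with a
    # hypothetical concrete ``backend`` instance.
    #
    #     import re
    #
    #     records = backend.resolve_collection_wildcard(
    #         re.compile("HSC/runs/.*"),
    #         collection_types={CollectionType.RUN},
    #     )
    #     run_names = [record.name for record in records]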

    @abstractmethod
    def resolve_dataset_type_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Return the dataset types that match a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for dataset types; will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to this
            list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str` names,
            with `re.Pattern` instances deprecated and ``...`` prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` | `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates the
            parent composite dataset type was matched.
        """
        raise NotImplementedError()

    def resolve_single_dataset_type_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> tuple[DatasetType, list[str | None]]:
        """Return a single dataset type that matches a wildcard expression.

        Parameters
        ----------
        expression
            Names and/or patterns for the dataset type; will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str` names,
            with `re.Pattern` instances deprecated and ``...`` prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        single_parent : `DatasetType`
            The matched parent dataset type.
        single_components : `list` [ `str` | `None` ]
            The matched components that correspond to this parent, or `None` if
            the parent dataset type itself was matched.

        Notes
        -----
        This method really finds a single parent dataset type and any number of
        components, because it's only the parent dataset type that's known to
        registry at all; many callers are expected to discard the
        ``single_components`` return value.
        """
        missing: list[str] = []
        matching = self.resolve_dataset_type_wildcard(
            expression,
            components=components,
            missing=missing,
            explicit_only=explicit_only,
            components_deprecated=components_deprecated,
        )
        if not matching:
            if missing:
                raise MissingDatasetTypeError(
                    "\n".join(
                        f"Dataset type {t!r} is not registered, so no instances of it can exist."
                        for t in missing
                    )
                )
            else:
                raise MissingDatasetTypeError(
                    f"No registered dataset types matched expression {expression!r}, "
                    "so no datasets will be found."
                )
        if len(matching) > 1:
            raise DatasetTypeError(
                f"Expression {expression!r} matched multiple parent dataset types: "
                f"{[t.name for t in matching]}, but only one is allowed."
            )
        ((single_parent, single_components),) = matching.items()
        if missing:
            raise DatasetTypeError(
                f"Expression {expression!r} appears to involve multiple dataset types, even though only "
                f"one ({single_parent.name}) is registered, and only one is allowed here."
            )
        return single_parent, single_components
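
    # Illustrative usage sketch (not part of the original module): resolving a
    # plain dataset type name with a hypothetical concrete ``backend``. Most
    # callers only need the parent dataset type and drop the component list, as
    # the Notes section above describes.
    #
    #     parent_type, _ = backend.resolve_single_dataset_type_wildcard("calexp")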

    @abstractmethod
    def filter_dataset_collections(
        self,
        dataset_types: Iterable[DatasetType],
        collections: Sequence[CollectionRecord],
        *,
        governor_constraints: Mapping[str, Set[str]],
        rejections: list[str] | None = None,
    ) -> dict[DatasetType, list[CollectionRecord]]:
        """Filter a sequence of collections to those for which a dataset query
        might succeed.

        Parameters
        ----------
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ]
            Dataset types that are being queried. Must include only parent
            or standalone dataset types, not components.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Sequence of collections that will be searched.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ], optional
            Constraints imposed by other aspects of the query on governor
            dimensions; collections inconsistent with these constraints will be
            skipped.
        rejections : `list` [ `str` ], optional
            If not `None`, a `list` to which diagnostic messages will be
            appended for any collection that matches ``collections`` but is
            not returned. At least one message is guaranteed whenever the
            result is empty.

        Returns
        -------
        dataset_collections : `dict` [ `DatasetType`, \
            `list` [ `CollectionRecord` ] ]
            The collections to search for each dataset. The dictionary's keys
            are always exactly ``dataset_types`` (in the same order), and each
            nested `list` of collections is ordered consistently with the
            given ``collections``.

        Notes
        -----
        This method accepts multiple dataset types and multiple collections at
        once to enable implementations to batch up the fetching of summary
        information needed to relate them.
        """
        raise NotImplementedError()
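
    # Illustrative usage sketch (not part of the original module): filtering a
    # batch of collections for several dataset types at once. ``backend``,
    # ``dataset_types``, and ``records`` are hypothetical.
    #
    #     rejections: list[str] = []
    #     by_dataset_type = backend.filter_dataset_collections(
    #         dataset_types,
    #         records,
    #         governor_constraints={"instrument": {"HSC"}},
    #         rejections=rejections,
    #     )
    #     # Each value preserves the ordering of ``records``, and ``rejections``
    #     # now explains any collection that was dropped.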

    def resolve_dataset_collections(
        self,
        dataset_type: DatasetType,
        collections: CollectionWildcard,
        *,
        governor_constraints: Mapping[str, Set[str]],
        rejections: list[str] | None = None,
        collection_types: Set[CollectionType] = CollectionType.all(),
        allow_calibration_collections: bool = False,
    ) -> list[CollectionRecord]:
        """Resolve the sequence of collections to query for a dataset type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type to be queried in the returned collections.
        collections : `CollectionWildcard`
            Expression for the collections to be queried.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` ], optional
            Constraints imposed by other aspects of the query on governor
            dimensions; collections inconsistent with these constraints will be
            skipped.
        rejections : `list` [ `str` ], optional
            If not `None`, a `list` to which diagnostic messages will be
            appended for any collection that matches ``collections`` but is
            not returned. At least one message is guaranteed whenever the
            result is empty.
        collection_types : `~collections.abc.Set` [ `CollectionType` ], \
            optional
            Collection types to consider when resolving the collection
            expression.
        allow_calibration_collections : `bool`, optional
            If `False`, skip (with a ``rejections`` message) any calibration
            collections that match ``collections`` but are not given explicitly
            by name, and raise `NotImplementedError` for any calibration
            collection that is given explicitly. This is a temporary option
            that will be removed when the query system can handle temporal
            joins involving calibration collections.

        Returns
        -------
        records : `list` [ `CollectionRecord` ]
            A new list of `CollectionRecord` instances, for collections that
            both match ``collections`` and may have datasets of the given type.

        Notes
        -----
        This is a higher-level driver for `resolve_collection_wildcard` and
        `filter_dataset_collections` that is mostly concerned with handling
        queries against `~CollectionType.CALIBRATION` collections that aren't
        fully supported yet. Once that support improves, this method may be
        removed.
        """
        if collections == CollectionWildcard() and collection_types == CollectionType.all():
            collection_types = {CollectionType.RUN}
        explicit_collections = frozenset(collections.strings)
        matching_collection_records = self.resolve_collection_wildcard(
            collections, collection_types=collection_types
        )
        ((_, filtered_collection_records),) = self.filter_dataset_collections(
            [dataset_type],
            matching_collection_records,
            governor_constraints=governor_constraints,
            rejections=rejections,
        ).items()
        if not allow_calibration_collections:
            supported_collection_records: list[CollectionRecord] = []
            for record in filtered_collection_records:
                if record.type is CollectionType.CALIBRATION:
                    # If collection name was provided explicitly then raise,
                    # since this is a kind of query we don't support yet;
                    # otherwise collection is a part of a chained one or regex
                    # match, and we skip it to not break queries of other
                    # included collections.
                    if record.name in explicit_collections:
                        raise NotImplementedError(
                            f"Query for dataset type {dataset_type.name!r} in CALIBRATION-type "
                            f"collection {record.name!r} is not yet supported."
                        )
                    else:
                        if rejections is not None:
                            rejections.append(
                                f"Not searching for dataset {dataset_type.name!r} in CALIBRATION "
                                f"collection {record.name!r} because calibration queries aren't fully "
                                "implemented; this is not an error only because the query structure "
                                "implies that searching this collection may be incidental."
                            )
                        supported_collection_records.append(record)
                else:
                    supported_collection_records.append(record)
        else:
            supported_collection_records = filtered_collection_records
        if not supported_collection_records and rejections is not None and not rejections:
            rejections.append(f"No collections to search matching expression {collections!r}.")
        return supported_collection_records
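
    # Illustrative usage sketch (not part of the original module): resolving
    # the collections to search for a dataset type, with hypothetical
    # ``backend`` and ``dataset_type`` objects.
    #
    #     rejections: list[str] = []
    #     records = backend.resolve_dataset_collections(
    #         dataset_type,
    #         CollectionWildcard.from_expression(["HSC/defaults"]),
    #         governor_constraints={"instrument": {"HSC"}},
    #         rejections=rejections,
    #     )
    #     if not records:
    #         print("\n".join(rejections))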

    @abstractmethod
    def _make_dataset_query_relation_impl(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
    ) -> Relation:
        """Construct a relation that represents an unordered query for datasets
        that returns matching results from all given collections.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being queried.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to query. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            details.
        context : `QueryContext`
            Context that manages per-query state.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a dataset query.

        Notes
        -----
        This method must be implemented by derived classes but is not
        responsible for joining the resulting relation to an existing relation.
        """
        raise NotImplementedError()

    def make_dataset_query_relation(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
        *,
        join_to: Relation | None = None,
        temporal_join_on: Set[ColumnTag] = frozenset(),
    ) -> Relation:
        """Construct a relation that represents an unordered query for datasets
        that returns matching results from all given collections.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being queried.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to query. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            details.
        context : `QueryContext`
            Context that manages per-query state.
        join_to : `Relation`, optional
            Another relation to join with the query for datasets in all
            collections.
        temporal_join_on : `~collections.abc.Set` [ `ColumnTag` ], optional
            Timespan columns in ``join_to`` that calibration dataset timespans
            must overlap. Must already be present in ``join_to``. Ignored if
            ``join_to`` is `None` or if there are no calibration collections.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a dataset query.
        """
        # If we need to do a temporal join to a calibration collection, we need
        # to include the timespan column in the base query and prepare the join
        # predicate.
        join_predicates: list[Predicate] = []
        base_timespan_tag: ColumnTag | None = None
        full_columns: set[str] = set(columns)
        if (
            temporal_join_on
            and join_to is not None
            and any(r.type is CollectionType.CALIBRATION for r in collections)
        ):
            base_timespan_tag = DatasetColumnTag(dataset_type.name, "timespan")
            rhs = ColumnExpression.reference(base_timespan_tag, dtype=timespan.Timespan)
            full_columns.add("timespan")
            for timespan_tag in temporal_join_on:
                lhs = ColumnExpression.reference(timespan_tag, dtype=timespan.Timespan)
                join_predicates.append(lhs.predicate_method("overlaps", rhs))
        # Delegate to the concrete QueryBackend subclass to do most of the
        # work.
        result = self._make_dataset_query_relation_impl(
            dataset_type,
            collections,
            full_columns,
            context=context,
        )
        if join_to is not None:
            result = join_to.join(
                result, predicate=Predicate.logical_and(*join_predicates) if join_predicates else None
            )
            if join_predicates and "timespan" not in columns:
                # Drop the timespan column we added for the join only if the
                # timespan wasn't requested in its own right.
                result = result.with_only_columns(result.columns - {base_timespan_tag})
        return result
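
    # Illustrative usage sketch (not part of the original module): querying a
    # calibration dataset type and joining it to an existing relation via a
    # timespan-overlap predicate. ``backend``, ``context``, ``records``,
    # ``bias_type``, ``exposure_relation``, and ``exposure_timespan_tag`` are
    # all hypothetical stand-ins.
    #
    #     relation = backend.make_dataset_query_relation(
    #         bias_type,
    #         records,  # from resolve_dataset_collections; may include CALIBRATION
    #         {"dataset_id", "run"},
    #         context,
    #         join_to=exposure_relation,  # must already contain the timespan column
    #         temporal_join_on={exposure_timespan_tag},
    #     )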

    def make_dataset_search_relation(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: _C,
        *,
        join_to: Relation | None = None,
        temporal_join_on: Set[ColumnTag] = frozenset(),
    ) -> Relation:
        """Construct a relation that represents an ordered query for datasets
        that returns results from the first matching collection for each data
        ID.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type for the datasets being searched.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Records for collections to search. Should generally be the result
            of a call to `resolve_dataset_collections`, and must not be empty.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the ``relation``. See
            `make_dataset_query_relation` for options.
        context : `QueryContext`
            Context that manages per-query state.
        join_to : `Relation`, optional
            Another relation to join with the query for datasets in all
            collections before filtering out shadowed datasets.
        temporal_join_on : `~collections.abc.Set` [ `ColumnTag` ], optional
            Timespan columns in ``join_to`` that calibration dataset timespans
            must overlap. Must already be present in ``join_to``. Ignored if
            ``join_to`` is `None` or if there are no calibration collections.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation representing a find-first dataset search.
        """
        base = self.make_dataset_query_relation(
            dataset_type,
            collections,
            columns | {"rank"},
            context=context,
            join_to=join_to,
            temporal_join_on=temporal_join_on,
        )
        # Query-simplification shortcut: if there is only one collection, a
        # find-first search is just a regular result subquery. Same if there
        # are no collections.
        if len(collections) <= 1:
            return base
        # We filter the dimension keys in the given relation through
        # DimensionGraph.required.names to minimize the set we partition on
        # and order it in a more index-friendly way. More precisely, any
        # index we define on dimensions will be consistent with this order, but
        # any particular index may not have the same dimension columns.
        dimensions = self.universe.extract(
            [tag.dimension for tag in DimensionKeyColumnTag.filter_from(base.columns)]
        )
        find_first = FindFirstDataset(
            dimensions=DimensionKeyColumnTag.generate(dimensions.required.names),
            rank=DatasetColumnTag(dataset_type.name, "rank"),
        )
        return find_first.apply(
            base, preferred_engine=context.preferred_engine, require_preferred_engine=True
        ).with_only_columns(base.columns - {find_first.rank})
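
    # Illustrative usage sketch (not part of the original module): a find-first
    # search over an ordered list of collections, keeping only the result from
    # the first collection that has a dataset for each data ID. ``backend``,
    # ``context``, ``records``, and ``calexp_type`` are hypothetical.
    #
    #     relation = backend.make_dataset_search_relation(
    #         calexp_type,
    #         records,  # ordered; earlier collections shadow later ones
    #         {"dataset_id", "run"},
    #         context,
    #     )
    #     # ``relation`` can then be executed through the backend's QueryContext.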

    def make_doomed_dataset_relation(
        self,
        dataset_type: DatasetType,
        columns: Set[str],
        messages: Iterable[str],
        context: _C,
    ) -> Relation:
        """Construct a relation that represents a doomed query for datasets.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type being queried.
        columns : `~collections.abc.Set` [ `str` ]
            Dataset columns to include (dimension key columns are always
            included). See `make_dataset_query_relation` for allowed values.
        messages : `~collections.abc.Iterable` [ `str` ]
            Diagnostic messages that explain why the query is doomed to yield
            no rows.
        context : `QueryContext`
            Context that manages per-query state.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation with the requested columns and no rows.
        """
        column_tags: set[ColumnTag] = set(
            DimensionKeyColumnTag.generate(dataset_type.dimensions.required.names)
        )
        column_tags.update(DatasetColumnTag.generate(dataset_type.name, columns))
        return context.preferred_engine.make_doomed_relation(columns=column_tags, messages=list(messages))

    @abstractmethod
    def make_dimension_relation(
        self,
        dimensions: DimensionGraph,
        columns: Set[ColumnTag],
        context: _C,
        *,
        initial_relation: Relation | None = None,
        initial_join_max_columns: frozenset[ColumnTag] | None = None,
        initial_dimension_relationships: Set[frozenset[str]] | None = None,
        spatial_joins: Iterable[tuple[str, str]] = (),
        governor_constraints: Mapping[str, Set[str]],
    ) -> Relation:
        """Construct a relation that provides columns and constraints from
        dimension records.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            Dimensions to include. The key columns for all dimensions (both
            required and implied) will be included in the returned relation.
        columns : `~collections.abc.Set` [ `ColumnTag` ]
            Dimension record columns to include. This set may include key
            column tags as well, though these may be ignored; the set of key
            columns to include is determined by the ``dimensions`` argument
            instead.
        context : `QueryContext`
            Context that manages per-query state.
        initial_relation : `~lsst.daf.relation.Relation`, optional
            Initial relation to join to the dimension relations. If this
            relation provides record columns, key columns, and relationships
            between key columns (see ``initial_dimension_relationships`` below)
            that would otherwise have been added by joining in a dimension
            element's relation, that relation may not be joined in at all.
        initial_join_max_columns : `frozenset` [ `ColumnTag` ], optional
            Maximum superset of common columns for joins to
            ``initial_relation`` (i.e. columns in the ``ON`` expression of SQL
            ``JOIN`` clauses). If provided, this is a subset of the dimension
            key columns in ``initial_relation``, which are otherwise all
            considered as potential common columns for joins. Ignored if
            ``initial_relation`` is not provided.
        initial_dimension_relationships : `~collections.abc.Set` \
            [ `frozenset` [ `str` ] ], optional
            A set of sets of dimension names representing relationships between
            dimensions encoded in the rows of ``initial_relation``. If not
            provided (and ``initial_relation`` is),
            `extract_dimension_relationships` will be called on
            ``initial_relation``.
        spatial_joins : `collections.abc.Iterable` [ `tuple` [ `str`, `str` ] ]
            Iterable of dimension element name pairs that should be spatially
            joined.
        governor_constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ], optional
            Constraints on governor dimensions that are provided by other parts
            of the query that either have been included in ``initial_relation``
            or are guaranteed to be added in the future. This is a mapping from
            governor dimension name to sets of values that dimension may take.

        Returns
        -------
        relation : `lsst.daf.relation.Relation`
            Relation containing the given dimension columns and constraints.
        """
        raise NotImplementedError()
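
    # Illustrative usage sketch (not part of the original module): building a
    # dimension relation with a spatial join. ``backend`` and ``context`` are
    # hypothetical, and the dimension and governor names assume a default-like
    # dimension universe.
    #
    #     dimensions = backend.universe.extract(["visit", "patch"])
    #     relation = backend.make_dimension_relation(
    #         dimensions,
    #         columns=set(),
    #         context=context,
    #         spatial_joins=[("visit", "patch")],
    #         governor_constraints={"instrument": {"HSC"}, "skymap": {"hsc_rings_v1"}},
    #     )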

    @abstractmethod
    def resolve_governor_constraints(
        self, dimensions: DimensionGraph, constraints: Mapping[str, Set[str]], context: _C
    ) -> Mapping[str, Set[str]]:
        """Resolve governor dimension constraints provided by user input to
        a query against the content in the `Registry`.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            Dimensions that bound the governor dimensions to consider (via
            ``dimensions.governors``, more specifically).
        constraints : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ]
            Constraints from user input to the query (e.g. from data IDs and
            string expression predicates).
        context : `QueryContext`
            Object that manages state for the query; used here to fetch the
            governor dimension record cache if it has not already been loaded.

        Returns
        -------
        resolved : `~collections.abc.Mapping` [ `str`, \
            `~collections.abc.Set` [ `str` ] ]
            A shallow copy of ``constraints`` with keys equal to
            ``dimensions.governors.names`` and value sets constrained by the
            Registry content if they were not already in ``constraints``.

        Raises
        ------
        DataIdValueError
            Raised if ``constraints`` includes governor dimension values that
            are not present in the `Registry`.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dimension_record_cache(
        self, element_name: str, context: _C
    ) -> Mapping[DataCoordinate, DimensionRecord] | None:
        """Return a local cache of all `DimensionRecord` objects for a
        dimension element, fetching it if necessary.

        Parameters
        ----------
        element_name : `str`
            Name of the dimension element.
        context : `.queries.SqlQueryContext`
            Context to be used to execute queries when no cached result is
            available.

        Returns
        -------
        cache : `~collections.abc.Mapping` [ `DataCoordinate`, \
            `DimensionRecord` ] or `None`
            Mapping from data ID to dimension record, or `None` if this
            element's records are never cached.
        """
        raise NotImplementedError()

    def extract_dimension_relationships(self, relation: Relation) -> set[frozenset[str]]:
        """Extract the dimension key relationships encoded in a relation tree.

        Parameters
        ----------
        relation : `Relation`
            Relation tree to process.

        Returns
        -------
        relationships : `set` [ `frozenset` [ `str` ] ]
            Set of sets of dimension names, where each inner set represents a
            relationship between dimensions.

        Notes
        -----
        Dimension relationships include both many-to-one implied dependencies
        and many-to-many joins backed by "always-join" dimension elements, and
        it's important to join in the dimension table that defines a
        relationship in any query involving dimensions that are a superset of
        that relationship. For example, let's consider a relation tree that
        joins dataset existence-check relations for two dataset types, with
        dimensions ``{instrument, exposure, detector}`` and ``{instrument,
        physical_filter}``. The joined relation appears to have all dimension
        keys in its expanded graph present except ``band``, and the system
        could easily correct this by joining that dimension in directly. But
        it's also missing the ``{instrument, exposure, physical_filter}``
        relationship we'd get from the ``exposure`` dimension's own relation
        (``exposure`` implies ``physical_filter``) and the similar
        ``{instrument, physical_filter, band}`` relationship from the
        ``physical_filter`` dimension relation; we need the relationship logic
        to recognize that those dimensions need to be joined in as well in
        order for the full relation to have rows that represent valid data IDs.

        The implementation of this method relies on the assumption that
        `LeafRelation` objects always have rows that are consistent with all
        defined relationships (i.e. are valid data IDs). This is true not just
        for dimension relations themselves, but for anything created from
        queries based on them, including datasets and query results. It is
        possible to construct `LeafRelation` objects that don't satisfy this
        criterion (e.g. when accepting user-provided data IDs), and in this
        case higher-level guards or warnings must be provided.
        """
        return {
            frozenset(
                tag.dimension
                for tag in DimensionKeyColumnTag.filter_from(leaf_relation.columns & relation.columns)
            )
            for leaf_relation in self._extract_leaf_relations(relation).values()
        }
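
    # Illustrative sketch of the return value (not part of the original
    # module): for the example discussed in the Notes above, the result would
    # contain one ``frozenset`` of dimension names per leaf relation, e.g.
    #
    #     {
    #         frozenset({"instrument", "exposure", "detector"}),
    #         frozenset({"instrument", "physical_filter"}),
    #     }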

    def _extract_leaf_relations(self, relation: Relation) -> dict[str, LeafRelation]:
        """Recursively extract leaf relations from a relation tree.

        Parameters
        ----------
        relation : `Relation`
            Tree to process.

        Returns
        -------
        leaves : `dict` [ `str`, `LeafRelation` ]
            Leaf relations, keyed and deduplicated by name.
        """
        match relation:
            case LeafRelation() as leaf:
                return {leaf.name: leaf}
            case UnaryOperationRelation(target=target):
                return self._extract_leaf_relations(target)
            case BinaryOperationRelation(lhs=lhs, rhs=rhs):
                return self._extract_leaf_relations(lhs) | self._extract_leaf_relations(rhs)
            case MarkerRelation(target=target):
                return self._extract_leaf_relations(target)
        raise AssertionError("Match should be exhaustive and all branches should return.")