Coverage for python/lsst/daf/butler/registry/queries/_sql_query_backend.py: 16%
104 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("SqlQueryBackend",)

from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, cast

from lsst.daf.relation import ColumnError, ColumnExpression, ColumnTag, Join, Predicate, Relation

from ...core import (
    ColumnCategorization,
    DataCoordinate,
    DatasetType,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecord,
    DimensionRecordColumnTag,
    DimensionUniverse,
    SkyPixDimension,
)
from .._collectionType import CollectionType
from .._exceptions import DataIdValueError
from ..interfaces import CollectionRecord, Database
from ._query_backend import QueryBackend
from ._sql_query_context import SqlQueryContext

if TYPE_CHECKING:
    from ..managers import RegistryManagerInstances


class SqlQueryBackend(QueryBackend[SqlQueryContext]):
    """An implementation of `QueryBackend` for `SqlRegistry`.

    Parameters
    ----------
    db : `Database`
        Object that abstracts the database engine.
    managers : `RegistryManagerInstances`
        Struct containing the manager objects that back a `SqlRegistry`.
    """

    def __init__(
        self,
        db: Database,
        managers: RegistryManagerInstances,
    ):
        self._db = db
        self._managers = managers

    @property
    def universe(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._managers.dimensions.universe

    def context(self) -> SqlQueryContext:
        # Docstring inherited.
        return SqlQueryContext(self._db, self._managers.column_types)

    def get_collection_name(self, key: Any) -> str:
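        # Look up the collection record for this key in the collection manager
        # and return its name.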
        return self._managers.collections[key].name

    def resolve_collection_wildcard(
        self,
        expression: Any,
        *,
        collection_types: Set[CollectionType] = CollectionType.all(),
        done: set[str] | None = None,
        flatten_chains: bool = True,
        include_chains: bool | None = None,
    ) -> list[CollectionRecord]:
        # Docstring inherited.
        return self._managers.collections.resolve_wildcard(
            expression,
            collection_types=collection_types,
            done=done,
            flatten_chains=flatten_chains,
            include_chains=include_chains,
        )

    def resolve_dataset_type_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        # Docstring inherited.
        return self._managers.datasets.resolve_wildcard(
            expression, components, missing, explicit_only, components_deprecated
        )

    def filter_dataset_collections(
        self,
        dataset_types: Iterable[DatasetType],
        collections: Sequence[CollectionRecord],
        *,
        governor_constraints: Mapping[str, Set[str]],
        rejections: list[str] | None = None,
    ) -> dict[DatasetType, list[CollectionRecord]]:
        # Docstring inherited.
        result: dict[DatasetType, list[CollectionRecord]] = {
            dataset_type: [] for dataset_type in dataset_types
        }
        for dataset_type, filtered_collections in result.items():
            for collection_record in collections:
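                # A CALIBRATION collection can only contain calibration
                # datasets, so skip it (and explain why, if requested) for any
                # other dataset type; otherwise consult the collection summary
                # to see whether this dataset type and the governor constraints
                # could have matches in this collection.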
                if not dataset_type.isCalibration() and collection_record.type is CollectionType.CALIBRATION:
                    if rejections is not None:
                        rejections.append(
                            f"Not searching for non-calibration dataset of type {dataset_type.name!r} "
                            f"in CALIBRATION collection {collection_record.name!r}."
                        )
                else:
                    collection_summary = self._managers.datasets.getCollectionSummary(collection_record)
                    if collection_summary.is_compatible_with(
                        dataset_type,
                        governor_constraints,
                        rejections=rejections,
                        name=collection_record.name,
                    ):
                        filtered_collections.append(collection_record)
        return result

    def _make_dataset_query_relation_impl(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        # Docstring inherited.
        assert len(collections) > 0, (
            "Caller is responsible for handling the case of all collections being rejected (we can't "
            "write a good error message without knowing why collections were rejected)."
        )
        dataset_storage = self._managers.datasets.find(dataset_type.name)
        if dataset_storage is None:
            # Unrecognized dataset type means no results.
            return self.make_doomed_dataset_relation(
                dataset_type,
                columns,
                messages=[
                    f"Dataset type {dataset_type.name!r} is not registered, "
                    "so no instances of it can exist in any collection."
                ],
                context=context,
            )
        else:
            return dataset_storage.make_relation(
                *collections,
                columns=columns,
                context=context,
            )

    def make_dimension_relation(
        self,
        dimensions: DimensionGraph,
        columns: Set[ColumnTag],
        context: SqlQueryContext,
        *,
        initial_relation: Relation | None = None,
        initial_join_max_columns: frozenset[ColumnTag] | None = None,
        initial_dimension_relationships: Set[frozenset[str]] | None = None,
        spatial_joins: Iterable[tuple[str, str]] = (),
        governor_constraints: Mapping[str, Set[str]],
    ) -> Relation:
        # Docstring inherited.

        default_join = Join(max_columns=initial_join_max_columns)

        # Set up the relation variable we'll update as we join more relations
        # in, and ensure it is in the SQL engine.
        relation = context.make_initial_relation(initial_relation)

        if initial_dimension_relationships is None:
            relationships = self.extract_dimension_relationships(relation)
        else:
            relationships = set(initial_dimension_relationships)

        # Make a mutable copy of the columns argument.
        columns_required = set(columns)

        # Sort spatial joins to put those involving the commonSkyPix dimension
        # first, since those join subqueries might get reused in implementing
        # other joins later.
        spatial_joins = list(spatial_joins)
        spatial_joins.sort(key=lambda j: self.universe.commonSkyPix.name not in j)

        # Next we'll handle spatial joins, since those can require refinement
        # predicates that will need region columns to be included in the
        # relations we'll join.
        predicate: Predicate = Predicate.literal(True)
        for element1, element2 in spatial_joins:
            (overlaps, needs_refinement) = self._managers.dimensions.make_spatial_join_relation(
                element1,
                element2,
                context=context,
                governor_constraints=governor_constraints,
                existing_relationships=relationships,
            )
            if needs_refinement:
                predicate = predicate.logical_and(
                    context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element1, "region")),
                        ColumnExpression.reference(DimensionRecordColumnTag(element2, "region")),
                    )
                )
                columns_required.add(DimensionRecordColumnTag(element1, "region"))
                columns_required.add(DimensionRecordColumnTag(element2, "region"))
            relation = relation.join(overlaps)
            relationships.add(
                frozenset(self.universe[element1].dimensions.names | self.universe[element2].dimensions.names)
            )

        # All skypix columns need to come from either the initial_relation or a
        # spatial join, since we need all dimension key columns present in the
        # SQL engine and skypix regions are added by postprocessing in the
        # native iteration engine.
        for dimension in dimensions:
            if DimensionKeyColumnTag(dimension.name) not in relation.columns and isinstance(
                dimension, SkyPixDimension
            ):
                raise NotImplementedError(
                    f"Cannot construct query involving skypix dimension {dimension.name} unless "
                    "it is part of a dataset subquery, spatial join, or other initial relation."
                )

        # Before joining in new tables to provide columns, attempt to restore
        # them from the given relation by weakening projections applied to it.
        relation, _ = context.restore_columns(relation, columns_required)

        # Categorize columns not yet included in the relation to associate them
        # with dimension elements and detect bad inputs.
        missing_columns = ColumnCategorization.from_iterable(columns_required - relation.columns)
        if not (missing_columns.dimension_keys <= dimensions.names):
            raise ColumnError(
                "Cannot add dimension key column(s) "
                f"{{{', '.join(name for name in missing_columns.dimension_keys)}}} "
                f"that were not included in the given dimensions {dimensions}."
            )
        if missing_columns.datasets:
            raise ColumnError(
                f"Unexpected dataset columns {missing_columns.datasets} in call to make_dimension_relation; "
                "use make_dataset_query_relation or make_dataset_search_relation instead, or filter them "
                "out if they have already been added or will be added later."
            )
        for element_name in missing_columns.dimension_records:
            if element_name not in dimensions.elements.names:
                raise ColumnError(
                    f"Cannot join dimension element {element_name} whose dimensions are not a "
                    f"subset of {dimensions}."
                )

        # Iterate over all dimension elements whose relations definitely have
        # to be joined in. The order doesn't matter as long as we can assume
        # the database query optimizer is going to try to reorder them anyway.
        for element in dimensions.elements:
            columns_still_needed = missing_columns.dimension_records[element.name]
            # Two separate conditions in play here:
            # - if we need a record column (not just key columns) from this
            #   element, we have to join in its relation;
            # - if the element establishes a relationship between key columns
            #   that wasn't already established by the initial relation, we
            #   always join that element's relation. Any element with
            #   implied dependencies or the alwaysJoin flag establishes such a
            #   relationship.
            if columns_still_needed or (
                (element.alwaysJoin or element.implied)
                and frozenset(element.dimensions.names) not in relationships
            ):
                storage = self._managers.dimensions[element]
                relation = storage.join(relation, default_join, context)
        # At this point we've joined in all of the element relations that
        # definitely need to be included, but we may not have all of the
        # dimension key columns in the query that we want. To fill out that
        # set, we iterate over just the given DimensionGraph's dimensions (not
        # all dimension *elements*) in reverse topological order. That order
        # should reduce the total number of tables we bring in, since each
        # dimension will bring in keys for its required dependencies before we
        # get to those required dependencies.
        for dimension in self.universe.sorted(dimensions, reverse=True):
            if DimensionKeyColumnTag(dimension.name) not in relation.columns:
                storage = self._managers.dimensions[dimension]
                relation = storage.join(relation, default_join, context)

        # Add the predicates we constructed earlier, with a transfer to native
        # iteration first if necessary.
        if not predicate.as_trivial():
            relation = relation.with_rows_satisfying(
                predicate, preferred_engine=context.iteration_engine, transfer=True
            )

        # Finally project the new relation down to just the columns in the
        # initial relation, the dimension key columns, and the new columns
        # requested.
        columns_kept = set(columns)
        if initial_relation is not None:
            columns_kept.update(initial_relation.columns)
        columns_kept.update(DimensionKeyColumnTag.generate(dimensions.names))
        relation = relation.with_only_columns(columns_kept, preferred_engine=context.preferred_engine)

        return relation

    def resolve_governor_constraints(
        self, dimensions: DimensionGraph, constraints: Mapping[str, Set[str]], context: SqlQueryContext
    ) -> Mapping[str, Set[str]]:
        # Docstring inherited.
        result: dict[str, Set[str]] = {}
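        # For each governor dimension, validate any constraint values against
        # the full set of known values (governor records are always cached);
        # when no constraint is given, fall back to all known values.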
        for dimension in dimensions.governors:
            storage = self._managers.dimensions[dimension]
            records = storage.get_record_cache(context)
            assert records is not None, "Governor dimensions are always cached."
            all_values = {cast(str, data_id[dimension.name]) for data_id in records}
            if (constraint_values := constraints.get(dimension.name)) is not None:
                if not (constraint_values <= all_values):
                    raise DataIdValueError(
                        f"Unknown values specified for governor dimension {dimension.name}: "
                        f"{constraint_values - all_values}."
                    )
                result[dimension.name] = constraint_values
            else:
                result[dimension.name] = all_values
        return result

    def get_dimension_record_cache(
        self,
        element_name: str,
        context: SqlQueryContext,
    ) -> Mapping[DataCoordinate, DimensionRecord] | None:
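        # Forward to the storage object for this element; it may return None
        # if the element's records are not cached.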
        return self._managers.dimensions[element_name].get_record_cache(context)