Coverage for python / lsst / daf / butler / direct_query_driver / _query_analysis.py: 67%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-22 08:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Public API of this module.  QueryTreeAnalysis is defined and documented
# below exactly like the other public structs, so it is exported here too
# (it was previously missing from this tuple).  Kept in alphabetical order.
__all__ = (
    "QueryCollectionAnalysis",
    "QueryFindFirstAnalysis",
    "QueryJoinsAnalysis",
    "QueryTreeAnalysis",
    "ResolvedDatasetSearch",
)

36 

37import dataclasses 

38from collections.abc import Iterator, Mapping 

39from typing import TYPE_CHECKING, Generic, TypeVar 

40 

41from ..dimensions import DimensionElement, DimensionGroup 

42from ..queries import tree as qt 

43from ..registry import CollectionSummary 

44from ..registry.interfaces import CollectionRecord 

45 

46if TYPE_CHECKING: 

47 from ._postprocessing import Postprocessing 

48 from ._sql_builders import SqlSelectBuilder 

49 

_T = TypeVar("_T")  # dataset-type name payload: a single name or a group of names


@dataclasses.dataclass
class ResolvedDatasetSearch(Generic[_T]):
    """A struct describing one dataset search that has been joined into a
    query, recorded after its collection search path was resolved.
    """

    name: _T
    """Name or names of the dataset type(s) being searched for."""

    dimensions: DimensionGroup
    """Dimensions of the dataset type."""

    collection_records: list[CollectionRecord] = dataclasses.field(default_factory=list)
    """Records for the collections to search for this dataset, in search
    order, with any collection already removed that is inconsistent with
    the dataset type or with the query's data ID constraint.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages explaining why collections were filtered out of
    `collection_records`.
    """

    is_calibration_search: bool = False
    """`True` if any collection to be searched is a
    `~CollectionType.CALIBRATION` collection, `False` otherwise.

    Because only calibration datasets can live in
    `~CollectionType.CALIBRATION` collections, this also implies that the
    dataset type is a calibration.
    """

84 

85 

@dataclasses.dataclass
class QueryJoinsAnalysis:
    """A struct describing the "joins" section of a butler query.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    predicate: qt.Predicate
    """Boolean expression applied to result rows."""

    columns: qt.ColumnSet
    """All columns whose tables must be joined into the query.

    Updated after construction to include every column the `predicate`
    requires.
    """

    materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
    """Materializations to join into the query."""

    datasets: dict[str, ResolvedDatasetSearch[str]] = dataclasses.field(default_factory=dict)
    """Dataset searches to join into the query."""

    data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
        default_factory=dict
    )
    """Data coordinate uploads to join into the query."""

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages reporting reasons why the query may return no
    rows.
    """

    def __post_init__(self) -> None:
        # Fold the predicate's column requirements into `columns` up front.
        self.predicate.gather_required_columns(self.columns)

    def iter_mandatory(self, union_dataset_dimensions: DimensionGroup | None) -> Iterator[DimensionElement]:
        """Iterate over the dimension elements whose tables must be joined
        into the query.

        These elements either supply "field" (non-key) columns or define
        relationships that result rows must be consistent with.  They do
        not necessarily cover every dimension key in `columns`, because a
        key column can usually be obtained from several different tables.

        Parameters
        ----------
        union_dataset_dimensions : `DimensionGroup` or `None`
            Dimensions of the union dataset types, or `None` if this is not
            a union dataset query.

        Returns
        -------
        elements : `~collections.abc.Iterator` [ `DimensionElement` ]
            Iterator over the mandatory dimension elements.
        """
        for element_name in self.columns.dimensions.elements:
            element = self.columns.dimensions.universe[element_name]
            if self.columns.dimension_fields[element_name]:
                # Dimension record fields are only available from the
                # element's own table, so it is always mandatory.
                yield element
                continue
            if not element.defines_relationships:
                continue
            # Elements that define one-to-many / many-to-many relationships
            # must normally be joined in, but data coordinate uploads,
            # dataset searches, and materializations can provide the same
            # relationships.  Uploads and dataset tables carry only required
            # dimensions, so they can only cover relationships among those;
            # materializations have key columns for all of their dimensions.
            # The checks below short-circuit in the same order as before.
            covered = (
                any(
                    element.minimal_group.names <= upload_dims.required
                    for upload_dims in self.data_coordinate_uploads.values()
                )
                or any(
                    element.minimal_group.names <= dataset_spec.dimensions.required
                    for dataset_spec in self.datasets.values()
                )
                or (
                    union_dataset_dimensions is not None
                    and element.minimal_group.names <= union_dataset_dimensions.required
                )
                or any(
                    element in materialization_dims.names
                    for materialization_dims in self.materializations.values()
                )
            )
            if not covered:
                yield element

173 

174 

@dataclasses.dataclass
class QueryFindFirstAnalysis(Generic[_T]):
    """A struct describing the "find-first" stage of a butler query.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    search: ResolvedDatasetSearch[_T]
    """Information about the dataset type or types being searched for."""

    @property
    def dataset_type(self) -> _T:
        """Name(s) of the dataset type(s)."""
        return self.search.name

    def __bool__(self) -> bool:
        # A find-first search is only meaningful when there is more than
        # one collection to resolve between.
        records = self.search.collection_records
        return len(records) > 1

193 

194 

@dataclasses.dataclass
class QueryCollectionAnalysis:
    """A struct holding information about every collection that appears in
    a butler query.
    """

    collection_records: Mapping[str, CollectionRecord]
    """All collection records, keyed by collection name.

    CHAINED collections are included.
    """

    calibration_dataset_types: set[str | qt.AnyDatasetType] = dataclasses.field(default_factory=set)
    """The names of all calibration dataset types.

    If ``ANY_DATASET`` appears in the set, the dataset type union includes
    at least one calibration dataset type.
    """

    summaries_by_dataset_type: dict[
        str | qt.AnyDatasetType, list[tuple[CollectionRecord, CollectionSummary]]
    ] = dataclasses.field(default_factory=dict)
    """Collection records and summaries, in search order, keyed by dataset
    type name.

    CHAINED collections are flattened out in the nested lists.  Each list
    has been filtered to be consistent with the dataset types in the
    summaries, but not necessarily with the governor dimensions in the
    summaries.
    """

224 

225 

@dataclasses.dataclass
class QueryTreeAnalysis:
    """A struct aggregating every analysis result derived from the query
    tree.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    joins: QueryJoinsAnalysis
    """Analysis of the "joins" stage, covering all joins and columns that
    ``tree`` needs.  More columns are added to this plan later.
    """

    union_datasets: list[ResolvedDatasetSearch[list[str]]]
    """Resolved dataset searches expanding `QueryTree.any_dataset` into
    groups of dataset types that share a collection search path.
    """

    initial_select_builder: SqlSelectBuilder
    """In-progress SQL query builder, seeded with only the spatial and
    temporal overlaps.
    """

    postprocessing: Postprocessing
    """Struct representing post-query processing to be done in Python."""