Coverage for python/lsst/daf/butler/queries/result_specs.py: 42%

132 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-26 02:47 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Public API of this module. All concrete result-spec models, the
# discriminated-union alias, and its deferred-validation wrapper are exported
# so that `import *` and documentation tools see the full public surface.
__all__ = (
    "ResultSpecBase",
    "DataCoordinateResultSpec",
    "DimensionRecordResultSpec",
    "DatasetRefResultSpec",
    "GeneralResultSpec",
    "ResultSpec",
    "SerializedResultSpec",
)

36 

37from abc import ABC, abstractmethod 

38from collections.abc import Mapping 

39from typing import Annotated, Literal, TypeAlias, cast 

40 

41import pydantic 

42 

43from .._exceptions import InvalidQueryError 

44from ..dimensions import DimensionElement, DimensionGroup, DimensionUniverse 

45from ..pydantic_utils import DeferredValidation 

46from .tree import ColumnSet, DatasetFieldName, OrderExpression, QueryTree 

47 

48 

class ResultSpecBase(pydantic.BaseModel, ABC):
    """Base class for all query-result specification objects.

    A result specification is a struct that is combined with a `QueryTree` to
    represent a serializable query-results object.
    """

    result_type: str
    """String literal that corresponds to a concrete derived type."""

    order_by: tuple[OrderExpression, ...] = ()
    """Expressions to sort the rows by."""

    limit: int | None = None
    """Maximum number of rows to return, or `None` for no bound."""

    def validate_tree(self, tree: QueryTree) -> None:
        """Check that this result object is consistent with a query tree.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree that defines the joins and row-filtering that these
            results will come from.
        """
        spec = cast(ResultSpec, self)
        # Result rows may only use dimensions the query actually joins in.
        if not spec.dimensions <= tree.dimensions:
            raise InvalidQueryError(
                f"Query result specification has dimensions {spec.dimensions} that are not a subset of the "
                f"query's dimensions {tree.dimensions}."
            )
        columns = spec.get_result_columns()
        assert columns.dimensions == spec.dimensions, "enforced by ResultSpec implementations"
        # Any dataset fields in the result rows must come from dataset types
        # joined into the query tree.
        for name in columns.dataset_fields:
            if name not in tree.datasets:
                raise InvalidQueryError(f"Dataset {name!r} is not available from this query.")
        # Collect every column referenced by the order-by terms and apply the
        # same two checks (dimensions, dataset types) to that set.
        referenced = ColumnSet(spec.dimensions)
        for expression in spec.order_by:
            expression.gather_required_columns(referenced)
        if not (referenced.dimensions <= spec.dimensions):
            raise InvalidQueryError(
                "Order-by expression may not reference columns that are not in the result dimensions."
            )
        for name in referenced.dataset_fields.keys():
            if name not in tree.datasets:
                raise InvalidQueryError(
                    f"Dataset type {name!r} in order-by expression is not part of the query."
                )

    @property
    def find_first_dataset(self) -> str | None:
        """The dataset type for which find-first resolution is required, if
        any.
        """
        # Subclasses that perform find-first resolution override this.
        return None

    @abstractmethod
    def get_result_columns(self) -> ColumnSet:
        """Return the columns included in the actual result rows.

        This does not necessarily include all columns required by the
        `order_by` terms that are also a part of this spec.
        """
        raise NotImplementedError()

113 

114 

class DataCoordinateResultSpec(ResultSpecBase):
    """Specification for a query that yields `DataCoordinate` objects."""

    result_type: Literal["data_coordinate"] = "data_coordinate"

    dimensions: DimensionGroup
    """The dimensions of the data IDs returned by this query."""

    include_dimension_records: bool = False
    """Whether the returned data IDs include dimension records."""

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for name in self.dimensions.elements:
                element = universe[name]
                # Records for cached elements and skypix dimensions are not
                # fetched by the query itself, so they contribute no fields.
                if not element.is_cached and element not in universe.skypix_dimensions:
                    columns.dimension_fields[name].update(element.schema.remainder.names)
        return columns

135 

136 

class DimensionRecordResultSpec(ResultSpecBase):
    """Specification for a query that yields `DimensionRecord` objects."""

    result_type: Literal["dimension_record"] = "dimension_record"

    element: DimensionElement
    """The name and definition of the dimension records returned by this query.
    """

    @property
    def dimensions(self) -> DimensionGroup:
        """The dimensions that are required or implied (directly or indirectly)
        by this dimension element.
        """
        return self.element.minimal_group

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        element = self.element
        columns = ColumnSet(element.minimal_group)
        # Skypix dimension records are not stored, so only other elements
        # contribute record fields.
        if element not in self.dimensions.universe.skypix_dimensions:
            columns.dimension_fields[element.name].update(element.schema.remainder.names)
        # Drop key columns for dimensions beyond the element's own.
        columns.drop_dimension_keys(element.minimal_group.names - element.dimensions.names)
        return columns

160 

161 

class DatasetRefResultSpec(ResultSpecBase):
    """Specification for a query that yields `DatasetRef` objects."""

    result_type: Literal["dataset_ref"] = "dataset_ref"

    dataset_type_name: str
    """The dataset type name of the datasets returned by this query."""

    dimensions: DimensionGroup
    """The dimensions of the datasets returned by this query."""

    storage_class_name: str
    """The name of the storage class of the datasets returned by this query."""

    include_dimension_records: bool = False
    """Whether the data IDs returned by this query include dimension records.
    """

    find_first: bool
    """Whether this query should resolve data ID duplicates according to the
    order of the collections to be searched.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if not self.find_first:
            return None
        return self.dataset_type_name

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        # A DatasetRef always needs the dataset's ID and its RUN collection.
        columns.dataset_fields[self.dataset_type_name].update({"dataset_id", "run"})
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for name in self.dimensions.elements:
                element = universe[name]
                # Records for cached elements and skypix dimensions are not
                # fetched by the query itself, so they contribute no fields.
                if not element.is_cached and element not in universe.skypix_dimensions:
                    columns.dimension_fields[name].update(element.schema.remainder.names)
        return columns

200 

201 

class GeneralResultSpec(ResultSpecBase):
    """Specification for a query that yields a table with
    an explicit list of columns.
    """

    result_type: Literal["general"] = "general"

    dimensions: DimensionGroup
    """The dimensions that span all fields returned by this query."""

    dimension_fields: Mapping[str, set[str]]
    """Dimension record fields included in this query."""

    dataset_fields: Mapping[str, set[DatasetFieldName]]
    """Dataset fields included in this query."""

    find_first: bool
    """Whether this query requires find-first resolution for a dataset.

    This can only be `True` if exactly one dataset type's fields are included
    in the results.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if not self.find_first:
            return None
        # _validate guarantees exactly one dataset type when find_first=True.
        (dataset_type,) = self.dataset_fields.keys()
        return dataset_type

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        for element_name, element_fields in self.dimension_fields.items():
            columns.dimension_fields[element_name].update(element_fields)
        for dataset_type, fields in self.dataset_fields.items():
            columns.dataset_fields[dataset_type].update(fields)
        return columns

    @pydantic.model_validator(mode="after")
    def _validate(self) -> GeneralResultSpec:
        # Internal-consistency checks run by pydantic after construction.
        if self.find_first and len(self.dataset_fields) != 1:
            raise InvalidQueryError("find_first=True requires exactly one result dataset type.")
        for element_name, fields_for_element in self.dimension_fields.items():
            if element_name not in self.dimensions.elements:
                raise InvalidQueryError(f"Dimension element {element_name} is not in {self.dimensions}.")
            if not fields_for_element:
                raise InvalidQueryError(
                    f"Empty dimension element field set for {element_name!r} is not permitted."
                )
            # Skypix records (regions) are not stored, so requesting their
            # fields can never be satisfied.
            if element_name in self.dimensions.universe.skypix_dimensions.names:
                raise InvalidQueryError(
                    f"Regions for skypix dimension {element_name!r} are not stored; compute them via "
                    f"{element_name}.pixelization.pixel(id) instead."
                )
        for dataset_type, fields_for_dataset in self.dataset_fields.items():
            if not fields_for_dataset:
                raise InvalidQueryError(f"Empty dataset field set for {dataset_type!r} is not permitted.")
        return self

262 

263 

# Discriminated union of all concrete result-spec models: when validating,
# pydantic dispatches to the concrete type whose `result_type` literal
# matches the serialized value.
ResultSpec: TypeAlias = Annotated[
    DataCoordinateResultSpec | DimensionRecordResultSpec | DatasetRefResultSpec | GeneralResultSpec,
    pydantic.Field(discriminator="result_type"),
]

268 

269 

class SerializedResultSpec(DeferredValidation[ResultSpec]):
    """Serialized form of a `ResultSpec` whose full validation is deferred
    until a `DimensionUniverse` is available.
    """

    def to_result_spec(self, universe: DimensionUniverse) -> ResultSpec:
        """Validate the serialized data and return a concrete result spec.

        Parameters
        ----------
        universe : `DimensionUniverse`
            Dimension definitions, forwarded to `DeferredValidation.validated`
            (presumably as pydantic validation context — see that base class
            for details).

        Returns
        -------
        spec : `ResultSpec`
            The fully validated result specification.
        """
        return self.validated(universe=universe)