Coverage for python/lsst/daf/butler/queries/result_specs.py: 43%

124 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-26 02:48 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Public API of this module. GeneralResultSpec and the ResultSpec discriminated
# union are defined below and are part of the public surface, so they are
# exported here alongside the other result-spec classes.
__all__ = (
    "ResultSpecBase",
    "DataCoordinateResultSpec",
    "DimensionRecordResultSpec",
    "DatasetRefResultSpec",
    "GeneralResultSpec",
    "ResultSpec",
)

36 

37from abc import ABC, abstractmethod 

38from collections.abc import Mapping 

39from typing import Annotated, Literal, TypeAlias, Union, cast 

40 

41import pydantic 

42 

43from ..dimensions import DimensionElement, DimensionGroup 

44from .tree import ColumnSet, DatasetFieldName, InvalidQueryError, OrderExpression, QueryTree 

45 

46 

class ResultSpecBase(pydantic.BaseModel, ABC):
    """Base class for all query-result specification objects.

    A result specification is a struct that is combined with a `QueryTree` to
    represent a serializable query-results object.
    """

    result_type: str
    """String literal that corresponds to a concrete derived type."""

    order_by: tuple[OrderExpression, ...] = ()
    """Expressions to sort the rows by."""

    limit: int | None = None
    """Maximum number of rows to return, or `None` for no bound."""

    def validate_tree(self, tree: QueryTree) -> None:
        """Check that this result object is consistent with a query tree.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree that defines the joins and row-filtering that these
            results will come from.
        """
        # Concrete subclasses all provide `dimensions`; cast to the union so
        # the type checker lets us use it here.
        result_spec = cast(ResultSpec, self)
        if not result_spec.dimensions <= tree.dimensions:
            raise InvalidQueryError(
                f"Query result specification has dimensions {result_spec.dimensions} "
                f"that are not a subset of the "
                f"query's dimensions {tree.dimensions}."
            )
        columns = result_spec.get_result_columns()
        assert columns.dimensions == result_spec.dimensions, "enforced by ResultSpec implementations"
        # Every dataset type appearing in the result columns must have been
        # joined into the query tree; report the first one that was not.
        unavailable = next(
            (dataset_type for dataset_type in columns.dataset_fields if dataset_type not in tree.datasets),
            None,
        )
        if unavailable is not None:
            raise InvalidQueryError(f"Dataset {unavailable!r} is not available from this query.")
        # Collect all columns referenced by the order-by terms and apply the
        # same two checks (dimensions subset, dataset types joined) to them.
        sort_columns = ColumnSet(result_spec.dimensions)
        for expression in result_spec.order_by:
            expression.gather_required_columns(sort_columns)
        if not sort_columns.dimensions <= result_spec.dimensions:
            raise InvalidQueryError(
                "Order-by expression may not reference columns that are not in the result dimensions."
            )
        for dataset_type in sort_columns.dataset_fields:
            if dataset_type not in tree.datasets:
                raise InvalidQueryError(
                    f"Dataset type {dataset_type!r} in order-by expression is not part of the query."
                )

    @property
    def find_first_dataset(self) -> str | None:
        """The dataset type for which find-first resolution is required, if
        any.
        """
        # Only dataset-producing specs need find-first; default is none.
        return None

    @abstractmethod
    def get_result_columns(self) -> ColumnSet:
        """Return the columns included in the actual result rows.

        This does not necessarily include all columns required by the
        `order_by` terms that are also a part of this spec.
        """
        raise NotImplementedError()

111 

112 

class DataCoordinateResultSpec(ResultSpecBase):
    """Specification for a query that yields `DataCoordinate` objects."""

    result_type: Literal["data_coordinate"] = "data_coordinate"

    dimensions: DimensionGroup
    """The dimensions of the data IDs returned by this query."""

    include_dimension_records: bool = False
    """Whether the returned data IDs include dimension records."""

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for element_name in self.dimensions.elements:
                element = universe[element_name]
                # Cached elements don't need their record fields fetched as
                # part of the query rows.
                if not element.is_cached:
                    columns.dimension_fields[element_name].update(element.schema.remainder.names)
        return columns

133 

134 

class DimensionRecordResultSpec(ResultSpecBase):
    """Specification for a query that yields `DimensionRecord` objects."""

    result_type: Literal["dimension_record"] = "dimension_record"

    element: DimensionElement
    """The name and definition of the dimension records returned by this query.
    """

    @property
    def dimensions(self) -> DimensionGroup:
        """The dimensions that are required or implied (directly or indirectly)
        by this dimension element.
        """
        return self.element.minimal_group

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        element = self.element
        columns = ColumnSet(element.minimal_group)
        columns.dimension_fields[element.name].update(element.schema.remainder.names)
        # Keep only the element's own dimension keys; drop the rest of the
        # minimal group.
        columns.drop_dimension_keys(element.minimal_group.names - element.dimensions.names)
        return columns

157 

158 

class DatasetRefResultSpec(ResultSpecBase):
    """Specification for a query that yields `DatasetRef` objects."""

    result_type: Literal["dataset_ref"] = "dataset_ref"

    dataset_type_name: str
    """The dataset type name of the datasets returned by this query."""

    dimensions: DimensionGroup
    """The dimensions of the datasets returned by this query."""

    storage_class_name: str
    """The name of the storage class of the datasets returned by this query."""

    include_dimension_records: bool = False
    """Whether the data IDs returned by this query include dimension records.
    """

    find_first: bool
    """Whether this query should resolve data ID duplicates according to the
    order of the collections to be searched.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if not self.find_first:
            return None
        return self.dataset_type_name

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        # DatasetRef construction always needs the dataset ID and its RUN
        # collection.
        columns.dataset_fields[self.dataset_type_name].update({"dataset_id", "run"})
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for element_name in self.dimensions.elements:
                element = universe[element_name]
                # Cached elements don't need their record fields fetched as
                # part of the query rows.
                if not element.is_cached:
                    columns.dimension_fields[element_name].update(element.schema.remainder.names)
        return columns

197 

198 

class GeneralResultSpec(ResultSpecBase):
    """Specification for a query that yields a table with
    an explicit list of columns.
    """

    result_type: Literal["general"] = "general"

    dimensions: DimensionGroup
    """The dimensions that span all fields returned by this query."""

    dimension_fields: Mapping[str, set[str]]
    """Dimension record fields included in this query."""

    dataset_fields: Mapping[str, set[DatasetFieldName]]
    """Dataset fields included in this query."""

    find_first: bool
    """Whether this query requires find-first resolution for a dataset.

    This can only be `True` if exactly one dataset type's fields are included
    in the results.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if not self.find_first:
            return None
        # The validator guarantees exactly one dataset type when find_first
        # is set, so this unpacking cannot fail.
        (dataset_type,) = self.dataset_fields.keys()
        return dataset_type

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        for element_name, element_fields in self.dimension_fields.items():
            columns.dimension_fields[element_name].update(element_fields)
        for dataset_type, dataset_fields in self.dataset_fields.items():
            columns.dataset_fields[dataset_type].update(dataset_fields)
        return columns

    @pydantic.model_validator(mode="after")
    def _validate(self) -> GeneralResultSpec:
        # Cross-field invariants that pydantic's per-field validation cannot
        # express; raise order matches declaration order of the checks.
        if self.find_first and len(self.dataset_fields) != 1:
            raise InvalidQueryError("find_first=True requires exactly one result dataset type.")
        for element_name, element_fields in self.dimension_fields.items():
            if element_name not in self.dimensions.elements:
                raise InvalidQueryError(f"Dimension element {element_name} is not in {self.dimensions}.")
            if not element_fields:
                raise InvalidQueryError(
                    f"Empty dimension element field set for {element_name!r} is not permitted."
                )
        for dataset_type, dataset_fields in self.dataset_fields.items():
            if not dataset_fields:
                raise InvalidQueryError(f"Empty dataset field set for {dataset_type!r} is not permitted.")
        return self

254 

255 

# Discriminated union of all concrete result-spec types. Pydantic uses the
# `result_type` literal field to select the concrete class when validating or
# deserializing, avoiding trial-and-error matching across the union members.
ResultSpec: TypeAlias = Annotated[
    Union[DataCoordinateResultSpec, DimensionRecordResultSpec, DatasetRefResultSpec, GeneralResultSpec],
    pydantic.Field(discriminator="result_type"),
]