Coverage for python/lsst/daf/butler/queries/result_specs.py: 43%

126 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-07 11:04 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ( 

31 "ResultSpecBase", 

32 "DataCoordinateResultSpec", 

33 "DimensionRecordResultSpec", 

34 "DatasetRefResultSpec", 

35) 

36 

37from abc import ABC, abstractmethod 

38from collections.abc import Mapping 

39from typing import Annotated, Literal, TypeAlias, Union, cast 

40 

41import pydantic 

42 

43from ..dimensions import DimensionElement, DimensionGroup 

44from .tree import ColumnSet, DatasetFieldName, InvalidQueryError, OrderExpression, QueryTree 

45 

46 

class ResultSpecBase(pydantic.BaseModel, ABC):
    """Base class for all query-result specification objects.

    A result specification is a struct that is combined with a `QueryTree` to
    represent a serializable query-results object.
    """

    result_type: str
    """String literal that corresponds to a concrete derived type."""

    order_by: tuple[OrderExpression, ...] = ()
    """Expressions to sort the rows by."""

    offset: int = 0
    """Index of the first row to return."""

    limit: int | None = None
    """Maximum number of rows to return, or `None` for no bound."""

    def validate_tree(self, tree: QueryTree) -> None:
        """Check that this result object is consistent with a query tree.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree that defines the joins and row-filtering that these
            results will come from.
        """
        resolved = cast(ResultSpec, self)
        # The result rows may only use dimensions the query actually joins in.
        if not resolved.dimensions <= tree.dimensions:
            raise InvalidQueryError(
                f"Query result specification has dimensions {resolved.dimensions} that are not a subset "
                f"of the query's dimensions {tree.dimensions}."
            )
        result_columns = resolved.get_result_columns()
        assert result_columns.dimensions == resolved.dimensions, "enforced by ResultSpec implementations"
        # Any dataset whose fields appear in the results must have been joined
        # into the query tree.
        unavailable = next(
            (name for name in result_columns.dataset_fields if name not in tree.datasets), None
        )
        if unavailable is not None:
            raise InvalidQueryError(f"Dataset {unavailable!r} is not available from this query.")
        # Validate the order-by terms the same way: their columns must be
        # covered by the result dimensions and the query's datasets.
        sort_columns = ColumnSet(resolved.dimensions)
        for expression in resolved.order_by:
            expression.gather_required_columns(sort_columns)
        if not sort_columns.dimensions <= resolved.dimensions:
            raise InvalidQueryError(
                "Order-by expression may not reference columns that are not in the result dimensions."
            )
        for name in sort_columns.dataset_fields.keys():
            if name not in tree.datasets:
                raise InvalidQueryError(
                    f"Dataset type {name!r} in order-by expression is not part of the query."
                )

    @property
    def find_first_dataset(self) -> str | None:
        """The dataset type for which find-first resolution is required, if
        any.
        """
        return None

    @abstractmethod
    def get_result_columns(self) -> ColumnSet:
        """Return the columns included in the actual result rows.

        This does not necessarily include all columns required by the
        `order_by` terms that are also a part of this spec.
        """
        raise NotImplementedError()

115 

class DataCoordinateResultSpec(ResultSpecBase):
    """Specification for a query that yields `DataCoordinate` objects."""

    result_type: Literal["data_coordinate"] = "data_coordinate"

    dimensions: DimensionGroup
    """The dimensions of the data IDs returned by this query."""

    include_dimension_records: bool = False
    """Whether the returned data IDs include dimension records."""

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for element_name in self.dimensions.elements:
                element = universe[element_name]
                # Fields of cached elements are not included in the rows.
                if not element.is_cached:
                    columns.dimension_fields[element_name].update(element.schema.remainder.names)
        return columns

136 

137 

class DimensionRecordResultSpec(ResultSpecBase):
    """Specification for a query that yields `DimensionRecord` objects."""

    result_type: Literal["dimension_record"] = "dimension_record"

    element: DimensionElement
    """The name and definition of the dimension records returned by this query.
    """

    @property
    def dimensions(self) -> DimensionGroup:
        """The dimensions that are required or implied (directly or indirectly)
        by this dimension element.
        """
        return self.element.minimal_group

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        element = self.element
        columns = ColumnSet(element.minimal_group)
        columns.dimension_fields[element.name].update(element.schema.remainder.names)
        # Keys outside the element's own dimensions are not part of the record.
        columns.drop_dimension_keys(element.minimal_group.names - element.dimensions.names)
        return columns

160 

161 

class DatasetRefResultSpec(ResultSpecBase):
    """Specification for a query that yields `DatasetRef` objects."""

    result_type: Literal["dataset_ref"] = "dataset_ref"

    dataset_type_name: str
    """The dataset type name of the datasets returned by this query."""

    dimensions: DimensionGroup
    """The dimensions of the datasets returned by this query."""

    storage_class_name: str
    """The name of the storage class of the datasets returned by this query."""

    include_dimension_records: bool = False
    """Whether the data IDs returned by this query include dimension records.
    """

    find_first: bool
    """Whether this query should resolve data ID duplicates according to the
    order of the collections to be searched.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if self.find_first:
            return self.dataset_type_name
        return None

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        # A DatasetRef always needs its UUID and its RUN collection name.
        columns.dataset_fields[self.dataset_type_name].update({"dataset_id", "run"})
        if self.include_dimension_records:
            universe = self.dimensions.universe
            for element_name in self.dimensions.elements:
                element = universe[element_name]
                # Fields of cached elements are not included in the rows.
                if not element.is_cached:
                    columns.dimension_fields[element_name].update(element.schema.remainder.names)
        return columns

200 

201 

class GeneralResultSpec(ResultSpecBase):
    """Specification for a query that yields a table with
    an explicit list of columns.
    """

    result_type: Literal["general"] = "general"

    dimensions: DimensionGroup
    """The dimensions that span all fields returned by this query."""

    dimension_fields: Mapping[str, set[str]]
    """Dimension record fields included in this query."""

    dataset_fields: Mapping[str, set[DatasetFieldName]]
    """Dataset fields included in this query."""

    find_first: bool
    """Whether this query requires find-first resolution for a dataset.

    This can only be `True` if exactly one dataset type's fields are included
    in the results.
    """

    @property
    def find_first_dataset(self) -> str | None:
        # Docstring inherited.
        if not self.find_first:
            return None
        # The model validator guarantees exactly one dataset type when
        # find_first is set, so this unpacking cannot fail for valid specs.
        (only_dataset_type,) = self.dataset_fields.keys()
        return only_dataset_type

    def get_result_columns(self) -> ColumnSet:
        # Docstring inherited.
        columns = ColumnSet(self.dimensions)
        for element_name, element_fields in self.dimension_fields.items():
            columns.dimension_fields[element_name].update(element_fields)
        for dataset_type_name, fields in self.dataset_fields.items():
            columns.dataset_fields[dataset_type_name].update(fields)
        return columns

    @pydantic.model_validator(mode="after")
    def _validate(self) -> GeneralResultSpec:
        # find_first is only meaningful for a single result dataset type.
        if self.find_first and len(self.dataset_fields) != 1:
            raise InvalidQueryError("find_first=True requires exactly one result dataset type.")
        for element_name, element_fields in self.dimension_fields.items():
            if element_name not in self.dimensions.elements:
                raise InvalidQueryError(f"Dimension element {element_name} is not in {self.dimensions}.")
            if not element_fields:
                raise InvalidQueryError(
                    f"Empty dimension element field set for {element_name!r} is not permitted."
                )
        for dataset_type_name, fields in self.dataset_fields.items():
            if not fields:
                raise InvalidQueryError(
                    f"Empty dataset field set for {dataset_type_name!r} is not permitted."
                )
        return self

257 

258 

# Discriminated union of the concrete result-spec types; pydantic selects the
# concrete model by inspecting the "result_type" literal field.
ResultSpec: TypeAlias = Annotated[
    Union[DataCoordinateResultSpec, DimensionRecordResultSpec, DatasetRefResultSpec, GeneralResultSpec],
    pydantic.Field(discriminator="result_type"),
]