Coverage for python/lsst/daf/butler/queries/convert_args.py: 7%

96 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-30 02:53 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ( 

31 "convert_where_args", 

32 "convert_order_by_args", 

33) 

34 

35import itertools 

36from collections.abc import Mapping, Set 

37from typing import Any, cast 

38 

39from .._exceptions import InvalidQueryError 

40from ..dimensions import DataCoordinate, DataId, Dimension, DimensionGroup 

41from .expression_factory import ExpressionFactory, ExpressionProxy 

42from .tree import ( 

43 DATASET_FIELD_NAMES, 

44 ColumnExpression, 

45 DatasetFieldName, 

46 DatasetFieldReference, 

47 DimensionFieldReference, 

48 DimensionKeyReference, 

49 OrderExpression, 

50 Predicate, 

51 Reversed, 

52 UnaryExpression, 

53 make_column_literal, 

54 validate_order_expression, 

55) 

56 

57 

def convert_where_args(
    dimensions: DimensionGroup,
    datasets: Set[str],
    *args: str | Predicate | DataId,
    bind: Mapping[str, Any] | None = None,
    **kwargs: Any,
) -> Predicate:
    """Standardize heterogeneous ``where`` arguments into a single predicate.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions already present in the query this filter is being applied
        to.  Returned predicates may reference dimensions outside this set.
    datasets : `~collections.abc.Set` [ `str` ]
        Dataset types already present in the query this filter is being
        applied to.  Returned predicates may reference datasets outside this
        set; this may be an error at a higher level, but it is not
        necessarily checked here.
    *args : `str`, `Predicate`, `DataCoordinate`, or \
            `~collections.abc.Mapping`
        Expressions to convert into predicates.
    bind : `~collections.abc.Mapping`, optional
        Mapping from identifier to literal value used when parsing string
        expressions.
    **kwargs : `object`
        Additional data ID key-value pairs.

    Returns
    -------
    predicate : `Predicate`
        Standardized predicate object.

    Notes
    -----
    Data ID values are not checked for consistency; they are extracted from
    args and then kwargs and combined, with later extractions taking
    precedence.
    """
    # Start from a trivially-true predicate and AND each constraint onto it.
    where = Predicate.from_bool(True)
    data_id: dict[str, Any] = {}
    for arg in args:
        if isinstance(arg, str):
            raise NotImplementedError("TODO: plug in registry.queries.expressions.parser")
        elif isinstance(arg, Predicate):
            where = where.logical_and(arg)
        elif isinstance(arg, DataCoordinate):
            data_id.update(arg.mapping)
        else:
            # Assume any remaining argument is a mapping-like data ID.
            data_id.update(arg)
    # kwargs win over positional data IDs, per the documented precedence.
    data_id.update(kwargs)
    for key, value in data_id.items():
        equality = Predicate.compare(
            DimensionKeyReference.model_construct(dimension=dimensions.universe.dimensions[key]),
            "==",
            make_column_literal(value),
        )
        where = where.logical_and(equality)
    return where

118 

119 

def convert_order_by_args(
    dimensions: DimensionGroup, datasets: Set[str], *args: str | OrderExpression | ExpressionProxy
) -> tuple[OrderExpression, ...]:
    """Standardize ``order_by`` arguments into validated order expressions.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions already present in the query whose rows are being sorted.
        Returned expressions may reference dimensions outside this set; this
        may be an error at a higher level, but it is not necessarily checked
        here.
    datasets : `~collections.abc.Set` [ `str` ]
        Dataset types already present in the query whose rows are being
        sorted.  Returned expressions may reference datasets outside this
        set; this may be an error at a higher level, but it is not
        necessarily checked here.
    *args : `OrderExpression`, `str`, or `ExpressionObject`
        Expression or column names to sort by.

    Returns
    -------
    expressions : `tuple` [ `OrderExpression`, ... ]
        Standardized expression objects.
    """
    converted: list[OrderExpression] = []
    for item in args:
        if isinstance(item, str):
            # A leading "-" on a column name requests descending order.
            descending = item.startswith("-")
            name = item[1:] if descending else item
            expression: Any = interpret_identifier(dimensions, datasets, name, {})
            if descending:
                expression = Reversed(operand=expression)
        elif isinstance(item, ExpressionProxy):
            expression = ExpressionFactory.unwrap(item)
        else:
            expression = item
        # Anything we accept must quack like a column expression.
        if not hasattr(expression, "expression_type"):
            raise TypeError(f"Unrecognized order-by argument: {expression!r}.")
        converted.append(validate_order_expression(expression))
    return tuple(converted)

161 

162 

def interpret_identifier(
    dimensions: DimensionGroup, datasets: Set[str], identifier: str, bind: Mapping[str, Any]
) -> ColumnExpression:
    """Associate an identifier in a ``where`` or ``order_by`` expression with
    a query column or bind literal.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions already present in the query this filter is being applied
        to.  Returned expressions may reference dimensions outside this set.
    datasets : `~collections.abc.Set` [ `str` ]
        Dataset types already present in the query this filter is being applied
        to.  Returned expressions may reference datasets outside this set.
    identifier : `str`
        String identifier to process.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Dictionary of bind literals to match identifiers against first.

    Returns
    -------
    expression : `ColumnExpression`
        Column expression corresponding to the identifier.

    Raises
    ------
    InvalidQueryError
        Raised if the identifier is ambiguous, references an unrecognized
        field, or cannot be resolved at all.
    """
    # Bind literals take precedence over any column interpretation.
    if identifier in bind:
        return make_column_literal(bind[identifier])
    # Dispatch on the number of dot-separated terms; any branch that falls
    # through without returning reaches the final InvalidQueryError below.
    terms = identifier.split(".")
    match len(terms):
        case 1:
            # Bare dimension name, e.g. "visit".
            if identifier in dimensions.universe.dimensions:
                return DimensionKeyReference.model_construct(
                    dimension=dimensions.universe.dimensions[identifier]
                )
            # This is an unqualified reference to a field of a dimension
            # element or datasets; this is okay if it's unambiguous.
            element_matches: set[str] = set()
            for element_name in dimensions.elements:
                element = dimensions.universe[element_name]
                if identifier in element.schema.names:
                    element_matches.add(element_name)
            if identifier in DATASET_FIELD_NAMES:
                # Could be a dataset field of any dataset type in the query.
                dataset_matches = set(datasets)
            else:
                dataset_matches = set()
            if len(element_matches) + len(dataset_matches) > 1:
                # More than one possible owner: report all candidates,
                # sorted for a deterministic error message.
                match_str = ", ".join(
                    f"'{x}.{identifier}'" for x in sorted(itertools.chain(element_matches, dataset_matches))
                )
                raise InvalidQueryError(
                    f"Ambiguous identifier {identifier!r} matches multiple fields: {match_str}."
                )
            elif element_matches:
                element = dimensions.universe[element_matches.pop()]
                return DimensionFieldReference.model_construct(element=element, field=identifier)
            elif dataset_matches:
                return DatasetFieldReference.model_construct(
                    dataset_type=dataset_matches.pop(), field=cast(DatasetFieldName, identifier)
                )
        case 2:
            first, second = terms
            if first in dimensions.universe.elements.names:
                element = dimensions.universe[first]
                if second in element.schema.dimensions.names:
                    if isinstance(element, Dimension) and second == element.primary_key.name:
                        # Identifier is something like "visit.id" which we want
                        # to interpret the same way as just "visit".
                        return DimensionKeyReference.model_construct(dimension=element)
                    else:
                        # Identifier is something like "visit.instrument",
                        # which we want to interpret the same way as just
                        # "instrument".
                        dimension = dimensions.universe.dimensions[second]
                        return DimensionKeyReference.model_construct(dimension=dimension)
                elif second in element.schema.remainder.names:
                    return DimensionFieldReference.model_construct(element=element, field=second)
                else:
                    raise InvalidQueryError(f"Unrecognized field {second!r} for {first}.")
            elif second in DATASET_FIELD_NAMES:
                # We just assume the dataset type is okay; it's the job of
                # higher-level code to complain otherwise.
                return DatasetFieldReference.model_construct(
                    dataset_type=first, field=cast(DatasetFieldName, second)
                )
            if first == "timespan":
                # "timespan.begin"/"timespan.end": resolve the bare
                # "timespan" field recursively, then wrap it in a unary
                # endpoint-extraction expression.
                base = interpret_identifier(dimensions, datasets, "timespan", bind)
                if second == "begin":
                    return UnaryExpression(operand=base, operator="begin_of")
                if second == "end":
                    return UnaryExpression(operand=base, operator="end_of")
            elif first in datasets:
                # Known dataset type, but `second` already failed the
                # DATASET_FIELD_NAMES check above.
                raise InvalidQueryError(
                    f"Identifier {identifier!r} references dataset type {first!r} but field "
                    f"{second!r} is not valid for datasets."
                )
        case 3:
            # e.g. "visit.timespan.begin": resolve the first two terms as a
            # (presumably timespan-valued) column, then extract an endpoint.
            base = interpret_identifier(dimensions, datasets, ".".join(terms[:2]), bind)
            if terms[2] == "begin":
                return UnaryExpression(operand=base, operator="begin_of")
            if terms[2] == "end":
                return UnaryExpression(operand=base, operator="end_of")
    raise InvalidQueryError(f"Unrecognized identifier {identifier!r}.")