Coverage for python/lsst/daf/butler/queries/expressions/parser/parserLex.py: 53%

81 statements  

coverage.py v7.13.5, created at 2026-04-30 08:41 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Module which defines the PLY lexer for user expressions parsed by
pre-flight.
"""

__all__ = ["LexToken", "ParserLex", "ParserLexError"]

import re
from typing import Any, Protocol

from .ply import lex

_RE_RANGE = r"(?P<start>-?\d+)\s*\.\.\s*(?P<stop>-?\d+)(\s*:\s*(?P<stride>[1-9]\d*))?"
"""Regular expression to match a range literal of the form NUM..NUM[:NUM];
it is attached to ``t_RANGE_LITERAL`` via the ``lex.TOKEN`` decorator.
"""


class LexToken(Protocol):
    """Protocol for LexToken defined in ``ply.lex``."""

    value: Any
    type: str
    lexer: Any
    lexdata: str
    lexpos: int
    lineno: int


class ParserLexError(Exception):
    """Exception raised for lex-phase errors.

    Parameters
    ----------
    expression : `str`
        Full initial expression being parsed.
    remain : `str`
        Remaining non-parsed part of the expression.
    pos : `int`
        Current parsing position, offset from the beginning of the expression
        in characters.
    lineno : `int`
        Current line number in the expression.
    """

    def __init__(self, expression: str, remain: str, pos: int, lineno: int):
        Exception.__init__(self, f"Unexpected character at position {pos}")
        self.expression = expression
        self.remain = remain
        self.pos = pos
        self.lineno = lineno
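# Callers can use the exception attributes for rich error reporting; an
# illustrative sketch:
#
#     try:
#         tokens = list(lexer)
#     except ParserLexError as exc:
#         print(f"line {exc.lineno}, pos {exc.pos}: cannot lex {exc.remain!r}")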


class ParserLex:
    """Class which defines the PLY lexer."""

    @classmethod
    def make_lexer(cls, reflags: int = 0, **kwargs: Any) -> Any:
        """Return a lexer.

        Parameters
        ----------
        reflags : `int`, optional
            Regular expression flags.
        **kwargs
            Additional parameters for the lexer.

        Returns
        -------
        `ply.lex.Lexer`
            Lexer instance.
        """
        # Make sure the flags that we need are always set.
        kw = dict(reflags=reflags | re.IGNORECASE | re.VERBOSE)
        kw.update(kwargs)

        return lex.lex(object=cls(), **kw)
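    # Typical usage follows the standard ply.lex.Lexer API. An illustrative
    # sketch (the expression string is an assumed example, not taken from
    # this module):
    #
    #     lexer = ParserLex.make_lexer()
    #     lexer.input("visit = 42 AND detector IN (1..4)")
    #     for tok in lexer:
    #         print(tok.type, tok.value)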

    # literals = ""

    # Reserved words in the grammar.
    # SQL has reserved words which we could potentially make reserved in our
    # grammar too; for now, pretend we don't care about SQL.
    reserved = dict(
        # IS="IS",
        IN="IN",
        # NULL="NULL",
        OR="OR",
        AND="AND",
        NOT="NOT",
        OVERLAPS="OVERLAPS",
        # BETWEEN="BETWEEN",
        # LIKE="LIKE",
        # ESCAPE="ESCAPE",
        # REGEXP="REGEXP"
    )

    # List of token names.
    tokens = (
        "NUMERIC_LITERAL",
        "TIME_LITERAL",
        "STRING_LITERAL",
        "RANGE_LITERAL",
        # 'DURATION_LITERAL',
        "QUALIFIED_IDENTIFIER",
        "SIMPLE_IDENTIFIER",
        "BIND_NAME",
        "LPAREN",
        "RPAREN",
        "EQ",
        "NE",
        "LT",
        "LE",
        "GT",
        "GE",
        "ADD",
        "SUB",
        "MUL",
        "DIV",
        "MOD",
        "COMMA",
    ) + tuple(reserved.values())

147 

148 # Regular expression rules for simple tokens 

149 t_LPAREN = r"\(" 

150 t_RPAREN = r"\)" 

151 t_EQ = "=" 

152 t_NE = "!=" 

153 t_LT = "<" 

154 t_LE = "<=" 

155 t_GT = ">" 

156 t_GE = ">=" 

157 t_ADD = r"\+" 

158 t_SUB = "-" 

159 t_MUL = r"\*" 

160 t_DIV = "/" 

161 t_MOD = "%" 

162 t_COMMA = "," 

163 

164 # A string containing ignored characters (spaces and tabs) 

165 t_ignore = " \t" 

166 

167 # Define a rule so we can track line numbers 

168 def t_newline(self, t: LexToken) -> None: 

169 r"""\n+""" 

170 t.lexer.lineno += len(t.value) 

    # Quoted string prefixed with 'T'.
    def t_TIME_LITERAL(self, t: LexToken) -> LexToken:
        """T'.*?'"""
        # Strip the leading T and the quotes.
        t.value = t.value[2:-1]
        return t
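    # For example, the input T'2020-01-01' yields a TIME_LITERAL token whose
    # value is the string "2020-01-01".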

    # Quoted string.
    def t_STRING_LITERAL(self, t: LexToken) -> LexToken:
        """'.*?'"""
        # Strip the quotes.
        t.value = t.value[1:-1]
        return t
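    # For example, 'DECam' yields a STRING_LITERAL with the value "DECam".
    # The non-greedy .*? stops at the first closing quote, so this rule as
    # written provides no escape mechanism for embedded quotes.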

    # Range literal in the format N..M[:S], spaces allowed, see _RE_RANGE.
    @lex.TOKEN(_RE_RANGE)
    def t_RANGE_LITERAL(self, t: LexToken) -> LexToken:
        match = re.match(_RE_RANGE, t.value)
        assert match is not None, "Guaranteed by tokenization"
        start = int(match.group("start"))
        stop = int(match.group("stop"))
        stride = match.group("stride")
        if stride is not None:
            stride = int(stride)
        t.value = (start, stop, stride)
        return t
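    # For example, "1..10:2" yields the value (1, 10, 2), while "1..10"
    # yields (1, 10, None).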

    # Numbers are used as strings by the parser; do not convert them.
    def t_NUMERIC_LITERAL(self, t: LexToken) -> LexToken:
        r"""\d+(\.\d*)?(e[-+]?\d+)? # 1, 1., 1.1, 1e10, 1.1e-10, etc.
        |
        \.\d+(e[-+]?\d+)? # .1, .1e10, .1e+10
        """
        return t
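    # The re.VERBOSE flag forced in make_lexer() is what permits the
    # whitespace and inline comments in the pattern above, and re.IGNORECASE
    # allows the exponent to be written as "E" as well as "e".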

    # Qualified identifiers have one or two dots.
    def t_QUALIFIED_IDENTIFIER(self, t: LexToken) -> LexToken:
        r"""[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*){1,2}"""
        t.type = "QUALIFIED_IDENTIFIER"
        return t
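    # For example, "instrument.detector" and "instrument.detector.raft"
    # (illustrative names) each lex as a single QUALIFIED_IDENTIFIER token.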

    # We only support ASCII in identifier names.
    def t_SIMPLE_IDENTIFIER(self, t: LexToken) -> LexToken:
        """[a-zA-Z_][a-zA-Z0-9_]*"""
        # Check for reserved words and normalize them to upper case.
        reserved = self.reserved.get(t.value.upper())
        if reserved is not None:
            t.type = reserved
            t.value = reserved
        else:
            t.type = "SIMPLE_IDENTIFIER"
        return t
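    # For example, "and", "And", and "AND" all become an AND token with the
    # value "AND", while "visit" remains a SIMPLE_IDENTIFIER.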

    # A colon followed by an identifier names a bind parameter (ASCII only).
    def t_BIND_NAME(self, t: LexToken) -> LexToken:
        """[:][a-zA-Z_][a-zA-Z0-9_]*"""
        # Drop the colon to get the name.
        t.value = t.value[1:]
        t.type = "BIND_NAME"
        return t
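    # For example, ":myparam" (an illustrative name) yields a BIND_NAME token
    # with the value "myparam".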

    def t_error(self, t: LexToken) -> None:
        """Error handling rule."""
        lexer = t.lexer
        raise ParserLexError(lexer.lexdata, t.value, lexer.lexpos, lexer.lineno)
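# End-to-end error-handling sketch (the input is an assumed example, not from
# this module); an unsupported character raises ParserLexError carrying the
# remaining unparsed text:
#
#     lexer = ParserLex.make_lexer()
#     lexer.input("visit = @")
#     try:
#         tokens = list(lexer)
#     except ParserLexError as exc:
#         assert exc.remain.startswith("@")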