# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# type: ignore

"""Module defining the PLY lexer for user expressions parsed by pre-flight."""

__all__ = ["ParserLex", "ParserLexError"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import re

# -----------------------------
# Imports for other modules --
# -----------------------------
from .ply import lex

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_RE_RANGE = r"(?P<start>-?\d+)\s*\.\.\s*(?P<stop>-?\d+)(\s*:\s*(?P<stride>[1-9]\d*))?"
"""Regular expression matching a range literal of the form NUM..NUM[:NUM];
it must stay in sync with the ``t_RANGE_LITERAL`` docstring.
"""

# ------------------------
# Exported definitions --
# ------------------------


class ParserLexError(Exception):
    """Exception raised for lex-phase errors.

    Parameters
    ----------
    expression : `str`
        Full initial expression being parsed.
    remain : `str`
        Remaining non-parsed part of the expression.
    pos : `int`
        Current parsing position, offset from beginning of expression in
        characters.
    lineno : `int`
        Current line number in the expression.
    """

    def __init__(self, expression, remain, pos, lineno):
        Exception.__init__(self, f"Unexpected character at position {pos}")
        self.expression = expression
        self.remain = remain
        self.pos = pos
        self.lineno = lineno


class ParserLex:
    """Class which defines the PLY lexer."""

    @classmethod
    def make_lexer(cls, reflags=0, **kwargs):
        """Return a lexer.

        Parameters
        ----------
        reflags : `int`, optional
            Regular expression flags.
        **kwargs
            Additional parameters for the lexer.

        Returns
        -------
        `ply.lex.Lexer`
            Lexer instance.
        """
        # Make sure the flags we need are always set.
        kw = dict(reflags=reflags | re.IGNORECASE | re.VERBOSE)
        kw.update(kwargs)

        return lex.lex(object=cls(), **kw)
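
    # Usage sketch (illustrative only; the expression below is an invented
    # example, not taken from this module):
    #
    #     lexer = ParserLex.make_lexer()
    #     lexer.input("visit = 12345 AND detector IN (1..9:2)")
    #     for tok in lexer:
    #         print(tok.type, tok.value)
    #
    # PLY lexers are iterable and yield LexToken objects with ``type``,
    # ``value``, ``lineno``, and ``lexpos`` attributes.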

    # literals = ""

    # Reserved words in the grammar. SQL has reserved words that we could
    # potentially make reserved in our grammar too; for now, pretend we
    # don't care about SQL.
    reserved = dict(
        # IS="IS",
        IN="IN",
        # NULL="NULL",
        OR="OR",
        AND="AND",
        NOT="NOT",
        OVERLAPS="OVERLAPS",
        # BETWEEN="BETWEEN",
        # LIKE="LIKE",
        # ESCAPE="ESCAPE",
        # REGEXP="REGEXP"
    )

    # List of token names.
    tokens = (
        "NUMERIC_LITERAL",
        "TIME_LITERAL",
        "STRING_LITERAL",
        "RANGE_LITERAL",
        # 'DURATION_LITERAL',
        "QUALIFIED_IDENTIFIER",
        "SIMPLE_IDENTIFIER",
        "LPAREN",
        "RPAREN",
        "EQ",
        "NE",
        "LT",
        "LE",
        "GT",
        "GE",
        "ADD",
        "SUB",
        "MUL",
        "DIV",
        "MOD",
        "COMMA",
    ) + tuple(reserved.values())

    # Regular expression rules for simple tokens.
    t_LPAREN = r"\("
    t_RPAREN = r"\)"
    t_EQ = "="
    t_NE = "!="
    t_LT = "<"
    t_LE = "<="
    t_GT = ">"
    t_GE = ">="
    t_ADD = r"\+"
    t_SUB = "-"
    t_MUL = r"\*"
    t_DIV = "/"
    t_MOD = "%"
    t_COMMA = ","
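
    # Note: PLY sorts rules defined as plain strings by decreasing
    # regular-expression length, so two-character operators such as "<=",
    # ">=", and "!=" are tried before "<", ">", and "=".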

    # A string containing ignored characters (spaces and tabs).
    t_ignore = " \t"

    # Define a rule so we can track line numbers.
    def t_newline(self, t):
        r"""\n+"""
        t.lexer.lineno += len(t.value)

    # Quoted string prefixed with 'T'.
    def t_TIME_LITERAL(self, t):
        """T'.*?'"""
        # Strip quotes.
        t.value = t.value[2:-1]
        return t

    # Quoted string.
    def t_STRING_LITERAL(self, t):
        """'.*?'"""
        # Strip quotes.
        t.value = t.value[1:-1]
        return t
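
    # For example (invented inputs): T'2020-01-01 12:00:00' yields a
    # TIME_LITERAL with value "2020-01-01 12:00:00", and 'raw/all' yields a
    # STRING_LITERAL with value "raw/all". The non-greedy .*? stops at the
    # first closing quote, so embedded quotes are not supported.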

    # Range literal in format N..M[:S], spaces allowed, see _RE_RANGE.
    @lex.TOKEN(_RE_RANGE)
    def t_RANGE_LITERAL(self, t):
        match = re.match(_RE_RANGE, t.value)
        start = int(match.group("start"))
        stop = int(match.group("stop"))
        stride = match.group("stride")
        if stride is not None:
            stride = int(stride)
        t.value = (start, stop, stride)
        return t
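
    # For example (invented inputs): "1..10:2" produces the tuple
    # (1, 10, 2), while "1..10" produces (1, 10, None) because the :S
    # suffix is absent.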

    # Numbers are used as strings by the parser; do not convert.
    def t_NUMERIC_LITERAL(self, t):
        r"""\d+(\.\d*)?(e[-+]?\d+)?  # 1, 1., 1.1, 1e10, 1.1e-10, etc.
        |
        \.\d+(e[-+]?\d+)?  # .1, .1e10, .1e+10
        """
        return t

    # Qualified identifiers have one or two dots.
    def t_QUALIFIED_IDENTIFIER(self, t):
        r"""[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*){1,2}"""
        t.type = "QUALIFIED_IDENTIFIER"
        return t
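
    # For example (invented names): "visit.seq_num" and "a.b.c" match here,
    # while a bare "visit" contains no dot and falls through to
    # t_SIMPLE_IDENTIFIER, since PLY tries function rules in definition
    # order.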

    # We only support ASCII in identifier names.
    def t_SIMPLE_IDENTIFIER(self, t):
        """[a-zA-Z_][a-zA-Z0-9_]*"""
        # Check for reserved words and make sure they are upper case.
        reserved = self.reserved.get(t.value.upper())
        if reserved is not None:
            t.type = reserved
            t.value = reserved
        else:
            t.type = "SIMPLE_IDENTIFIER"
        return t
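
    # For example (invented inputs): "and" is normalized to the reserved
    # word AND (token type and value both "AND"), while "band" remains a
    # SIMPLE_IDENTIFIER with its original value.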

    def t_error(self, t):
        """Error handling rule."""
        lexer = t.lexer
        raise ParserLexError(lexer.lexdata, t.value, lexer.lexpos, lexer.lineno)
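
    # For example (invented input): lexing "visit @ 42" should raise
    # ParserLexError("Unexpected character at position 6"), with ``remain``
    # holding the unconsumed tail "@ 42" and ``pos`` pointing at the "@".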