Coverage for python/lsst/daf/butler/queries/expressions/parser/parserLex.py: 53% of 81 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Module which defines PLY lexer for user expressions parsed by pre-flight."""

__all__ = ["LexToken", "ParserLex", "ParserLexError"]

import re
from typing import Any, Protocol

from .ply import lex

_RE_RANGE = r"(?P<start>-?\d+)\s*\.\.\s*(?P<stop>-?\d+)(\s*:\s*(?P<stride>[1-9]\d*))?"
"""Regular expression to match a range literal in the form NUM..NUM[:NUM];
it is attached to `t_RANGE_LITERAL` below via the ``lex.TOKEN`` decorator.
"""


class LexToken(Protocol):
    """Protocol for LexToken defined in ``ply.lex``."""

    value: Any
    type: str
    lexer: Any
    lexdata: str
    lexpos: int
    lineno: int


class ParserLexError(Exception):
    """Exception raised for lex-phase errors.

    Parameters
    ----------
    expression : `str`
        Full initial expression being parsed.
    remain : `str`
        Remaining non-parsed part of the expression.
    pos : `int`
        Current parsing position, offset from beginning of expression in
        characters.
    lineno : `int`
        Current line number in the expression.
    """

    def __init__(self, expression: str, remain: str, pos: int, lineno: int):
        Exception.__init__(self, f"Unexpected character at position {pos}")
        self.expression = expression
        self.remain = remain
        self.pos = pos
        self.lineno = lineno


class ParserLex:
    """Class which defines the PLY lexer."""

    @classmethod
    def make_lexer(cls, reflags: int = 0, **kwargs: Any) -> Any:
        """Return a lexer.

        Parameters
        ----------
        reflags : `int`, optional
            Regular expression flags.
        **kwargs
            Additional parameters for the lexer.

        Returns
        -------
        `ply.lex.Lexer`
            Lexer instance.
        """
        # Make sure the flags we need are always set.
        kw = dict(reflags=reflags | re.IGNORECASE | re.VERBOSE)
        kw.update(kwargs)

        return lex.lex(object=cls(), **kw)
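
    # A minimal usage sketch; the expression below is made up, but input()
    # and token() are the standard PLY lexer API:
    #
    #   lexer = ParserLex.make_lexer()
    #   lexer.input("visit = 42 AND detector IN (1..4)")
    #   while (tok := lexer.token()) is not None:
    #       print(tok.type, tok.value)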

    # literals = ""

    # Reserved words in our grammar.
    # SQL has reserved words which we could potentially make reserved in our
    # grammar too; for now we pretend we do not care about SQL.
    reserved = dict(
        # IS="IS",
        IN="IN",
        # NULL="NULL",
        OR="OR",
        AND="AND",
        NOT="NOT",
        OVERLAPS="OVERLAPS",
        # BETWEEN="BETWEEN",
        # LIKE="LIKE",
        # ESCAPE="ESCAPE",
        # REGEXP="REGEXP"
    )

    # List of token names.
    tokens = (
        "NUMERIC_LITERAL",
        "TIME_LITERAL",
        "STRING_LITERAL",
        "RANGE_LITERAL",
        # 'DURATION_LITERAL',
        "QUALIFIED_IDENTIFIER",
        "SIMPLE_IDENTIFIER",
        "BIND_NAME",
        "LPAREN",
        "RPAREN",
        "EQ",
        "NE",
        "LT",
        "LE",
        "GT",
        "GE",
        "ADD",
        "SUB",
        "MUL",
        "DIV",
        "MOD",
        "COMMA",
    ) + tuple(reserved.values())

    # Regular expression rules for simple tokens
    t_LPAREN = r"\("
    t_RPAREN = r"\)"
    t_EQ = "="
    t_NE = "!="
    t_LT = "<"
    t_LE = "<="
    t_GT = ">"
    t_GE = ">="
    t_ADD = r"\+"
    t_SUB = "-"
    t_MUL = r"\*"
    t_DIV = "/"
    t_MOD = "%"
    t_COMMA = ","

    # A string containing ignored characters (spaces and tabs)
    t_ignore = " \t"
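
    # Note that PLY sorts these string rules by decreasing regex length, so
    # "<=" lexes as a single LE token rather than as LT followed by EQ.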

    # Define a rule so we can track line numbers
    def t_newline(self, t: LexToken) -> None:
        r"""\n+"""
        t.lexer.lineno += len(t.value)

    # quoted string prefixed with 'T'
    def t_TIME_LITERAL(self, t: LexToken) -> LexToken:
        """T'.*?'"""
        # Strip the leading "T" and the quotes.
        t.value = t.value[2:-1]
        return t

    # quoted string
    def t_STRING_LITERAL(self, t: LexToken) -> LexToken:
        """'.*?'"""
        # Strip the quotes.
        t.value = t.value[1:-1]
        return t

    # range literal in format N..M[:S], spaces allowed, see _RE_RANGE
    @lex.TOKEN(_RE_RANGE)
    def t_RANGE_LITERAL(self, t: LexToken) -> LexToken:
        match = re.match(_RE_RANGE, t.value)
        assert match is not None, "Guaranteed by tokenization"
        start = int(match.group("start"))
        stop = int(match.group("stop"))
        stride = match.group("stride")
        if stride is not None:
            stride = int(stride)
        t.value = (start, stop, stride)
        return t
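
    # Illustrative values produced by this rule (not executed here):
    #   "1..10"   -> t.value == (1, 10, None)
    #   "1..10:2" -> t.value == (1, 10, 2)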

    # numbers are used as strings by parser, do not convert
    def t_NUMERIC_LITERAL(self, t: LexToken) -> LexToken:
        r"""\d+(\.\d*)?(e[-+]?\d+)? # 1, 1., 1.1, 1e10, 1.1e-10, etc.
        |
        \.\d+(e[-+]?\d+)? # .1, .1e10, .1e+10
        """
        return t
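
    # E.g. "1.5e-3" lexes as NUMERIC_LITERAL with t.value left as the string
    # "1.5e-3"; any numeric conversion happens later in the parser.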

    # qualified identifiers have one or two dots
    def t_QUALIFIED_IDENTIFIER(self, t: LexToken) -> LexToken:
        r"""[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*){1,2}"""
        t.type = "QUALIFIED_IDENTIFIER"
        return t

    # we only support ASCII in identifier names
    def t_SIMPLE_IDENTIFIER(self, t: LexToken) -> LexToken:
        """[a-zA-Z_][a-zA-Z0-9_]*"""
        # Check for reserved words and make sure they are upper case
        reserved = self.reserved.get(t.value.upper())
        if reserved is not None:
            t.type = reserved
            t.value = reserved
        else:
            t.type = "SIMPLE_IDENTIFIER"
        return t
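
    # E.g. "and", "And" and "AND" all lex as token type AND with value "AND";
    # anything not in `reserved` stays a SIMPLE_IDENTIFIER.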

    # we only support ASCII in identifier names
    def t_BIND_NAME(self, t: LexToken) -> LexToken:
        """[:][a-zA-Z_][a-zA-Z0-9_]*"""
        # Drop colon to get the name.
        t.value = t.value[1:]
        t.type = "BIND_NAME"
        return t
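
    # E.g. ":my_param" lexes as BIND_NAME with t.value == "my_param".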

    def t_error(self, t: LexToken) -> None:
        """Error handling rule"""
        lexer = t.lexer
        raise ParserLexError(lexer.lexdata, t.value, lexer.lexpos, lexer.lineno)
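

# A hedged end-to-end sketch of the error path; the expression is made up
# and "@" is simply a character no rule matches:
#
#   lexer = ParserLex.make_lexer()
#   lexer.input("visit = @")
#   try:
#       while lexer.token() is not None:
#           pass
#   except ParserLexError as exc:
#       print(exc.pos, exc.lineno, exc.remain)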