Coverage for python/lsst/daf/butler/registry/queries/expressions/parser/ply/lex.py: 7%
692 statements
coverage.py v7.3.2, created at 2023-12-06 10:53 +0000
# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.11"
__tabversion__ = "3.10"

import copy
import inspect
import os
import re
import sys
import types

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# This regular expression is used to match valid token names
_is_identifier = re.compile(r"^[a-zA-Z0-9_]+$")


# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s


# Token class. This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)


# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self, msg, *args, **kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self, msg, *args, **kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical


# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self


# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()  - Store a new string in the lexer
#    token()  - Get the next token
#    clone()  - Clone the lexer
#
#    lineno   - Current line number
#    lexpos   - Current position in the input string
# -----------------------------------------------------------------------------
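# -----------------------------------------------------------------------------
# Illustrative sketch (not part of PLY itself): typical use of the public
# Lexer API listed above, once an instance has been built by lex() further
# below.  The variable name and input text are hypothetical.
#
#     mylexer = lex()                 # build a Lexer from the calling module
#     mylexer.input("some text")      # store a new string in the lexer
#     while True:
#         tok = mylexer.token()       # next LexToken, or None at end of input
#         if not tok:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)
# -----------------------------------------------------------------------------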
class Lexer:
    def __init__(self):
        self.lexre = None  # Master regular expression. This is a list of
        # tuples (re, findex) where re is a compiled
        # regular expression and findex is a list
        # mapping regex group numbers to rules
        self.lexretext = None  # Current regular expression strings
        self.lexstatere = {}  # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}  # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}  # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"  # Current lexer state
        self.lexstatestack = []  # Stack of lexer states
        self.lexstateinfo = None  # State information
        self.lexstateignore = {}  # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}  # Dictionary of error functions for each state
        self.lexstateeoff = {}  # Dictionary of eof functions for each state
        self.lexreflags = 0  # Optional re compile flags
        self.lexdata = None  # Actual input data (as a string)
        self.lexpos = 0  # Current position in input text
        self.lexlen = 0  # Length of the input text
        self.lexerrorf = None  # Error rule (if any)
        self.lexeoff = None  # EOF rule (if any)
        self.lextokens = None  # List of valid tokens
        self.lexignore = ""  # Ignored characters
        self.lexliterals = ""  # Literal characters that can be passed through
        self.lexmodule = None  # Module
        self.lineno = 1  # Current line number
        self.lexoptimize = False  # Optimized mode

    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=""):
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split(".")[-1]
        filename = os.path.join(outputdir, basetabmodule) + ".py"
        with open(filename, "w") as tf:
            tf.write(
                "# %s.py. This file automatically created by PLY (version %s). Don't edit!\n"
                % (basetabmodule, __version__)
            )
            tf.write("_tabversion = %s\n" % repr(__tabversion__))
            tf.write("_lextokens = set(%s)\n" % repr(tuple(sorted(self.lextokens))))
            tf.write("_lexreflags = %s\n" % repr(int(self.lexreflags)))
            tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
            tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(
                    lre, self.lexstateretext[statename], self.lexstaterenames[statename]
                ):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write("_lexstatere = %s\n" % repr(tabre))
            tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write("_lexstateerrorf = %s\n" % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write("_lexstateeoff = %s\n" % repr(tabeof))
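    # Illustrative sketch (not part of PLY itself): the generated table module
    # is plain Python of roughly this shape (the values below are made up), and
    # it is what readtab() loads back when lex() is called with optimize=1:
    #
    #     _tabversion = '3.10'
    #     _lextokens = set(('NUMBER', 'PLUS'))
    #     _lexreflags = 64
    #     _lexliterals = ''
    #     _lexstateinfo = {'INITIAL': 'inclusive'}
    #     _lexstatere = {'INITIAL': [('(?P<t_NUMBER>\\d+)|(?P<t_PLUS>\\+)',
    #                                 [None, ('t_NUMBER', 'NUMBER'), (None, 'PLUS')])]}
    #     _lexstateignore = {'INITIAL': ' \t'}
    #     _lexstateerrorf = {'INITIAL': 't_error'}
    #     _lexstateeoff = {}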
    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            exec("import %s" % tabfile)
            lextab = sys.modules[tabfile]

        if getattr(lextab, "_tabversion", "0.0") != __tabversion__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lextokens_all = self.lextokens | set(self.lexliterals)
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict)))

            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef]

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef]

        self.begin("INITIAL")

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate
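    # ------------------------------------------------------------
    # Illustrative sketch (not part of PLY itself): how the state methods
    # above are typically driven from a lexer specification.  States are
    # declared in the defining module and rules switch between them with
    # push_state()/pop_state() or begin().  The state and rule names below
    # are hypothetical; the error/ignore rules for the state are omitted.
    #
    #     states = (
    #         ('comment', 'exclusive'),
    #     )
    #
    #     def t_COMMENTSTART(t):
    #         r'/\*'
    #         t.lexer.push_state('comment')   # enter the 'comment' state
    #
    #     def t_comment_END(t):
    #         r'\*/'
    #         t.lexer.pop_state()             # return to the previous state
    # ------------------------------------------------------------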
    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n

    # ------------------------------------------------------------
    # opttoken() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self  # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos  # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError(
                            "%s:%d: Rule '%s' returned an unknown token type '%s'"
                            % (
                                func.__code__.co_filename,
                                func.__code__.co_firstlineno,
                                func.__name__,
                                newtok.type,
                            ),
                            lexdata[lexpos:],
                        )

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError(
                            "Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]
                        )
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError(
                    "Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:]
                )

        if self.lexeoff:
            tok = LexToken()
            tok.type = "eof"
            tok.value = ""
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next


# -----------------------------------------------------------------------------
# ==== Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------


# -----------------------------------------------------------------------------
# _get_regex(func)
#
# Returns the regular expression assigned to a function either as a doc string
# or as a .regex attribute attached by the @TOKEN decorator.
# -----------------------------------------------------------------------------
def _get_regex(func):
    return getattr(func, "regex", func.__doc__)


# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    f = sys._getframe(levels)
    ldict = f.f_globals.copy()
    if f.f_globals != f.f_locals:
        ldict.update(f.f_locals)
    return ldict


# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result


# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result


# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex, reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        m = int(len(relist) / 2)
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return (llist + rlist), (lre + rre), (lnames + rnames)
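# -----------------------------------------------------------------------------
# Illustrative sketch (not part of PLY itself): the dispatch trick used above.
# All rule regexes are combined into one alternation of named groups, and
# m.lastindex identifies which rule matched, so a single re.match() call
# selects the rule.  The group names below are made up for the example.
#
#     master = re.compile("(?P<t_NUMBER>\\d+)|(?P<t_PLUS>\\+)")
#     m = master.match("42")
#     assert m.lastindex == master.groupindex["t_NUMBER"]
# -----------------------------------------------------------------------------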
# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    parts = s.split("_")
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != "ANY":
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ("INITIAL",)

    if "ANY" in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states, tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {"INITIAL": "inclusive"}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens", None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = True
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals", "")
        if not self.literals:
            self.literals = ""

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = True

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = True

    def get_states(self):
        self.states = self.ldict.get("states", None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error(
                            "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",
                            repr(s),
                        )
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = True
                        continue
                    if not (statetype == "inclusive" or statetype == "exclusive"):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == "t_"]

        # Now build up a list of functions and a list of strings
        self.toknames = {}  # Mapping of symbols to token names
        self.funcsym = {}  # Symbols defined as functions
        self.strsym = {}  # Symbols defined as strings
        self.ignore = {}  # Ignore strings by state
        self.errorf = {}  # Error functions by state
        self.eoff = {}  # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, "__call__"):
                if tokname == "error":
                    for s in states:
                        self.errorf[s] = t
                elif tokname == "eof":
                    for s in states:
                        self.eoff[s] = t
                elif tokname == "ignore":
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == "ignore":
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == "error":
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True
                    continue

                if not _get_regex(f):
                    self.log.error(
                        "%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__
                    )
                    self.error = True
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname, _get_regex(f)), self.reflags)
                    if c.match(""):
                        self.log.error(
                            "%s:%d: Regular expression for rule '%s' matches empty string",
                            file,
                            line,
                            f.__name__,
                        )
                        self.error = True
                except re.error as e:
                    self.log.error(
                        "%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e
                    )
                    if "#" in _get_regex(f):
                        self.log.error(
                            "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__
                        )
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == "error":
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), self.reflags)
                    if c.match(""):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if "#" in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file. This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------

    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except IOError:
            return

        fre = re.compile(r"\s*def\s+(t_[a-zA-Z_0-9]*)\(")
        sre = re.compile(r"\s*(t_[a-zA-Z_0-9]*)\s*=")

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error(
                        "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev
                    )
                    self.error = True
            linen += 1


# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(
    module=None,
    object=None,
    debug=False,
    optimize=False,
    lextab="lextab",
    reflags=int(re.VERBOSE),
    nowarn=False,
    outputdir=None,
    debuglog=None,
    errorlog=None,
):
    if lextab is None:
        lextab = "lextab"

    global lexer

    ldict = None
    stateinfo = {"INITIAL": "inclusive"}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    # Get the module dictionary used for the parser
    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if "__file__" not in ldict:
            ldict["__file__"] = sys.modules[ldict["__module__"]].__file__
    else:
        ldict = get_caller_module_dict(2)

    # Determine if the module is package of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get("__package__")
    if pkg and isinstance(lextab, str):
        if "." not in lextab:
            lextab = pkg + "." + lextab

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = set()
    for n in linfo.tokens:
        lexobj.lextokens.add(n)

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            regex_list.append("(?P<%s>%s)" % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != "INITIAL" and stype == "inclusive":
            lexobj.lexstatere[state].extend(lexobj.lexstatere["INITIAL"])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext["INITIAL"])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames["INITIAL"])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL", None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get("INITIAL", None)

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == "exclusive":
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == "inclusive":
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL", None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL", "")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if "." not in lextab:
                    srcfile = ldict["__file__"]
                else:
                    parts = lextab.split(".")
                    pkgname = ".".join(parts[:-1])
                    exec("import %s" % pkgname)
                    srcfile = getattr(sys.modules[pkgname], "__file__", "")
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
            if lextab in sys.modules:
                del sys.modules[lextab]
        except IOError as e:
            errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e))

    return lexobj
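# -----------------------------------------------------------------------------
# Illustrative sketch (not part of PLY itself): a minimal lexer specification
# consumed by lex().  The token names and rules are hypothetical; lex() scans
# the calling module (or the module=/object= argument) for 'tokens' and the
# 't_' symbols handled by LexerReflect above.
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS = r'\+'                      # simple rules may be strings
#     t_ignore = ' \t'                    # characters skipped between tokens
#
#     def t_NUMBER(t):                    # rules with actions are functions
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)
#
#     mylexer = lex()                     # or lex(optimize=1, lextab='mylextab')
#     mylexer.input("1 + 2")
#     for tok in mylexer:
#         print(tok)
# -----------------------------------------------------------------------------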
# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos))


# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
def TOKEN(r):
    def set_regex(f):
        if hasattr(r, "__call__"):
            f.regex = _get_regex(r)
        else:
            f.regex = r
        return f

    return set_regex
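# -----------------------------------------------------------------------------
# Illustrative sketch (not part of PLY itself): @TOKEN attaches the regex as a
# .regex attribute, which _get_regex() prefers over the docstring.  The token
# name and identifier pattern below are hypothetical.
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t
# -----------------------------------------------------------------------------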
# Alternative spelling of the TOKEN decorator
Token = TOKEN