Coverage for python/lsst/daf/butler/registry/queries/expressions/parser/ply/lex.py: 7%
692 statements
coverage.py v6.5.0, created at 2022-10-07 09:47 +0000
# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2018
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.11"
__tabversion__ = "3.10"

import copy
import inspect
import os
import re
import sys
import types

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# This regular expression is used to match valid token names
_is_identifier = re.compile(r"^[a-zA-Z0-9_]+$")


# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s


# Token class. This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)


# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self, msg, *args, **kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self, msg, *args, **kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical


# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self


# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()  -  Store a new string in the lexer
#    token()  -  Get the next token
#    clone()  -  Clone the lexer
#
#    lineno   -  Current line number
#    lexpos   -  Current position in the input string
# -----------------------------------------------------------------------------
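#
# A minimal usage sketch (hedged): a Lexer is normally obtained from the lex()
# call defined further below rather than constructed directly, and "calclex"
# here is a hypothetical module containing t_ rule definitions:
#
#     lexer = lex(module=calclex)
#     lexer.input("3 + 4 * 10")
#     while True:
#         tok = lexer.token()
#         if tok is None:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)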


class Lexer:
    def __init__(self):
        self.lexre = None  # Master regular expression. This is a list of
        # tuples (re, findex) where re is a compiled
        # regular expression and findex is a list
        # mapping regex group numbers to rules
        self.lexretext = None  # Current regular expression strings
        self.lexstatere = {}  # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}  # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}  # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"  # Current lexer state
        self.lexstatestack = []  # Stack of lexer states
        self.lexstateinfo = None  # State information
        self.lexstateignore = {}  # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}  # Dictionary of error functions for each state
        self.lexstateeoff = {}  # Dictionary of eof functions for each state
        self.lexreflags = 0  # Optional re compile flags
        self.lexdata = None  # Actual input data (as a string)
        self.lexpos = 0  # Current position in input text
        self.lexlen = 0  # Length of the input text
        self.lexerrorf = None  # Error rule (if any)
        self.lexeoff = None  # EOF rule (if any)
        self.lextokens = None  # List of valid tokens
        self.lexignore = ""  # Ignored characters
        self.lexliterals = ""  # Literal characters that can be passed through
        self.lexmodule = None  # Module
        self.lineno = 1  # Current line number
        self.lexoptimize = False  # Optimized mode

    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=""):
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split(".")[-1]
        filename = os.path.join(outputdir, basetabmodule) + ".py"
        with open(filename, "w") as tf:
            tf.write(
                "# %s.py. This file automatically created by PLY (version %s). Don't edit!\n"
                % (basetabmodule, __version__)
            )
            tf.write("_tabversion = %s\n" % repr(__tabversion__))
            tf.write("_lextokens = set(%s)\n" % repr(tuple(sorted(self.lextokens))))
            tf.write("_lexreflags = %s\n" % repr(int(self.lexreflags)))
            tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
            tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(
                    lre, self.lexstateretext[statename], self.lexstaterenames[statename]
                ):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write("_lexstatere = %s\n" % repr(tabre))
            tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write("_lexstateerrorf = %s\n" % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write("_lexstateeoff = %s\n" % repr(tabeof))

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            exec("import %s" % tabfile)
            lextab = sys.modules[tabfile]

        if getattr(lextab, "_tabversion", "0.0") != __tabversion__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lextokens_all = self.lextokens | set(self.lexliterals)
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict)))

            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef]

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef]

        self.begin("INITIAL")

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n

    # ------------------------------------------------------------
    # opttoken() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self  # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos  # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError(
                            "%s:%d: Rule '%s' returned an unknown token type '%s'"
                            % (
                                func.__code__.co_filename,
                                func.__code__.co_firstlineno,
                                func.__name__,
                                newtok.type,
                            ),
                            lexdata[lexpos:],
                        )

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError(
                            "Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]
                        )
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError(
                    "Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:]
                )

        if self.lexeoff:
            tok = LexToken()
            tok.type = "eof"
            tok.value = ""
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next


# -----------------------------------------------------------------------------
# ==== Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# _get_regex(func)
#
# Returns the regular expression assigned to a function either as a doc string
# or as a .regex attribute attached by the @TOKEN decorator.
# -----------------------------------------------------------------------------
def _get_regex(func):
    return getattr(func, "regex", func.__doc__)


# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    f = sys._getframe(levels)
    ldict = f.f_globals.copy()
    if f.f_globals != f.f_locals:
        ldict.update(f.f_locals)
    return ldict


# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result


# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result


# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex, reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        m = int(len(relist) / 2)
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return (llist + rlist), (lre + rre), (lnames + rnames)


# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    parts = s.split("_")
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != "ANY":
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ("INITIAL",)

    if "ANY" in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states, tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {"INITIAL": "inclusive"}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens", None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = True
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals", "")
        if not self.literals:
            self.literals = ""

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = True

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = True

    def get_states(self):
        self.states = self.ldict.get("states", None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error(
                            "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",
                            repr(s),
                        )
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = True
                        continue
                    if not (statetype == "inclusive" or statetype == "exclusive"):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == "t_"]

        # Now build up a list of functions and a list of strings
        self.toknames = {}  # Mapping of symbols to token names
        self.funcsym = {}  # Symbols defined as functions
        self.strsym = {}  # Symbols defined as strings
        self.ignore = {}  # Ignore strings by state
        self.errorf = {}  # Error functions by state
        self.eoff = {}  # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, "__call__"):
                if tokname == "error":
                    for s in states:
                        self.errorf[s] = t
                elif tokname == "eof":
                    for s in states:
                        self.eoff[s] = t
                elif tokname == "ignore":
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == "ignore":
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == "error":
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True
                    continue

                if not _get_regex(f):
                    self.log.error(
                        "%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__
                    )
                    self.error = True
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname, _get_regex(f)), self.reflags)
                    if c.match(""):
                        self.log.error(
                            "%s:%d: Regular expression for rule '%s' matches empty string",
                            file,
                            line,
                            f.__name__,
                        )
                        self.error = True
                except re.error as e:
                    self.log.error(
                        "%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e
                    )
                    if "#" in _get_regex(f):
                        self.log.error(
                            "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__
                        )
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == "error":
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), self.reflags)
                    if c.match(""):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if "#" in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file. This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------

    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except IOError:
            return

        fre = re.compile(r"\s*def\s+(t_[a-zA-Z_0-9]*)\(")
        sre = re.compile(r"\s*(t_[a-zA-Z_0-9]*)\s*=")

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error(
                        "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev
                    )
                    self.error = True
            linen += 1


# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
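#
# A minimal sketch of the rule definitions that lex() reflects over; the names
# below (tokens, t_PLUS, t_NUMBER, t_ignore, t_error) follow the t_ naming
# convention handled by get_rules()/_statetoken(), and the defining module
# itself is hypothetical:
#
#     tokens = ("NUMBER", "PLUS")
#
#     t_PLUS = r"\+"
#
#     def t_NUMBER(t):
#         r"\d+"
#         t.value = int(t.value)
#         return t
#
#     t_ignore = " \t"
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex()  # with no module given, lex() reflects on the caller's dictionary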
def lex(
    module=None,
    object=None,
    debug=False,
    optimize=False,
    lextab="lextab",
    reflags=int(re.VERBOSE),
    nowarn=False,
    outputdir=None,
    debuglog=None,
    errorlog=None,
):

    if lextab is None:
        lextab = "lextab"

    global lexer

    ldict = None
    stateinfo = {"INITIAL": "inclusive"}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    # Get the module dictionary used for the parser
    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if "__file__" not in ldict:
            ldict["__file__"] = sys.modules[ldict["__module__"]].__file__
    else:
        ldict = get_caller_module_dict(2)

    # Determine if the module is package of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get("__package__")
    if pkg and isinstance(lextab, str):
        if "." not in lextab:
            lextab = pkg + "." + lextab

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = set()
    for n in linfo.tokens:
        lexobj.lextokens.add(n)

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            regex_list.append("(?P<%s>%s)" % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != "INITIAL" and stype == "inclusive":
            lexobj.lexstatere[state].extend(lexobj.lexstatere["INITIAL"])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext["INITIAL"])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames["INITIAL"])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL", None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get("INITIAL", None)

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == "exclusive":
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == "inclusive":
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL", None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL", "")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if "." not in lextab:
                    srcfile = ldict["__file__"]
                else:
                    parts = lextab.split(".")
                    pkgname = ".".join(parts[:-1])
                    exec("import %s" % pkgname)
                    srcfile = getattr(sys.modules[pkgname], "__file__", "")
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
            if lextab in sys.modules:
                del sys.modules[lextab]
        except IOError as e:
            errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e))

    return lexobj


# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
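#
# A short, hedged sketch of calling runmain() directly; "calclex" and the data
# string are purely illustrative:
#
#     lexer = lex(module=calclex)
#     runmain(lexer, data="3 + 4")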
def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos))


# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
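#
# A brief sketch of @TOKEN usage; "identifier" and t_ID are illustrative names
# for a rule whose regex is assembled at runtime instead of in the docstring:
#
#     identifier = r"[a-zA-Z_][a-zA-Z0-9_]*"
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t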
def TOKEN(r):
    def set_regex(f):
        if hasattr(r, "__call__"):
            f.regex = _get_regex(r)
        else:
            f.regex = r
        return f

    return set_regex


# Alternative spelling of the TOKEN decorator
Token = TOKEN