Coverage for python/lsst/daf/butler/registry/queries/expressions/parser/ply/lex.py: 7%

692 statements  

coverage.py v6.5.0, created at 2022-10-29 02:20 -0700

1# ----------------------------------------------------------------------------- 

2# ply: lex.py 

3# 

4# Copyright (C) 2001-2018 

5# David M. Beazley (Dabeaz LLC) 

6# All rights reserved. 

7# 

8# Redistribution and use in source and binary forms, with or without 

9# modification, are permitted provided that the following conditions are 

10# met: 

11# 

12# * Redistributions of source code must retain the above copyright notice, 

13# this list of conditions and the following disclaimer. 

14# * Redistributions in binary form must reproduce the above copyright notice, 

15# this list of conditions and the following disclaimer in the documentation 

16# and/or other materials provided with the distribution. 

17# * Neither the name of the David Beazley or Dabeaz LLC may be used to 

18# endorse or promote products derived from this software without 

19# specific prior written permission. 

20# 

21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 

25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 

26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 

27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 

28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 

29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 

30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 

31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

32# ----------------------------------------------------------------------------- 

33 

34__version__ = "3.11" 

35__tabversion__ = "3.10" 

36 

37import copy 

38import inspect 

39import os 

40import re 

41import sys 

42import types 

43 

44# This tuple contains known string types 

45try: 

46 # Python 2.6 

47 StringTypes = (types.StringType, types.UnicodeType) 

48except AttributeError: 

49 # Python 3.0 

50 StringTypes = (str, bytes) 

51 

52# This regular expression is used to match valid token names 

53_is_identifier = re.compile(r"^[a-zA-Z0-9_]+$") 

54 

55# Exception thrown when invalid token encountered and no default error 

56# handler is defined. 

57class LexError(Exception): 

58 def __init__(self, message, s): 

59 self.args = (message,) 

60 self.text = s 

61 

62 

63# Token class. This class is used to represent the tokens produced. 

64class LexToken(object): 

65 def __str__(self): 

66 return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos) 

67 

68 def __repr__(self): 

69 return str(self) 

70 

71 

72# This object is a stand-in for a logging object created by the 

73# logging module. 

74 

75 

76class PlyLogger(object): 

77 def __init__(self, f): 

78 self.f = f 

79 

80 def critical(self, msg, *args, **kwargs): 

81 self.f.write((msg % args) + "\n") 

82 

83 def warning(self, msg, *args, **kwargs): 

84 self.f.write("WARNING: " + (msg % args) + "\n") 

85 

86 def error(self, msg, *args, **kwargs): 

87 self.f.write("ERROR: " + (msg % args) + "\n") 

88 

89 info = critical 

90 debug = critical 

91 

92 

93# Null logger is used when no output is generated. Does nothing. 

94class NullLogger(object): 

95 def __getattribute__(self, name): 

96 return self 

97 

98 def __call__(self, *args, **kwargs): 

99 return self 

100 

101 

102# ----------------------------------------------------------------------------- 

103# === Lexing Engine === 

104# 

105# The following Lexer class implements the lexer runtime. There are only 

106# a few public methods and attributes: 

107# 

108# input() - Store a new string in the lexer 

109# token() - Get the next token 

110# clone() - Clone the lexer 

111# 

112# lineno - Current line number 

113# lexpos - Current position in the input string 

114# ----------------------------------------------------------------------------- 

115 

116 
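# A minimal usage sketch of the runtime API listed above (illustrative only,
# not part of PLY itself): once lex() has built a Lexer, it is typically
# driven like this (names are arbitrary):
#
#     lexer.input("3 + 4")
#     while True:
#         tok = lexer.token()
#         if tok is None:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)
#
# The same object also supports iteration ("for tok in lexer") through the
# __iter__/__next__ methods defined further below.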

117class Lexer: 

118 def __init__(self): 

119 self.lexre = None # Master regular expression. This is a list of 

120 # tuples (re, findex) where re is a compiled 

121 # regular expression and findex is a list 

122 # mapping regex group numbers to rules 

123 self.lexretext = None # Current regular expression strings 

124 self.lexstatere = {} # Dictionary mapping lexer states to master regexs 

125 self.lexstateretext = {} # Dictionary mapping lexer states to regex strings 

126 self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names 

127 self.lexstate = "INITIAL" # Current lexer state 

128 self.lexstatestack = [] # Stack of lexer states 

129 self.lexstateinfo = None # State information 

130 self.lexstateignore = {} # Dictionary of ignored characters for each state 

131 self.lexstateerrorf = {} # Dictionary of error functions for each state 

132 self.lexstateeoff = {} # Dictionary of eof functions for each state 

133 self.lexreflags = 0 # Optional re compile flags 

134 self.lexdata = None # Actual input data (as a string) 

135 self.lexpos = 0 # Current position in input text 

136 self.lexlen = 0 # Length of the input text 

137 self.lexerrorf = None # Error rule (if any) 

138 self.lexeoff = None # EOF rule (if any) 

139 self.lextokens = None # List of valid tokens 

140 self.lexignore = "" # Ignored characters 

141 self.lexliterals = "" # Literal characters that can be passed through 

142 self.lexmodule = None # Module 

143 self.lineno = 1 # Current line number 

144 self.lexoptimize = False # Optimized mode 

145 

146 def clone(self, object=None): 

147 c = copy.copy(self) 

148 

149 # If the object parameter has been supplied, it means we are attaching the 

150 # lexer to a new object. In this case, we have to rebind all methods in 

151 # the lexstatere and lexstateerrorf tables. 

152 

153 if object: 

154 newtab = {} 

155 for key, ritem in self.lexstatere.items(): 

156 newre = [] 

157 for cre, findex in ritem: 

158 newfindex = [] 

159 for f in findex: 

160 if not f or not f[0]: 

161 newfindex.append(f) 

162 continue 

163 newfindex.append((getattr(object, f[0].__name__), f[1])) 

164 newre.append((cre, newfindex)) 

165 newtab[key] = newre 

166 c.lexstatere = newtab 

167 c.lexstateerrorf = {} 

168 for key, ef in self.lexstateerrorf.items(): 

169 c.lexstateerrorf[key] = getattr(object, ef.__name__) 

170 c.lexmodule = object 

171 return c 

172 

173 # ------------------------------------------------------------ 

174 # writetab() - Write lexer information to a table file 

175 # ------------------------------------------------------------ 

176 def writetab(self, lextab, outputdir=""): 

177 if isinstance(lextab, types.ModuleType): 

178 raise IOError("Won't overwrite existing lextab module") 

179 basetabmodule = lextab.split(".")[-1] 

180 filename = os.path.join(outputdir, basetabmodule) + ".py" 

181 with open(filename, "w") as tf: 

182 tf.write( 

183 "# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" 

184 % (basetabmodule, __version__) 

185 ) 

186 tf.write("_tabversion = %s\n" % repr(__tabversion__)) 

187 tf.write("_lextokens = set(%s)\n" % repr(tuple(sorted(self.lextokens)))) 

188 tf.write("_lexreflags = %s\n" % repr(int(self.lexreflags))) 

189 tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) 

190 tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) 

191 

192 # Rewrite the lexstatere table, replacing function objects with function names 

193 tabre = {} 

194 for statename, lre in self.lexstatere.items(): 

195 titem = [] 

196 for (pat, func), retext, renames in zip( 

197 lre, self.lexstateretext[statename], self.lexstaterenames[statename] 

198 ): 

199 titem.append((retext, _funcs_to_names(func, renames))) 

200 tabre[statename] = titem 

201 

202 tf.write("_lexstatere = %s\n" % repr(tabre)) 

203 tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) 

204 

205 taberr = {} 

206 for statename, ef in self.lexstateerrorf.items(): 

207 taberr[statename] = ef.__name__ if ef else None 

208 tf.write("_lexstateerrorf = %s\n" % repr(taberr)) 

209 

210 tabeof = {} 

211 for statename, ef in self.lexstateeoff.items(): 

212 tabeof[statename] = ef.__name__ if ef else None 

213 tf.write("_lexstateeoff = %s\n" % repr(tabeof)) 

214 
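# For orientation, the table module written out above ends up looking roughly
# like the following (an illustrative sketch only; the exact tokens, regexes
# and flags depend on the lexer being built):
#
#     # lextab.py. This file automatically created by PLY (version 3.11). Don't edit!
#     _tabversion = '3.10'
#     _lextokens = set(('NUMBER', 'PLUS'))
#     _lexreflags = 64
#     _lexliterals = ''
#     _lexstateinfo = {'INITIAL': 'inclusive'}
#     _lexstatere = {'INITIAL': [('(?P<t_NUMBER>\\d+)|(?P<t_PLUS>\\+)',
#                                 [None, ('t_NUMBER', 'NUMBER'), (None, 'PLUS')])]}
#     _lexstateignore = {'INITIAL': ' \t'}
#     _lexstateerrorf = {'INITIAL': 't_error'}
#     _lexstateeoff = {}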

215 # ------------------------------------------------------------ 

216 # readtab() - Read lexer information from a tab file 

217 # ------------------------------------------------------------ 

218 def readtab(self, tabfile, fdict): 

219 if isinstance(tabfile, types.ModuleType): 

220 lextab = tabfile 

221 else: 

222 exec("import %s" % tabfile) 

223 lextab = sys.modules[tabfile] 

224 

225 if getattr(lextab, "_tabversion", "0.0") != __tabversion__: 

226 raise ImportError("Inconsistent PLY version") 

227 

228 self.lextokens = lextab._lextokens 

229 self.lexreflags = lextab._lexreflags 

230 self.lexliterals = lextab._lexliterals 

231 self.lextokens_all = self.lextokens | set(self.lexliterals) 

232 self.lexstateinfo = lextab._lexstateinfo 

233 self.lexstateignore = lextab._lexstateignore 

234 self.lexstatere = {} 

235 self.lexstateretext = {} 

236 for statename, lre in lextab._lexstatere.items(): 

237 titem = [] 

238 txtitem = [] 

239 for pat, func_name in lre: 

240 titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict))) 

241 

242 self.lexstatere[statename] = titem 

243 self.lexstateretext[statename] = txtitem 

244 

245 self.lexstateerrorf = {} 

246 for statename, ef in lextab._lexstateerrorf.items(): 

247 self.lexstateerrorf[statename] = fdict[ef] 

248 

249 self.lexstateeoff = {} 

250 for statename, ef in lextab._lexstateeoff.items(): 

251 self.lexstateeoff[statename] = fdict[ef] 

252 

253 self.begin("INITIAL") 

254 

255 # ------------------------------------------------------------ 

256 # input() - Push a new string into the lexer 

257 # ------------------------------------------------------------ 

258 def input(self, s): 

259 # Pull off the first character to see if s looks like a string 

260 c = s[:1] 

261 if not isinstance(c, StringTypes): 

262 raise ValueError("Expected a string") 

263 self.lexdata = s 

264 self.lexpos = 0 

265 self.lexlen = len(s) 

266 

267 # ------------------------------------------------------------ 

268 # begin() - Changes the lexing state 

269 # ------------------------------------------------------------ 

270 def begin(self, state): 

271 if state not in self.lexstatere: 

272 raise ValueError("Undefined state") 

273 self.lexre = self.lexstatere[state] 

274 self.lexretext = self.lexstateretext[state] 

275 self.lexignore = self.lexstateignore.get(state, "") 

276 self.lexerrorf = self.lexstateerrorf.get(state, None) 

277 self.lexeoff = self.lexstateeoff.get(state, None) 

278 self.lexstate = state 

279 

280 # ------------------------------------------------------------ 

281 # push_state() - Changes the lexing state and saves old on stack 

282 # ------------------------------------------------------------ 

283 def push_state(self, state): 

284 self.lexstatestack.append(self.lexstate) 

285 self.begin(state) 

286 

287 # ------------------------------------------------------------ 

288 # pop_state() - Restores the previous state 

289 # ------------------------------------------------------------ 

290 def pop_state(self): 

291 self.begin(self.lexstatestack.pop()) 

292 

293 # ------------------------------------------------------------ 

294 # current_state() - Returns the current lexing state 

295 # ------------------------------------------------------------ 

296 def current_state(self): 

297 return self.lexstate 

298 

299 # ------------------------------------------------------------ 

300 # skip() - Skip ahead n characters 

301 # ------------------------------------------------------------ 

302 def skip(self, n): 

303 self.lexpos += n 

304 

305 # ------------------------------------------------------------ 

306 # opttoken() - Return the next token from the Lexer 

307 # 

308 # Note: This function has been carefully implemented to be as fast 

309 # as possible. Don't make changes unless you really know what 

310 # you are doing 

311 # ------------------------------------------------------------ 

312 def token(self): 

313 # Make local copies of frequently referenced attributes 

314 lexpos = self.lexpos 

315 lexlen = self.lexlen 

316 lexignore = self.lexignore 

317 lexdata = self.lexdata 

318 

319 while lexpos < lexlen: 

320 # Short-circuit handling of whitespace, tabs, and other ignored characters 

321 if lexdata[lexpos] in lexignore: 

322 lexpos += 1 

323 continue 

324 

325 # Look for a regular expression match 

326 for lexre, lexindexfunc in self.lexre: 

327 m = lexre.match(lexdata, lexpos) 

328 if not m: 

329 continue 

330 

331 # Create a token for return 

332 tok = LexToken() 

333 tok.value = m.group() 

334 tok.lineno = self.lineno 

335 tok.lexpos = lexpos 

336 

337 i = m.lastindex 

338 func, tok.type = lexindexfunc[i] 

339 

340 if not func: 

341 # If no token type was set, it's an ignored token 

342 if tok.type: 

343 self.lexpos = m.end() 

344 return tok 

345 else: 

346 lexpos = m.end() 

347 break 

348 

349 lexpos = m.end() 

350 

351 # If token is processed by a function, call it 

352 

353 tok.lexer = self # Set additional attributes useful in token rules 

354 self.lexmatch = m 

355 self.lexpos = lexpos 

356 

357 newtok = func(tok) 

358 

359 # Every function must return a token. If nothing is returned, we just move on to the next token 

360 if not newtok: 

361 lexpos = self.lexpos # This is here in case user has updated lexpos. 

362 lexignore = self.lexignore # This is here in case there was a state change 

363 break 

364 

365 # Verify type of the token. If not in the token map, raise an error 

366 if not self.lexoptimize: 

367 if newtok.type not in self.lextokens_all: 

368 raise LexError( 

369 "%s:%d: Rule '%s' returned an unknown token type '%s'" 

370 % ( 

371 func.__code__.co_filename, 

372 func.__code__.co_firstlineno, 

373 func.__name__, 

374 newtok.type, 

375 ), 

376 lexdata[lexpos:], 

377 ) 

378 

379 return newtok 

380 else: 

381 # No match, see if in literals 

382 if lexdata[lexpos] in self.lexliterals: 

383 tok = LexToken() 

384 tok.value = lexdata[lexpos] 

385 tok.lineno = self.lineno 

386 tok.type = tok.value 

387 tok.lexpos = lexpos 

388 self.lexpos = lexpos + 1 

389 return tok 

390 

391 # No match. Call t_error() if defined. 

392 if self.lexerrorf: 

393 tok = LexToken() 

394 tok.value = self.lexdata[lexpos:] 

395 tok.lineno = self.lineno 

396 tok.type = "error" 

397 tok.lexer = self 

398 tok.lexpos = lexpos 

399 self.lexpos = lexpos 

400 newtok = self.lexerrorf(tok) 

401 if lexpos == self.lexpos: 

402 # Error method didn't change text position at all. This is an error. 

403 raise LexError( 

404 "Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:] 

405 ) 

406 lexpos = self.lexpos 

407 if not newtok: 

408 continue 

409 return newtok 

410 

411 self.lexpos = lexpos 

412 raise LexError( 

413 "Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:] 

414 ) 

415 

416 if self.lexeoff: 

417 tok = LexToken() 

418 tok.type = "eof" 

419 tok.value = "" 

420 tok.lineno = self.lineno 

421 tok.lexpos = lexpos 

422 tok.lexer = self 

423 self.lexpos = lexpos 

424 newtok = self.lexeoff(tok) 

425 return newtok 

426 

427 self.lexpos = lexpos + 1 

428 if self.lexdata is None: 

429 raise RuntimeError("No input string given with input()") 

430 return None 

431 

432 # Iterator interface 

433 def __iter__(self): 

434 return self 

435 

436 def next(self): 

437 t = self.token() 

438 if t is None: 

439 raise StopIteration 

440 return t 

441 

442 __next__ = next 

443 

444 

445# ----------------------------------------------------------------------------- 

446# ==== Lex Builder === 

447# 

448# The functions and classes below are used to collect lexing information 

449# and build a Lexer object from it. 

450# ----------------------------------------------------------------------------- 

451 

452# ----------------------------------------------------------------------------- 

453# _get_regex(func) 

454# 

455# Returns the regular expression assigned to a function either as a doc string 

456# or as a .regex attribute attached by the @TOKEN decorator. 

457# ----------------------------------------------------------------------------- 

458def _get_regex(func): 

459 return getattr(func, "regex", func.__doc__) 

460 

461 

462# ----------------------------------------------------------------------------- 

463# get_caller_module_dict() 

464# 

465# This function returns a dictionary containing all of the symbols defined within 

466# a caller further down the call stack. This is used to get the environment 

467# associated with the lex() call if none was provided. 

468# ----------------------------------------------------------------------------- 

469def get_caller_module_dict(levels): 

470 f = sys._getframe(levels) 

471 ldict = f.f_globals.copy() 

472 if f.f_globals != f.f_locals: 

473 ldict.update(f.f_locals) 

474 return ldict 

475 

476 

477# ----------------------------------------------------------------------------- 

478# _funcs_to_names() 

479# 

480# Given a list of regular expression functions, this converts it to a list 

481# suitable for output to a table file 

482# ----------------------------------------------------------------------------- 

483def _funcs_to_names(funclist, namelist): 

484 result = [] 

485 for f, name in zip(funclist, namelist): 

486 if f and f[0]: 

487 result.append((name, f[1])) 

488 else: 

489 result.append(f) 

490 return result 

491 

492 

493# ----------------------------------------------------------------------------- 

494# _names_to_funcs() 

495# 

496# Given a list of regular expression function names, this converts it back to 

497# functions. 

498# ----------------------------------------------------------------------------- 

499def _names_to_funcs(namelist, fdict): 

500 result = [] 

501 for n in namelist: 

502 if n and n[0]: 

503 result.append((fdict[n[0]], n[1])) 

504 else: 

505 result.append(n) 

506 return result 

507 

508 

509# ----------------------------------------------------------------------------- 

510# _form_master_re() 

511# 

512# This function takes a list of all of the regex components and attempts to 

513# form the master regular expression. Given limitations in the Python re 

514# module, it may be necessary to break the master regex into separate expressions. 

515# ----------------------------------------------------------------------------- 

516def _form_master_re(relist, reflags, ldict, toknames): 

517 if not relist: 

518 return [] 

519 regex = "|".join(relist) 

520 try: 

521 lexre = re.compile(regex, reflags) 

522 

523 # Build the index to function map for the matching engine 

524 lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1) 

525 lexindexnames = lexindexfunc[:] 

526 

527 for f, i in lexre.groupindex.items(): 

528 handle = ldict.get(f, None) 

529 if type(handle) in (types.FunctionType, types.MethodType): 

530 lexindexfunc[i] = (handle, toknames[f]) 

531 lexindexnames[i] = f 

532 elif handle is not None: 

533 lexindexnames[i] = f 

534 if f.find("ignore_") > 0: 

535 lexindexfunc[i] = (None, None) 

536 else: 

537 lexindexfunc[i] = (None, toknames[f]) 

538 

539 return [(lexre, lexindexfunc)], [regex], [lexindexnames] 

540 except Exception: 

541 m = int(len(relist) / 2) 

542 if m == 0: 

543 m = 1 

544 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) 

545 rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) 

546 return (llist + rlist), (lre + rre), (lnames + rnames) 

547 

548 

549# ----------------------------------------------------------------------------- 

550# def _statetoken(s,names) 

551# 

552# Given a declaration name s of the form "t_" and a dictionary whose keys are 

553# state names, this function returns a tuple (states,tokenname) where states 

554# is a tuple of state names and tokenname is the name of the token. For example, 

555# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') 

556# ----------------------------------------------------------------------------- 

557def _statetoken(s, names): 

558 parts = s.split("_") 

559 for i, part in enumerate(parts[1:], 1): 

560 if part not in names and part != "ANY": 

561 break 

562 

563 if i > 1: 

564 states = tuple(parts[1:i]) 

565 else: 

566 states = ("INITIAL",) 

567 

568 if "ANY" in states: 

569 states = tuple(names) 

570 

571 tokenname = "_".join(parts[i:]) 

572 return (states, tokenname) 

573 

574 

575# ----------------------------------------------------------------------------- 

576# LexerReflect() 

577# 

578# This class represents information needed to build a lexer as extracted from a 

579# user's input file. 

580# ----------------------------------------------------------------------------- 

581class LexerReflect(object): 

582 def __init__(self, ldict, log=None, reflags=0): 

583 self.ldict = ldict 

584 self.error_func = None 

585 self.tokens = [] 

586 self.reflags = reflags 

587 self.stateinfo = {"INITIAL": "inclusive"} 

588 self.modules = set() 

589 self.error = False 

590 self.log = PlyLogger(sys.stderr) if log is None else log 

591 

592 # Get all of the basic information 

593 def get_all(self): 

594 self.get_tokens() 

595 self.get_literals() 

596 self.get_states() 

597 self.get_rules() 

598 

599 # Validate all of the information 

600 def validate_all(self): 

601 self.validate_tokens() 

602 self.validate_literals() 

603 self.validate_rules() 

604 return self.error 

605 

606 # Get the tokens map 

607 def get_tokens(self): 

608 tokens = self.ldict.get("tokens", None) 

609 if not tokens: 

610 self.log.error("No token list is defined") 

611 self.error = True 

612 return 

613 

614 if not isinstance(tokens, (list, tuple)): 

615 self.log.error("tokens must be a list or tuple") 

616 self.error = True 

617 return 

618 

619 if not tokens: 

620 self.log.error("tokens is empty") 

621 self.error = True 

622 return 

623 

624 self.tokens = tokens 

625 

626 # Validate the tokens 

627 def validate_tokens(self): 

628 terminals = {} 

629 for n in self.tokens: 

630 if not _is_identifier.match(n): 

631 self.log.error("Bad token name '%s'", n) 

632 self.error = True 

633 if n in terminals: 

634 self.log.warning("Token '%s' multiply defined", n) 

635 terminals[n] = 1 

636 

637 # Get the literals specifier 

638 def get_literals(self): 

639 self.literals = self.ldict.get("literals", "") 

640 if not self.literals: 

641 self.literals = "" 

642 

643 # Validate literals 

644 def validate_literals(self): 

645 try: 

646 for c in self.literals: 

647 if not isinstance(c, StringTypes) or len(c) > 1: 

648 self.log.error("Invalid literal %s. Must be a single character", repr(c)) 

649 self.error = True 

650 

651 except TypeError: 

652 self.log.error("Invalid literals specification. literals must be a sequence of characters") 

653 self.error = True 

654 

655 def get_states(self): 

656 self.states = self.ldict.get("states", None) 

657 # Build statemap 

658 if self.states: 

659 if not isinstance(self.states, (tuple, list)): 

660 self.log.error("states must be defined as a tuple or list") 

661 self.error = True 

662 else: 

663 for s in self.states: 

664 if not isinstance(s, tuple) or len(s) != 2: 

665 self.log.error( 

666 "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", 

667 repr(s), 

668 ) 

669 self.error = True 

670 continue 

671 name, statetype = s 

672 if not isinstance(name, StringTypes): 

673 self.log.error("State name %s must be a string", repr(name)) 

674 self.error = True 

675 continue 

676 if not (statetype == "inclusive" or statetype == "exclusive"): 

677 self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name) 

678 self.error = True 

679 continue 

680 if name in self.stateinfo: 

681 self.log.error("State '%s' already defined", name) 

682 self.error = True 

683 continue 

684 self.stateinfo[name] = statetype 

685 
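# A states declaration that satisfies the checks above looks like the
# following (illustrative example; the state names are arbitrary):
#
#     states = (
#         ('ccode', 'exclusive'),    # uses only its own t_ccode_* rules
#         ('python', 'inclusive'),   # the INITIAL rules are added to its own
#     )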

686 # Get all of the symbols with a t_ prefix and sort them into various 

687 # categories (functions, strings, error functions, and ignore characters) 

688 

689 def get_rules(self): 

690 tsymbols = [f for f in self.ldict if f[:2] == "t_"] 

691 

692 # Now build up a list of functions and a list of strings 

693 self.toknames = {} # Mapping of symbols to token names 

694 self.funcsym = {} # Symbols defined as functions 

695 self.strsym = {} # Symbols defined as strings 

696 self.ignore = {} # Ignore strings by state 

697 self.errorf = {} # Error functions by state 

698 self.eoff = {} # EOF functions by state 

699 

700 for s in self.stateinfo: 

701 self.funcsym[s] = [] 

702 self.strsym[s] = [] 

703 

704 if len(tsymbols) == 0: 

705 self.log.error("No rules of the form t_rulename are defined") 

706 self.error = True 

707 return 

708 

709 for f in tsymbols: 

710 t = self.ldict[f] 

711 states, tokname = _statetoken(f, self.stateinfo) 

712 self.toknames[f] = tokname 

713 

714 if hasattr(t, "__call__"): 

715 if tokname == "error": 

716 for s in states: 

717 self.errorf[s] = t 

718 elif tokname == "eof": 

719 for s in states: 

720 self.eoff[s] = t 

721 elif tokname == "ignore": 

722 line = t.__code__.co_firstlineno 

723 file = t.__code__.co_filename 

724 self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__) 

725 self.error = True 

726 else: 

727 for s in states: 

728 self.funcsym[s].append((f, t)) 

729 elif isinstance(t, StringTypes): 

730 if tokname == "ignore": 

731 for s in states: 

732 self.ignore[s] = t 

733 if "\\" in t: 

734 self.log.warning("%s contains a literal backslash '\\'", f) 

735 

736 elif tokname == "error": 

737 self.log.error("Rule '%s' must be defined as a function", f) 

738 self.error = True 

739 else: 

740 for s in states: 

741 self.strsym[s].append((f, t)) 

742 else: 

743 self.log.error("%s not defined as a function or string", f) 

744 self.error = True 

745 

746 # Sort the functions by line number 

747 for f in self.funcsym.values(): 

748 f.sort(key=lambda x: x[1].__code__.co_firstlineno) 

749 

750 # Sort the strings by regular expression length 

751 for s in self.strsym.values(): 

752 s.sort(key=lambda x: len(x[1]), reverse=True) 

753 

754 # Validate all of the t_rules collected 

755 def validate_rules(self): 

756 for state in self.stateinfo: 

757 # Validate all rules defined by functions 

758 

759 for fname, f in self.funcsym[state]: 

760 line = f.__code__.co_firstlineno 

761 file = f.__code__.co_filename 

762 module = inspect.getmodule(f) 

763 self.modules.add(module) 

764 

765 tokname = self.toknames[fname] 

766 if isinstance(f, types.MethodType): 

767 reqargs = 2 

768 else: 

769 reqargs = 1 

770 nargs = f.__code__.co_argcount 

771 if nargs > reqargs: 

772 self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) 

773 self.error = True 

774 continue 

775 

776 if nargs < reqargs: 

777 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 

778 self.error = True 

779 continue 

780 

781 if not _get_regex(f): 

782 self.log.error( 

783 "%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__ 

784 ) 

785 self.error = True 

786 continue 

787 

788 try: 

789 c = re.compile("(?P<%s>%s)" % (fname, _get_regex(f)), self.reflags) 

790 if c.match(""): 

791 self.log.error( 

792 "%s:%d: Regular expression for rule '%s' matches empty string", 

793 file, 

794 line, 

795 f.__name__, 

796 ) 

797 self.error = True 

798 except re.error as e: 

799 self.log.error( 

800 "%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e 

801 ) 

802 if "#" in _get_regex(f): 

803 self.log.error( 

804 "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__ 

805 ) 

806 self.error = True 

807 

808 # Validate all rules defined by strings 

809 for name, r in self.strsym[state]: 

810 tokname = self.toknames[name] 

811 if tokname == "error": 

812 self.log.error("Rule '%s' must be defined as a function", name) 

813 self.error = True 

814 continue 

815 

816 if tokname not in self.tokens and tokname.find("ignore_") < 0: 

817 self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname) 

818 self.error = True 

819 continue 

820 

821 try: 

822 c = re.compile("(?P<%s>%s)" % (name, r), self.reflags) 

823 if c.match(""): 

824 self.log.error("Regular expression for rule '%s' matches empty string", name) 

825 self.error = True 

826 except re.error as e: 

827 self.log.error("Invalid regular expression for rule '%s'. %s", name, e) 

828 if "#" in r: 

829 self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name) 

830 self.error = True 

831 

832 if not self.funcsym[state] and not self.strsym[state]: 

833 self.log.error("No rules defined for state '%s'", state) 

834 self.error = True 

835 

836 # Validate the error function 

837 efunc = self.errorf.get(state, None) 

838 if efunc: 

839 f = efunc 

840 line = f.__code__.co_firstlineno 

841 file = f.__code__.co_filename 

842 module = inspect.getmodule(f) 

843 self.modules.add(module) 

844 

845 if isinstance(f, types.MethodType): 

846 reqargs = 2 

847 else: 

848 reqargs = 1 

849 nargs = f.__code__.co_argcount 

850 if nargs > reqargs: 

851 self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) 

852 self.error = True 

853 

854 if nargs < reqargs: 

855 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 

856 self.error = True 

857 

858 for module in self.modules: 

859 self.validate_module(module) 

860 

861 # ----------------------------------------------------------------------------- 

862 # validate_module() 

863 # 

864 # This checks to see if there are duplicated t_rulename() functions or strings 

865 # in the parser input file. This is done using a simple regular expression 

866 # match on each line in the source code of the given module. 

867 # ----------------------------------------------------------------------------- 

868 

869 def validate_module(self, module): 

870 try: 

871 lines, linen = inspect.getsourcelines(module) 

872 except IOError: 

873 return 

874 

875 fre = re.compile(r"\s*def\s+(t_[a-zA-Z_0-9]*)\(") 

876 sre = re.compile(r"\s*(t_[a-zA-Z_0-9]*)\s*=") 

877 

878 counthash = {} 

879 linen += 1 

880 for line in lines: 

881 m = fre.match(line) 

882 if not m: 

883 m = sre.match(line) 

884 if m: 

885 name = m.group(1) 

886 prev = counthash.get(name) 

887 if not prev: 

888 counthash[name] = linen 

889 else: 

890 filename = inspect.getsourcefile(module) 

891 self.log.error( 

892 "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev 

893 ) 

894 self.error = True 

895 linen += 1 

896 

897 

898# ----------------------------------------------------------------------------- 

899# lex(module) 

900# 

901# Build all of the regular expression rules from definitions in the supplied module 

902# ----------------------------------------------------------------------------- 

903def lex( 

904 module=None, 

905 object=None, 

906 debug=False, 

907 optimize=False, 

908 lextab="lextab", 

909 reflags=int(re.VERBOSE), 

910 nowarn=False, 

911 outputdir=None, 

912 debuglog=None, 

913 errorlog=None, 

914): 

915 

916 if lextab is None: 

917 lextab = "lextab" 

918 

919 global lexer 

920 

921 ldict = None 

922 stateinfo = {"INITIAL": "inclusive"} 

923 lexobj = Lexer() 

924 lexobj.lexoptimize = optimize 

925 global token, input 

926 

927 if errorlog is None: 

928 errorlog = PlyLogger(sys.stderr) 

929 

930 if debug: 

931 if debuglog is None: 

932 debuglog = PlyLogger(sys.stderr) 

933 

934 # Get the module dictionary used for the lexer 

935 if object: 

936 module = object 

937 

938 # Get the module dictionary used for the lexer 

939 if module: 

940 _items = [(k, getattr(module, k)) for k in dir(module)] 

941 ldict = dict(_items) 

942 # If no __file__ attribute is available, try to obtain it from the __module__ instead 

943 if "__file__" not in ldict: 

944 ldict["__file__"] = sys.modules[ldict["__module__"]].__file__ 

945 else: 

946 ldict = get_caller_module_dict(2) 

947 

948 # Determine if the module is part of a package or not. 

949 # If so, fix the tabmodule setting so that tables load correctly 

950 pkg = ldict.get("__package__") 

951 if pkg and isinstance(lextab, str): 

952 if "." not in lextab: 

953 lextab = pkg + "." + lextab 

954 

955 # Collect lexer information from the dictionary 

956 linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) 

957 linfo.get_all() 

958 if not optimize: 

959 if linfo.validate_all(): 

960 raise SyntaxError("Can't build lexer") 

961 

962 if optimize and lextab: 

963 try: 

964 lexobj.readtab(lextab, ldict) 

965 token = lexobj.token 

966 input = lexobj.input 

967 lexer = lexobj 

968 return lexobj 

969 

970 except ImportError: 

971 pass 

972 

973 # Dump some basic debugging information 

974 if debug: 

975 debuglog.info("lex: tokens = %r", linfo.tokens) 

976 debuglog.info("lex: literals = %r", linfo.literals) 

977 debuglog.info("lex: states = %r", linfo.stateinfo) 

978 

979 # Build a dictionary of valid token names 

980 lexobj.lextokens = set() 

981 for n in linfo.tokens: 

982 lexobj.lextokens.add(n) 

983 

984 # Get literals specification 

985 if isinstance(linfo.literals, (list, tuple)): 

986 lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) 

987 else: 

988 lexobj.lexliterals = linfo.literals 

989 

990 lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals) 

991 

992 # Get the stateinfo dictionary 

993 stateinfo = linfo.stateinfo 

994 

995 regexs = {} 

996 # Build the master regular expressions 

997 for state in stateinfo: 

998 regex_list = [] 

999 

1000 # Add rules defined by functions first 

1001 for fname, f in linfo.funcsym[state]: 

1002 regex_list.append("(?P<%s>%s)" % (fname, _get_regex(f))) 

1003 if debug: 

1004 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state) 

1005 

1006 # Now add all of the simple rules 

1007 for name, r in linfo.strsym[state]: 

1008 regex_list.append("(?P<%s>%s)" % (name, r)) 

1009 if debug: 

1010 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state) 

1011 

1012 regexs[state] = regex_list 

1013 

1014 # Build the master regular expressions 

1015 

1016 if debug: 

1017 debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====") 

1018 

1019 for state in regexs: 

1020 lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames) 

1021 lexobj.lexstatere[state] = lexre 

1022 lexobj.lexstateretext[state] = re_text 

1023 lexobj.lexstaterenames[state] = re_names 

1024 if debug: 

1025 for i, text in enumerate(re_text): 

1026 debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text) 

1027 

1028 # For inclusive states, we need to add the regular expressions from the INITIAL state 

1029 for state, stype in stateinfo.items(): 

1030 if state != "INITIAL" and stype == "inclusive": 

1031 lexobj.lexstatere[state].extend(lexobj.lexstatere["INITIAL"]) 

1032 lexobj.lexstateretext[state].extend(lexobj.lexstateretext["INITIAL"]) 

1033 lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames["INITIAL"]) 

1034 

1035 lexobj.lexstateinfo = stateinfo 

1036 lexobj.lexre = lexobj.lexstatere["INITIAL"] 

1037 lexobj.lexretext = lexobj.lexstateretext["INITIAL"] 

1038 lexobj.lexreflags = reflags 

1039 

1040 # Set up ignore variables 

1041 lexobj.lexstateignore = linfo.ignore 

1042 lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "") 

1043 

1044 # Set up error functions 

1045 lexobj.lexstateerrorf = linfo.errorf 

1046 lexobj.lexerrorf = linfo.errorf.get("INITIAL", None) 

1047 if not lexobj.lexerrorf: 

1048 errorlog.warning("No t_error rule is defined") 

1049 

1050 # Set up eof functions 

1051 lexobj.lexstateeoff = linfo.eoff 

1052 lexobj.lexeoff = linfo.eoff.get("INITIAL", None) 

1053 

1054 # Check state information for ignore and error rules 

1055 for s, stype in stateinfo.items(): 

1056 if stype == "exclusive": 

1057 if s not in linfo.errorf: 

1058 errorlog.warning("No error rule is defined for exclusive state '%s'", s) 

1059 if s not in linfo.ignore and lexobj.lexignore: 

1060 errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) 

1061 elif stype == "inclusive": 

1062 if s not in linfo.errorf: 

1063 linfo.errorf[s] = linfo.errorf.get("INITIAL", None) 

1064 if s not in linfo.ignore: 

1065 linfo.ignore[s] = linfo.ignore.get("INITIAL", "") 

1066 

1067 # Create global versions of the token() and input() functions 

1068 token = lexobj.token 

1069 input = lexobj.input 

1070 lexer = lexobj 

1071 

1072 # If in optimize mode, we write the lextab 

1073 if lextab and optimize: 

1074 if outputdir is None: 

1075 # If no output directory is set, the location of the output files 

1076 # is determined according to the following rules: 

1077 # - If lextab specifies a package, files go into that package directory 

1078 # - Otherwise, files go in the same directory as the specifying module 

1079 if isinstance(lextab, types.ModuleType): 

1080 srcfile = lextab.__file__ 

1081 else: 

1082 if "." not in lextab: 

1083 srcfile = ldict["__file__"] 

1084 else: 

1085 parts = lextab.split(".") 

1086 pkgname = ".".join(parts[:-1]) 

1087 exec("import %s" % pkgname) 

1088 srcfile = getattr(sys.modules[pkgname], "__file__", "") 

1089 outputdir = os.path.dirname(srcfile) 

1090 try: 

1091 lexobj.writetab(lextab, outputdir) 

1092 if lextab in sys.modules: 

1093 del sys.modules[lextab] 

1094 except IOError as e: 

1095 errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e)) 

1096 

1097 return lexobj 

1098 

1099 
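# A minimal lexer specification that the lex() function above can build from
# (an illustrative sketch, not part of this module; token names, rules and the
# import path are arbitrary):
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS = r'\+'        # tokens with no action can be plain strings
#     t_ignore = ' \t'      # characters skipped between tokens
#
#     def t_NUMBER(t):      # tokens needing an action are functions
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_newline(t):
#         r'\n+'
#         t.lexer.lineno += len(t.value)
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)
#
#     lexer = lex.lex()     # collects the t_* names from this namespace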

1100# ----------------------------------------------------------------------------- 

1101# runmain() 

1102# 

1103# This runs the lexer as a main program 

1104# ----------------------------------------------------------------------------- 

1105 

1106 

1107def runmain(lexer=None, data=None): 

1108 if not data: 

1109 try: 

1110 filename = sys.argv[1] 

1111 f = open(filename) 

1112 data = f.read() 

1113 f.close() 

1114 except IndexError: 

1115 sys.stdout.write("Reading from standard input (type EOF to end):\n") 

1116 data = sys.stdin.read() 

1117 

1118 if lexer: 

1119 _input = lexer.input 

1120 else: 

1121 _input = input 

1122 _input(data) 

1123 if lexer: 

1124 _token = lexer.token 

1125 else: 

1126 _token = token 

1127 

1128 while True: 

1129 tok = _token() 

1130 if not tok: 

1131 break 

1132 sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos)) 

1133 

1134 

1135# ----------------------------------------------------------------------------- 

1136# @TOKEN(regex) 

1137# 

1138# This decorator function can be used to set the regex expression on a function 

1139# when its docstring might need to be set in an alternative way 

1140# ----------------------------------------------------------------------------- 

1141 

1142 

1143def TOKEN(r): 

1144 def set_regex(f): 

1145 if hasattr(r, "__call__"):    [branch 1145 ↛ 1146 not taken: the condition on line 1145 was never true]

1146 f.regex = _get_regex(r) 

1147 else: 

1148 f.regex = r 

1149 return f 

1150 

1151 return set_regex 

1152 

1153 

1154# Alternative spelling of the TOKEN decorator 

1155Token = TOKEN
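# A short usage sketch for the decorator above (illustrative only): @TOKEN
# lets a rule's regular expression be assembled separately instead of being
# written in the rule's docstring, e.g.
#
#     digit      = r'[0-9]'
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t
#
# Passing another rule function instead of a string makes the decorated rule
# reuse that function's regular expression, which is what the
# hasattr(r, "__call__") branch in set_regex() handles.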