Coverage for python/lsst/daf/butler/registry/queries/expressions/parser/ply/lex.py: 7%

692 statements  

coverage.py v7.2.4, created at 2023-04-29 02:58 -0700

1# ----------------------------------------------------------------------------- 

2# ply: lex.py 

3# 

4# Copyright (C) 2001-2018 

5# David M. Beazley (Dabeaz LLC) 

6# All rights reserved. 

7# 

8# Redistribution and use in source and binary forms, with or without 

9# modification, are permitted provided that the following conditions are 

10# met: 

11# 

12# * Redistributions of source code must retain the above copyright notice, 

13# this list of conditions and the following disclaimer. 

14# * Redistributions in binary form must reproduce the above copyright notice, 

15# this list of conditions and the following disclaimer in the documentation 

16# and/or other materials provided with the distribution. 

17# * Neither the name of the David Beazley or Dabeaz LLC may be used to 

18# endorse or promote products derived from this software without 

19# specific prior written permission. 

20# 

21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 

22# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 

23# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 

24# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 

25# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 

26# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 

27# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 

28# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 

29# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 

30# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 

31# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

32# ----------------------------------------------------------------------------- 

33 

34__version__ = "3.11" 

35__tabversion__ = "3.10" 

36 

37import copy 

38import inspect 

39import os 

40import re 

41import sys 

42import types 

43 

44# This tuple contains known string types 

45try: 

46 # Python 2.6 

47 StringTypes = (types.StringType, types.UnicodeType) 

48except AttributeError: 

49 # Python 3.0 

50 StringTypes = (str, bytes) 

51 

52# This regular expression is used to match valid token names 

53_is_identifier = re.compile(r"^[a-zA-Z0-9_]+$") 

54 

55 

56# Exception thrown when an invalid token is encountered and no default error 

57# handler is defined. 

58class LexError(Exception): 

59 def __init__(self, message, s): 

60 self.args = (message,) 

61 self.text = s 

62 

63 

64# Token class. This class is used to represent the tokens produced. 

65class LexToken(object): 

66 def __str__(self): 

67 return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos) 

68 

69 def __repr__(self): 

70 return str(self) 

71 

72 

73# This object is a stand-in for a logging object created by the 

74# logging module. 

75 

76 

77class PlyLogger(object): 

78 def __init__(self, f): 

79 self.f = f 

80 

81 def critical(self, msg, *args, **kwargs): 

82 self.f.write((msg % args) + "\n") 

83 

84 def warning(self, msg, *args, **kwargs): 

85 self.f.write("WARNING: " + (msg % args) + "\n") 

86 

87 def error(self, msg, *args, **kwargs): 

88 self.f.write("ERROR: " + (msg % args) + "\n") 

89 

90 info = critical 

91 debug = critical 

92 

93 

94# Null logger is used when no output is generated. Does nothing. 

95class NullLogger(object): 

96 def __getattribute__(self, name): 

97 return self 

98 

99 def __call__(self, *args, **kwargs): 

100 return self 

101 

102 

103# ----------------------------------------------------------------------------- 

104# === Lexing Engine === 

105# 

106# The following Lexer class implements the lexer runtime. There are only 

107# a few public methods and attributes: 

108# 

109# input() - Store a new string in the lexer 

110# token() - Get the next token 

111# clone() - Clone the lexer 

112# 

113# lineno - Current line number 

114# lexpos - Current position in the input string 

115# ----------------------------------------------------------------------------- 
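
A minimal usage sketch of this runtime API (illustrative only: the token name, rules, and input string are invented, and the upstream ply.lex import path is assumed rather than this vendored copy):

    import ply.lex as lex

    tokens = ('WORD',)
    t_WORD = r'\w+'
    t_ignore = ' '
    def t_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()                # build a Lexer from the t_* rules above
    lexer.input("two words")         # input() - store a new string in the lexer
    tok = lexer.token()              # token() - returns a LexToken, or None at end of input
    print(tok.type, tok.value, tok.lineno, tok.lexpos)   # WORD two 1 0
    print([t.value for t in lexer])  # iteration wraps token(): ['words']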

116 

117 

118class Lexer: 

119 def __init__(self): 

120 self.lexre = None # Master regular expression. This is a list of 

121 # tuples (re, findex) where re is a compiled 

122 # regular expression and findex is a list 

123 # mapping regex group numbers to rules 

124 self.lexretext = None # Current regular expression strings 

125 self.lexstatere = {} # Dictionary mapping lexer states to master regexes 

126 self.lexstateretext = {} # Dictionary mapping lexer states to regex strings 

127 self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names 

128 self.lexstate = "INITIAL" # Current lexer state 

129 self.lexstatestack = [] # Stack of lexer states 

130 self.lexstateinfo = None # State information 

131 self.lexstateignore = {} # Dictionary of ignored characters for each state 

132 self.lexstateerrorf = {} # Dictionary of error functions for each state 

133 self.lexstateeoff = {} # Dictionary of eof functions for each state 

134 self.lexreflags = 0 # Optional re compile flags 

135 self.lexdata = None # Actual input data (as a string) 

136 self.lexpos = 0 # Current position in input text 

137 self.lexlen = 0 # Length of the input text 

138 self.lexerrorf = None # Error rule (if any) 

139 self.lexeoff = None # EOF rule (if any) 

140 self.lextokens = None # List of valid tokens 

141 self.lexignore = "" # Ignored characters 

142 self.lexliterals = "" # Literal characters that can be passed through 

143 self.lexmodule = None # Module 

144 self.lineno = 1 # Current line number 

145 self.lexoptimize = False # Optimized mode 

146 

147 def clone(self, object=None): 

148 c = copy.copy(self) 

149 

150 # If the object parameter has been supplied, it means we are attaching the 

151 # lexer to a new object. In this case, we have to rebind all methods in 

152 # the lexstatere and lexstateerrorf tables. 

153 

154 if object: 

155 newtab = {} 

156 for key, ritem in self.lexstatere.items(): 

157 newre = [] 

158 for cre, findex in ritem: 

159 newfindex = [] 

160 for f in findex: 

161 if not f or not f[0]: 

162 newfindex.append(f) 

163 continue 

164 newfindex.append((getattr(object, f[0].__name__), f[1])) 

165 newre.append((cre, newfindex)) 

166 newtab[key] = newre 

167 c.lexstatere = newtab 

168 c.lexstateerrorf = {} 

169 for key, ef in self.lexstateerrorf.items(): 

170 c.lexstateerrorf[key] = getattr(object, ef.__name__) 

171 c.lexmodule = object 

172 return c 

173 

174 # ------------------------------------------------------------ 

175 # writetab() - Write lexer information to a table file 

176 # ------------------------------------------------------------ 

177 def writetab(self, lextab, outputdir=""): 

178 if isinstance(lextab, types.ModuleType): 

179 raise IOError("Won't overwrite existing lextab module") 

180 basetabmodule = lextab.split(".")[-1] 

181 filename = os.path.join(outputdir, basetabmodule) + ".py" 

182 with open(filename, "w") as tf: 

183 tf.write( 

184 "# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" 

185 % (basetabmodule, __version__) 

186 ) 

187 tf.write("_tabversion = %s\n" % repr(__tabversion__)) 

188 tf.write("_lextokens = set(%s)\n" % repr(tuple(sorted(self.lextokens)))) 

189 tf.write("_lexreflags = %s\n" % repr(int(self.lexreflags))) 

190 tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) 

191 tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) 

192 

193 # Rewrite the lexstatere table, replacing function objects with function names 

194 tabre = {} 

195 for statename, lre in self.lexstatere.items(): 

196 titem = [] 

197 for (pat, func), retext, renames in zip( 

198 lre, self.lexstateretext[statename], self.lexstaterenames[statename] 

199 ): 

200 titem.append((retext, _funcs_to_names(func, renames))) 

201 tabre[statename] = titem 

202 

203 tf.write("_lexstatere = %s\n" % repr(tabre)) 

204 tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) 

205 

206 taberr = {} 

207 for statename, ef in self.lexstateerrorf.items(): 

208 taberr[statename] = ef.__name__ if ef else None 

209 tf.write("_lexstateerrorf = %s\n" % repr(taberr)) 

210 

211 tabeof = {} 

212 for statename, ef in self.lexstateeoff.items(): 

213 tabeof[statename] = ef.__name__ if ef else None 

214 tf.write("_lexstateeoff = %s\n" % repr(tabeof)) 
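
Based on the writes above, the generated table module looks roughly like this (a sketch for an invented two-token lexer; the exact reprs vary):

    # lextab.py. This file automatically created by PLY (version 3.11). Don't edit!
    _tabversion   = '3.10'
    _lextokens    = set(('NUMBER', 'PLUS'))
    _lexreflags   = 64
    _lexliterals  = ''
    _lexstateinfo = {'INITIAL': 'inclusive'}
    _lexstatere   = {'INITIAL': [('(?P<t_NUMBER>\\d+)|(?P<t_PLUS>\\+)',
                                  [None, (None, 'NUMBER'), (None, 'PLUS')])]}
    _lexstateignore = {'INITIAL': ' \t'}
    _lexstateerrorf = {'INITIAL': 't_error'}
    _lexstateeoff   = {}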

215 

216 # ------------------------------------------------------------ 

217 # readtab() - Read lexer information from a tab file 

218 # ------------------------------------------------------------ 

219 def readtab(self, tabfile, fdict): 

220 if isinstance(tabfile, types.ModuleType): 

221 lextab = tabfile 

222 else: 

223 exec("import %s" % tabfile) 

224 lextab = sys.modules[tabfile] 

225 

226 if getattr(lextab, "_tabversion", "0.0") != __tabversion__: 

227 raise ImportError("Inconsistent PLY version") 

228 

229 self.lextokens = lextab._lextokens 

230 self.lexreflags = lextab._lexreflags 

231 self.lexliterals = lextab._lexliterals 

232 self.lextokens_all = self.lextokens | set(self.lexliterals) 

233 self.lexstateinfo = lextab._lexstateinfo 

234 self.lexstateignore = lextab._lexstateignore 

235 self.lexstatere = {} 

236 self.lexstateretext = {} 

237 for statename, lre in lextab._lexstatere.items(): 

238 titem = [] 

239 txtitem = [] 

240 for pat, func_name in lre: 

241 titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict))) 

242 

243 self.lexstatere[statename] = titem 

244 self.lexstateretext[statename] = txtitem 

245 

246 self.lexstateerrorf = {} 

247 for statename, ef in lextab._lexstateerrorf.items(): 

248 self.lexstateerrorf[statename] = fdict[ef] 

249 

250 self.lexstateeoff = {} 

251 for statename, ef in lextab._lexstateeoff.items(): 

252 self.lexstateeoff[statename] = fdict[ef] 

253 

254 self.begin("INITIAL") 
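
In optimized mode, writetab() and readtab() form a round trip: the first run writes the table file, and later runs import it instead of re-reflecting and re-validating the rules. A hypothetical call, assuming a module with t_* rules like the sketches elsewhere in this listing:

    lexer = lex.lex(optimize=True, lextab='lextab')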

255 

256 # ------------------------------------------------------------ 

257 # input() - Push a new string into the lexer 

258 # ------------------------------------------------------------ 

259 def input(self, s): 

260 # Pull off the first character to see if s looks like a string 

261 c = s[:1] 

262 if not isinstance(c, StringTypes): 

263 raise ValueError("Expected a string") 

264 self.lexdata = s 

265 self.lexpos = 0 

266 self.lexlen = len(s) 

267 

268 # ------------------------------------------------------------ 

269 # begin() - Changes the lexing state 

270 # ------------------------------------------------------------ 

271 def begin(self, state): 

272 if state not in self.lexstatere: 

273 raise ValueError("Undefined state") 

274 self.lexre = self.lexstatere[state] 

275 self.lexretext = self.lexstateretext[state] 

276 self.lexignore = self.lexstateignore.get(state, "") 

277 self.lexerrorf = self.lexstateerrorf.get(state, None) 

278 self.lexeoff = self.lexstateeoff.get(state, None) 

279 self.lexstate = state 

280 

281 # ------------------------------------------------------------ 

282 # push_state() - Changes the lexing state and saves old on stack 

283 # ------------------------------------------------------------ 

284 def push_state(self, state): 

285 self.lexstatestack.append(self.lexstate) 

286 self.begin(state) 

287 

288 # ------------------------------------------------------------ 

289 # pop_state() - Restores the previous state 

290 # ------------------------------------------------------------ 

291 def pop_state(self): 

292 self.begin(self.lexstatestack.pop()) 

293 

294 # ------------------------------------------------------------ 

295 # current_state() - Returns the current lexing state 

296 # ------------------------------------------------------------ 

297 def current_state(self): 

298 return self.lexstate 
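
The state-changing methods above are normally driven from token rules in the user's lexer module. A small self-contained sketch (state, token, and rule names are invented; the upstream ply import path is assumed):

    import ply.lex as lex

    tokens = ('CODE',)
    states = (('comment', 'exclusive'),)     # one extra, exclusive lexing state

    def t_begin_comment(t):
        r'/\*'
        t.lexer.push_state('comment')        # save 'INITIAL' on the stack and switch

    def t_comment_end(t):
        r'\*/'
        t.lexer.pop_state()                  # restore the previous state

    t_comment_ignore = ''
    def t_comment_error(t):
        t.lexer.skip(1)                      # consume the comment body one char at a time

    t_CODE = r'[a-z]+'
    t_ignore = ' '
    def t_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()
    lexer.input("abc /* hidden */ def")
    print([tok.value for tok in lexer])      # ['abc', 'def']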

299 

300 # ------------------------------------------------------------ 

301 # skip() - Skip ahead n characters 

302 # ------------------------------------------------------------ 

303 def skip(self, n): 

304 self.lexpos += n 

305 

306 # ------------------------------------------------------------ 

307 # opttoken() - Return the next token from the Lexer 

308 # 

309 # Note: This function has been carefully implemented to be as fast 

310 # as possible. Don't make changes unless you really know what 

311 # you are doing 

312 # ------------------------------------------------------------ 

313 def token(self): 

314 # Make local copies of frequently referenced attributes 

315 lexpos = self.lexpos 

316 lexlen = self.lexlen 

317 lexignore = self.lexignore 

318 lexdata = self.lexdata 

319 

320 while lexpos < lexlen: 

321 # This code provides some short-circuit code for whitespace, tabs, and other ignored characters 

322 if lexdata[lexpos] in lexignore: 

323 lexpos += 1 

324 continue 

325 

326 # Look for a regular expression match 

327 for lexre, lexindexfunc in self.lexre: 

328 m = lexre.match(lexdata, lexpos) 

329 if not m: 

330 continue 

331 

332 # Create a token for return 

333 tok = LexToken() 

334 tok.value = m.group() 

335 tok.lineno = self.lineno 

336 tok.lexpos = lexpos 

337 

338 i = m.lastindex 

339 func, tok.type = lexindexfunc[i] 

340 

341 if not func: 

342 # If no token type was set, it's an ignored token 

343 if tok.type: 

344 self.lexpos = m.end() 

345 return tok 

346 else: 

347 lexpos = m.end() 

348 break 

349 

350 lexpos = m.end() 

351 

352 # If token is processed by a function, call it 

353 

354 tok.lexer = self # Set additional attributes useful in token rules 

355 self.lexmatch = m 

356 self.lexpos = lexpos 

357 

358 newtok = func(tok) 

359 

360 # Every function must return a token. If nothing is returned, we just move on to the next token 

361 if not newtok: 

362 lexpos = self.lexpos # This is here in case user has updated lexpos. 

363 lexignore = self.lexignore # This is here in case there was a state change 

364 break 

365 

366 # Verify type of the token. If not in the token map, raise an error 

367 if not self.lexoptimize: 

368 if newtok.type not in self.lextokens_all: 

369 raise LexError( 

370 "%s:%d: Rule '%s' returned an unknown token type '%s'" 

371 % ( 

372 func.__code__.co_filename, 

373 func.__code__.co_firstlineno, 

374 func.__name__, 

375 newtok.type, 

376 ), 

377 lexdata[lexpos:], 

378 ) 

379 

380 return newtok 

381 else: 

382 # No match, see if in literals 

383 if lexdata[lexpos] in self.lexliterals: 

384 tok = LexToken() 

385 tok.value = lexdata[lexpos] 

386 tok.lineno = self.lineno 

387 tok.type = tok.value 

388 tok.lexpos = lexpos 

389 self.lexpos = lexpos + 1 

390 return tok 

391 

392 # No match. Call t_error() if defined. 

393 if self.lexerrorf: 

394 tok = LexToken() 

395 tok.value = self.lexdata[lexpos:] 

396 tok.lineno = self.lineno 

397 tok.type = "error" 

398 tok.lexer = self 

399 tok.lexpos = lexpos 

400 self.lexpos = lexpos 

401 newtok = self.lexerrorf(tok) 

402 if lexpos == self.lexpos: 

403 # Error method didn't change text position at all. This is an error. 

404 raise LexError( 

405 "Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:] 

406 ) 

407 lexpos = self.lexpos 

408 if not newtok: 

409 continue 

410 return newtok 

411 

412 self.lexpos = lexpos 

413 raise LexError( 

414 "Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:] 

415 ) 

416 

417 if self.lexeoff: 

418 tok = LexToken() 

419 tok.type = "eof" 

420 tok.value = "" 

421 tok.lineno = self.lineno 

422 tok.lexpos = lexpos 

423 tok.lexer = self 

424 self.lexpos = lexpos 

425 newtok = self.lexeoff(tok) 

426 return newtok 

427 

428 self.lexpos = lexpos + 1 

429 if self.lexdata is None: 

430 raise RuntimeError("No input string given with input()") 

431 return None 

432 

433 # Iterator interface 

434 def __iter__(self): 

435 return self 

436 

437 def next(self): 

438 t = self.token() 

439 if t is None: 

440 raise StopIteration 

441 return t 

442 

443 __next__ = next 

444 

445 

446# ----------------------------------------------------------------------------- 

447# === Lex Builder === 

448# 

449# The functions and classes below are used to collect lexing information 

450# and build a Lexer object from it. 

451# ----------------------------------------------------------------------------- 

452 

453 

454# ----------------------------------------------------------------------------- 

455# _get_regex(func) 

456# 

457# Returns the regular expression assigned to a function either as a doc string 

458# or as a .regex attribute attached by the @TOKEN decorator. 

459# ----------------------------------------------------------------------------- 

460def _get_regex(func): 

461 return getattr(func, "regex", func.__doc__) 

462 

463 

464# ----------------------------------------------------------------------------- 

465# get_caller_module_dict() 

466# 

467# This function returns a dictionary containing all of the symbols defined within 

468# a caller further down the call stack. This is used to get the environment 

469# associated with the lex() call if none was provided. 

470# ----------------------------------------------------------------------------- 

471def get_caller_module_dict(levels): 

472 f = sys._getframe(levels) 

473 ldict = f.f_globals.copy() 

474 if f.f_globals != f.f_locals: 

475 ldict.update(f.f_locals) 

476 return ldict 

477 

478 

479# ----------------------------------------------------------------------------- 

480# _funcs_to_names() 

481# 

482# Given a list of regular expression functions, this converts it to a list 

483# suitable for output to a table file 

484# ----------------------------------------------------------------------------- 

485def _funcs_to_names(funclist, namelist): 

486 result = [] 

487 for f, name in zip(funclist, namelist): 

488 if f and f[0]: 

489 result.append((name, f[1])) 

490 else: 

491 result.append(f) 

492 return result 

493 

494 

495# ----------------------------------------------------------------------------- 

496# _names_to_funcs() 

497# 

498# Given a list of regular expression function names, this converts it back to 

499# functions. 

500# ----------------------------------------------------------------------------- 

501def _names_to_funcs(namelist, fdict): 

502 result = [] 

503 for n in namelist: 

504 if n and n[0]: 

505 result.append((fdict[n[0]], n[1])) 

506 else: 

507 result.append(n) 

508 return result 

509 

510 

511# ----------------------------------------------------------------------------- 

512# _form_master_re() 

513# 

514# This function takes a list of all of the regex components and attempts to 

515# form the master regular expression. Given limitations in the Python re 

516# module, it may be necessary to break the master regex into separate expressions. 

517# ----------------------------------------------------------------------------- 

518def _form_master_re(relist, reflags, ldict, toknames): 

519 if not relist: 

520 return [] 

521 regex = "|".join(relist) 

522 try: 

523 lexre = re.compile(regex, reflags) 

524 

525 # Build the index to function map for the matching engine 

526 lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1) 

527 lexindexnames = lexindexfunc[:] 

528 

529 for f, i in lexre.groupindex.items(): 

530 handle = ldict.get(f, None) 

531 if type(handle) in (types.FunctionType, types.MethodType): 

532 lexindexfunc[i] = (handle, toknames[f]) 

533 lexindexnames[i] = f 

534 elif handle is not None: 

535 lexindexnames[i] = f 

536 if f.find("ignore_") > 0: 

537 lexindexfunc[i] = (None, None) 

538 else: 

539 lexindexfunc[i] = (None, toknames[f]) 

540 

541 return [(lexre, lexindexfunc)], [regex], [lexindexnames] 

542 except Exception: 

543 m = int(len(relist) / 2) 

544 if m == 0: 

545 m = 1 

546 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) 

547 rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) 

548 return (llist + rlist), (lre + rre), (lnames + rnames) 
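
For example, two simple rules named t_NUMBER and t_PLUS end up combined into one master pattern of named groups, along these lines (illustrative):

    import re

    master = re.compile(r"(?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)")
    m = master.match("42 + 1")
    print(m.lastindex, m.group())   # 1 42  (the group index identifies which rule matched)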

549 

550 

551# ----------------------------------------------------------------------------- 

552# def _statetoken(s,names) 

553# 

554# Given a declaration name s of the form "t_" and a dictionary whose keys are 

555# state names, this function returns a tuple (states,tokenname) where states 

556# is a tuple of state names and tokenname is the name of the token. For example, 

557# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') 

558# ----------------------------------------------------------------------------- 

559def _statetoken(s, names): 

560 parts = s.split("_") 

561 for i, part in enumerate(parts[1:], 1): 

562 if part not in names and part != "ANY": 

563 break 

564 

565 if i > 1: 

566 states = tuple(parts[1:i]) 

567 else: 

568 states = ("INITIAL",) 

569 

570 if "ANY" in states: 

571 states = tuple(names) 

572 

573 tokenname = "_".join(parts[i:]) 

574 return (states, tokenname) 

575 

576 

577# ----------------------------------------------------------------------------- 

578# LexerReflect() 

579# 

580# This class represents information needed to build a lexer as extracted from a 

581# user's input file. 

582# ----------------------------------------------------------------------------- 

583class LexerReflect(object): 

584 def __init__(self, ldict, log=None, reflags=0): 

585 self.ldict = ldict 

586 self.error_func = None 

587 self.tokens = [] 

588 self.reflags = reflags 

589 self.stateinfo = {"INITIAL": "inclusive"} 

590 self.modules = set() 

591 self.error = False 

592 self.log = PlyLogger(sys.stderr) if log is None else log 

593 

594 # Get all of the basic information 

595 def get_all(self): 

596 self.get_tokens() 

597 self.get_literals() 

598 self.get_states() 

599 self.get_rules() 

600 

601 # Validate all of the information 

602 def validate_all(self): 

603 self.validate_tokens() 

604 self.validate_literals() 

605 self.validate_rules() 

606 return self.error 

607 

608 # Get the tokens map 

609 def get_tokens(self): 

610 tokens = self.ldict.get("tokens", None) 

611 if not tokens: 

612 self.log.error("No token list is defined") 

613 self.error = True 

614 return 

615 

616 if not isinstance(tokens, (list, tuple)): 

617 self.log.error("tokens must be a list or tuple") 

618 self.error = True 

619 return 

620 

621 if not tokens: 

622 self.log.error("tokens is empty") 

623 self.error = True 

624 return 

625 

626 self.tokens = tokens 

627 

628 # Validate the tokens 

629 def validate_tokens(self): 

630 terminals = {} 

631 for n in self.tokens: 

632 if not _is_identifier.match(n): 

633 self.log.error("Bad token name '%s'", n) 

634 self.error = True 

635 if n in terminals: 

636 self.log.warning("Token '%s' multiply defined", n) 

637 terminals[n] = 1 

638 

639 # Get the literals specifier 

640 def get_literals(self): 

641 self.literals = self.ldict.get("literals", "") 

642 if not self.literals: 

643 self.literals = "" 

644 

645 # Validate literals 

646 def validate_literals(self): 

647 try: 

648 for c in self.literals: 

649 if not isinstance(c, StringTypes) or len(c) > 1: 

650 self.log.error("Invalid literal %s. Must be a single character", repr(c)) 

651 self.error = True 

652 

653 except TypeError: 

654 self.log.error("Invalid literals specification. literals must be a sequence of characters") 

655 self.error = True 

656 

657 def get_states(self): 

658 self.states = self.ldict.get("states", None) 

659 # Build statemap 

660 if self.states: 

661 if not isinstance(self.states, (tuple, list)): 

662 self.log.error("states must be defined as a tuple or list") 

663 self.error = True 

664 else: 

665 for s in self.states: 

666 if not isinstance(s, tuple) or len(s) != 2: 

667 self.log.error( 

668 "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", 

669 repr(s), 

670 ) 

671 self.error = True 

672 continue 

673 name, statetype = s 

674 if not isinstance(name, StringTypes): 

675 self.log.error("State name %s must be a string", repr(name)) 

676 self.error = True 

677 continue 

678 if not (statetype == "inclusive" or statetype == "exclusive"): 

679 self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name) 

680 self.error = True 

681 continue 

682 if name in self.stateinfo: 

683 self.log.error("State '%s' already defined", name) 

684 self.error = True 

685 continue 

686 self.stateinfo[name] = statetype 

687 

688 # Get all of the symbols with a t_ prefix and sort them into various 

689 # categories (functions, strings, error functions, and ignore characters) 

690 

691 def get_rules(self): 

692 tsymbols = [f for f in self.ldict if f[:2] == "t_"] 

693 

694 # Now build up a list of functions and a list of strings 

695 self.toknames = {} # Mapping of symbols to token names 

696 self.funcsym = {} # Symbols defined as functions 

697 self.strsym = {} # Symbols defined as strings 

698 self.ignore = {} # Ignore strings by state 

699 self.errorf = {} # Error functions by state 

700 self.eoff = {} # EOF functions by state 

701 

702 for s in self.stateinfo: 

703 self.funcsym[s] = [] 

704 self.strsym[s] = [] 

705 

706 if len(tsymbols) == 0: 

707 self.log.error("No rules of the form t_rulename are defined") 

708 self.error = True 

709 return 

710 

711 for f in tsymbols: 

712 t = self.ldict[f] 

713 states, tokname = _statetoken(f, self.stateinfo) 

714 self.toknames[f] = tokname 

715 

716 if hasattr(t, "__call__"): 

717 if tokname == "error": 

718 for s in states: 

719 self.errorf[s] = t 

720 elif tokname == "eof": 

721 for s in states: 

722 self.eoff[s] = t 

723 elif tokname == "ignore": 

724 line = t.__code__.co_firstlineno 

725 file = t.__code__.co_filename 

726 self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__) 

727 self.error = True 

728 else: 

729 for s in states: 

730 self.funcsym[s].append((f, t)) 

731 elif isinstance(t, StringTypes): 

732 if tokname == "ignore": 

733 for s in states: 

734 self.ignore[s] = t 

735 if "\\" in t: 

736 self.log.warning("%s contains a literal backslash '\\'", f) 

737 

738 elif tokname == "error": 

739 self.log.error("Rule '%s' must be defined as a function", f) 

740 self.error = True 

741 else: 

742 for s in states: 

743 self.strsym[s].append((f, t)) 

744 else: 

745 self.log.error("%s not defined as a function or string", f) 

746 self.error = True 

747 

748 # Sort the functions by line number 

749 for f in self.funcsym.values(): 

750 f.sort(key=lambda x: x[1].__code__.co_firstlineno) 

751 

752 # Sort the strings by regular expression length 

753 for s in self.strsym.values(): 

754 s.sort(key=lambda x: len(x[1]), reverse=True) 

755 

756 # Validate all of the t_rules collected 

757 def validate_rules(self): 

758 for state in self.stateinfo: 

759 # Validate all rules defined by functions 

760 

761 for fname, f in self.funcsym[state]: 

762 line = f.__code__.co_firstlineno 

763 file = f.__code__.co_filename 

764 module = inspect.getmodule(f) 

765 self.modules.add(module) 

766 

767 tokname = self.toknames[fname] 

768 if isinstance(f, types.MethodType): 

769 reqargs = 2 

770 else: 

771 reqargs = 1 

772 nargs = f.__code__.co_argcount 

773 if nargs > reqargs: 

774 self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) 

775 self.error = True 

776 continue 

777 

778 if nargs < reqargs: 

779 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 

780 self.error = True 

781 continue 

782 

783 if not _get_regex(f): 

784 self.log.error( 

785 "%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__ 

786 ) 

787 self.error = True 

788 continue 

789 

790 try: 

791 c = re.compile("(?P<%s>%s)" % (fname, _get_regex(f)), self.reflags) 

792 if c.match(""): 

793 self.log.error( 

794 "%s:%d: Regular expression for rule '%s' matches empty string", 

795 file, 

796 line, 

797 f.__name__, 

798 ) 

799 self.error = True 

800 except re.error as e: 

801 self.log.error( 

802 "%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e 

803 ) 

804 if "#" in _get_regex(f): 

805 self.log.error( 

806 "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__ 

807 ) 

808 self.error = True 

809 

810 # Validate all rules defined by strings 

811 for name, r in self.strsym[state]: 

812 tokname = self.toknames[name] 

813 if tokname == "error": 

814 self.log.error("Rule '%s' must be defined as a function", name) 

815 self.error = True 

816 continue 

817 

818 if tokname not in self.tokens and tokname.find("ignore_") < 0: 

819 self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname) 

820 self.error = True 

821 continue 

822 

823 try: 

824 c = re.compile("(?P<%s>%s)" % (name, r), self.reflags) 

825 if c.match(""): 

826 self.log.error("Regular expression for rule '%s' matches empty string", name) 

827 self.error = True 

828 except re.error as e: 

829 self.log.error("Invalid regular expression for rule '%s'. %s", name, e) 

830 if "#" in r: 

831 self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name) 

832 self.error = True 

833 

834 if not self.funcsym[state] and not self.strsym[state]: 

835 self.log.error("No rules defined for state '%s'", state) 

836 self.error = True 

837 

838 # Validate the error function 

839 efunc = self.errorf.get(state, None) 

840 if efunc: 

841 f = efunc 

842 line = f.__code__.co_firstlineno 

843 file = f.__code__.co_filename 

844 module = inspect.getmodule(f) 

845 self.modules.add(module) 

846 

847 if isinstance(f, types.MethodType): 

848 reqargs = 2 

849 else: 

850 reqargs = 1 

851 nargs = f.__code__.co_argcount 

852 if nargs > reqargs: 

853 self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) 

854 self.error = True 

855 

856 if nargs < reqargs: 

857 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 

858 self.error = True 

859 

860 for module in self.modules: 

861 self.validate_module(module) 

862 

863 # ----------------------------------------------------------------------------- 

864 # validate_module() 

865 # 

866 # This checks to see if there are duplicated t_rulename() functions or strings 

867 # in the lexer input file. This is done using a simple regular expression 

868 # match on each line in the source code of the given module. 

869 # ----------------------------------------------------------------------------- 

870 

871 def validate_module(self, module): 

872 try: 

873 lines, linen = inspect.getsourcelines(module) 

874 except IOError: 

875 return 

876 

877 fre = re.compile(r"\s*def\s+(t_[a-zA-Z_0-9]*)\(") 

878 sre = re.compile(r"\s*(t_[a-zA-Z_0-9]*)\s*=") 

879 

880 counthash = {} 

881 linen += 1 

882 for line in lines: 

883 m = fre.match(line) 

884 if not m: 

885 m = sre.match(line) 

886 if m: 

887 name = m.group(1) 

888 prev = counthash.get(name) 

889 if not prev: 

890 counthash[name] = linen 

891 else: 

892 filename = inspect.getsourcefile(module) 

893 self.log.error( 

894 "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev 

895 ) 

896 self.error = True 

897 linen += 1 

898 

899 

900# ----------------------------------------------------------------------------- 

901# lex(module) 

902# 

903# Build all of the regular expression rules from definitions in the supplied module 

904# ----------------------------------------------------------------------------- 

905def lex( 

906 module=None, 

907 object=None, 

908 debug=False, 

909 optimize=False, 

910 lextab="lextab", 

911 reflags=int(re.VERBOSE), 

912 nowarn=False, 

913 outputdir=None, 

914 debuglog=None, 

915 errorlog=None, 

916): 

917 if lextab is None: 

918 lextab = "lextab" 

919 

920 global lexer 

921 

922 ldict = None 

923 stateinfo = {"INITIAL": "inclusive"} 

924 lexobj = Lexer() 

925 lexobj.lexoptimize = optimize 

926 global token, input 

927 

928 if errorlog is None: 

929 errorlog = PlyLogger(sys.stderr) 

930 

931 if debug: 

932 if debuglog is None: 

933 debuglog = PlyLogger(sys.stderr) 

934 

935 # Get the module dictionary used for the lexer 

936 if object: 

937 module = object 

938 

939 # Get the module dictionary used for the lexer 

940 if module: 

941 _items = [(k, getattr(module, k)) for k in dir(module)] 

942 ldict = dict(_items) 

943 # If no __file__ attribute is available, try to obtain it from the __module__ instead 

944 if "__file__" not in ldict: 

945 ldict["__file__"] = sys.modules[ldict["__module__"]].__file__ 

946 else: 

947 ldict = get_caller_module_dict(2) 

948 

949 # Determine if the module is part of a package or not. 

950 # If so, fix the lextab setting so that tables load correctly 

951 pkg = ldict.get("__package__") 

952 if pkg and isinstance(lextab, str): 

953 if "." not in lextab: 

954 lextab = pkg + "." + lextab 

955 

956 # Collect lexer information from the dictionary 

957 linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) 

958 linfo.get_all() 

959 if not optimize: 

960 if linfo.validate_all(): 

961 raise SyntaxError("Can't build lexer") 

962 

963 if optimize and lextab: 

964 try: 

965 lexobj.readtab(lextab, ldict) 

966 token = lexobj.token 

967 input = lexobj.input 

968 lexer = lexobj 

969 return lexobj 

970 

971 except ImportError: 

972 pass 

973 

974 # Dump some basic debugging information 

975 if debug: 

976 debuglog.info("lex: tokens = %r", linfo.tokens) 

977 debuglog.info("lex: literals = %r", linfo.literals) 

978 debuglog.info("lex: states = %r", linfo.stateinfo) 

979 

980 # Build a dictionary of valid token names 

981 lexobj.lextokens = set() 

982 for n in linfo.tokens: 

983 lexobj.lextokens.add(n) 

984 

985 # Get literals specification 

986 if isinstance(linfo.literals, (list, tuple)): 

987 lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) 

988 else: 

989 lexobj.lexliterals = linfo.literals 

990 

991 lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals) 

992 

993 # Get the stateinfo dictionary 

994 stateinfo = linfo.stateinfo 

995 

996 regexs = {} 

997 # Build the master regular expressions 

998 for state in stateinfo: 

999 regex_list = [] 

1000 

1001 # Add rules defined by functions first 

1002 for fname, f in linfo.funcsym[state]: 

1003 regex_list.append("(?P<%s>%s)" % (fname, _get_regex(f))) 

1004 if debug: 

1005 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state) 

1006 

1007 # Now add all of the simple rules 

1008 for name, r in linfo.strsym[state]: 

1009 regex_list.append("(?P<%s>%s)" % (name, r)) 

1010 if debug: 

1011 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state) 

1012 

1013 regexs[state] = regex_list 

1014 

1015 # Build the master regular expressions 

1016 

1017 if debug: 

1018 debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====") 

1019 

1020 for state in regexs: 

1021 lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames) 

1022 lexobj.lexstatere[state] = lexre 

1023 lexobj.lexstateretext[state] = re_text 

1024 lexobj.lexstaterenames[state] = re_names 

1025 if debug: 

1026 for i, text in enumerate(re_text): 

1027 debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text) 

1028 

1029 # For inclusive states, we need to add the regular expressions from the INITIAL state 

1030 for state, stype in stateinfo.items(): 

1031 if state != "INITIAL" and stype == "inclusive": 

1032 lexobj.lexstatere[state].extend(lexobj.lexstatere["INITIAL"]) 

1033 lexobj.lexstateretext[state].extend(lexobj.lexstateretext["INITIAL"]) 

1034 lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames["INITIAL"]) 

1035 

1036 lexobj.lexstateinfo = stateinfo 

1037 lexobj.lexre = lexobj.lexstatere["INITIAL"] 

1038 lexobj.lexretext = lexobj.lexstateretext["INITIAL"] 

1039 lexobj.lexreflags = reflags 

1040 

1041 # Set up ignore variables 

1042 lexobj.lexstateignore = linfo.ignore 

1043 lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "") 

1044 

1045 # Set up error functions 

1046 lexobj.lexstateerrorf = linfo.errorf 

1047 lexobj.lexerrorf = linfo.errorf.get("INITIAL", None) 

1048 if not lexobj.lexerrorf: 

1049 errorlog.warning("No t_error rule is defined") 

1050 

1051 # Set up eof functions 

1052 lexobj.lexstateeoff = linfo.eoff 

1053 lexobj.lexeoff = linfo.eoff.get("INITIAL", None) 

1054 

1055 # Check state information for ignore and error rules 

1056 for s, stype in stateinfo.items(): 

1057 if stype == "exclusive": 

1058 if s not in linfo.errorf: 

1059 errorlog.warning("No error rule is defined for exclusive state '%s'", s) 

1060 if s not in linfo.ignore and lexobj.lexignore: 

1061 errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) 

1062 elif stype == "inclusive": 

1063 if s not in linfo.errorf: 

1064 linfo.errorf[s] = linfo.errorf.get("INITIAL", None) 

1065 if s not in linfo.ignore: 

1066 linfo.ignore[s] = linfo.ignore.get("INITIAL", "") 

1067 

1068 # Create global versions of the token() and input() functions 

1069 token = lexobj.token 

1070 input = lexobj.input 

1071 lexer = lexobj 

1072 

1073 # If in optimize mode, we write the lextab 

1074 if lextab and optimize: 

1075 if outputdir is None: 

1076 # If no output directory is set, the location of the output files 

1077 # is determined according to the following rules: 

1078 # - If lextab specifies a package, files go into that package directory 

1079 # - Otherwise, files go in the same directory as the specifying module 

1080 if isinstance(lextab, types.ModuleType): 

1081 srcfile = lextab.__file__ 

1082 else: 

1083 if "." not in lextab: 

1084 srcfile = ldict["__file__"] 

1085 else: 

1086 parts = lextab.split(".") 

1087 pkgname = ".".join(parts[:-1]) 

1088 exec("import %s" % pkgname) 

1089 srcfile = getattr(sys.modules[pkgname], "__file__", "") 

1090 outputdir = os.path.dirname(srcfile) 

1091 try: 

1092 lexobj.writetab(lextab, outputdir) 

1093 if lextab in sys.modules: 

1094 del sys.modules[lextab] 

1095 except IOError as e: 

1096 errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e)) 

1097 

1098 return lexobj 
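
A minimal, self-contained sketch of the kind of module lex() reflects over (token names, rules, and the input string are invented; the upstream ply import path is assumed):

    import ply.lex as lex

    tokens = ('NUMBER', 'PLUS')

    t_PLUS = r'\+'                   # simple rules can be plain strings
    t_ignore = ' \t'                 # ignored characters (a string, not a regex)

    def t_NUMBER(t):                 # rules that need an action are functions
        r'\d+'
        t.value = int(t.value)
        return t

    def t_newline(t):                # track line numbers; returns no token
        r'\n+'
        t.lexer.lineno += len(t.value)

    def t_error(t):
        print("Illegal character %r" % t.value[0])
        t.lexer.skip(1)              # skip the offending character and keep going

    lexer = lex.lex()                # reflects over the caller's t_* definitions
    lexer.input("1 + 2")
    print([(tok.type, tok.value) for tok in lexer])   # [('NUMBER', 1), ('PLUS', '+'), ('NUMBER', 2)]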

1099 

1100 

1101# ----------------------------------------------------------------------------- 

1102# runmain() 

1103# 

1104# This runs the lexer as a main program 

1105# ----------------------------------------------------------------------------- 

1106 

1107 

1108def runmain(lexer=None, data=None): 

1109 if not data: 

1110 try: 

1111 filename = sys.argv[1] 

1112 f = open(filename) 

1113 data = f.read() 

1114 f.close() 

1115 except IndexError: 

1116 sys.stdout.write("Reading from standard input (type EOF to end):\n") 

1117 data = sys.stdin.read() 

1118 

1119 if lexer: 

1120 _input = lexer.input 

1121 else: 

1122 _input = input 

1123 _input(data) 

1124 if lexer: 

1125 _token = lexer.token 

1126 else: 

1127 _token = token 

1128 

1129 while True: 

1130 tok = _token() 

1131 if not tok: 

1132 break 

1133 sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos)) 

1134 

1135 

1136# ----------------------------------------------------------------------------- 

1137# @TOKEN(regex) 

1138# 

1139# This decorator function can be used to set the regex expression on a function 

1140# when its regular expression cannot conveniently be given as a docstring 

1141# ----------------------------------------------------------------------------- 

1142 

1143 

1144def TOKEN(r): 

1145 def set_regex(f): 

1146 if hasattr(r, "__call__"):    1146 ↛ 1147: line 1146 didn't jump to line 1147, because the condition on line 1146 was never true

1147 f.regex = _get_regex(r) 

1148 else: 

1149 f.regex = r 

1150 return f 

1151 

1152 return set_regex 

1153 

1154 

1155# Alternative spelling of the TOKEN decorator 

1156Token = TOKEN
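
A short usage sketch of the decorator (the identifier pattern and rule name are invented):

    from ply.lex import TOKEN

    identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'

    @TOKEN(identifier)
    def t_ID(t):
        return t        # the regex comes from @TOKEN instead of the docstring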