From ab7536aa4d009bff83f8690b593eadfc033bafd5 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Fri, 7 Oct 2016 14:16:19 -0400 Subject: [PATCH] Squashed 'xonsh/ply/' content from commit 393cc55 git-subtree-dir: xonsh/ply git-subtree-split: 393cc558722eb892724701c110e7ae4c101c88c3 --- .gitignore | 9 + .travis.yml | 10 + ANNOUNCE | 40 + CHANGES | 1390 ++++++++++ MANIFEST.in | 8 + README.md | 273 ++ TODO | 16 + doc/internal.html | 874 +++++++ doc/makedoc.py | 194 ++ doc/ply.html | 3496 +++++++++++++++++++++++++ example/BASIC/README | 79 + example/BASIC/basic.py | 65 + example/BASIC/basiclex.py | 61 + example/BASIC/basiclog.py | 73 + example/BASIC/basinterp.py | 496 ++++ example/BASIC/basparse.py | 474 ++++ example/BASIC/dim.bas | 14 + example/BASIC/func.bas | 5 + example/BASIC/gcd.bas | 22 + example/BASIC/gosub.bas | 13 + example/BASIC/hello.bas | 4 + example/BASIC/linear.bas | 17 + example/BASIC/maxsin.bas | 12 + example/BASIC/powers.bas | 13 + example/BASIC/rand.bas | 4 + example/BASIC/sales.bas | 20 + example/BASIC/sears.bas | 18 + example/BASIC/sqrt1.bas | 5 + example/BASIC/sqrt2.bas | 4 + example/GardenSnake/GardenSnake.py | 777 ++++++ example/GardenSnake/README | 5 + example/README | 10 + example/ansic/README | 2 + example/ansic/clex.py | 168 ++ example/ansic/cparse.py | 1048 ++++++++ example/calc/calc.py | 123 + example/calcdebug/calc.py | 129 + example/calceof/calc.py | 132 + example/classcalc/calc.py | 165 ++ example/cleanup.sh | 2 + example/closurecalc/calc.py | 132 + example/hedit/hedit.py | 48 + example/newclasscalc/calc.py | 167 ++ example/optcalc/README | 9 + example/optcalc/calc.py | 134 + example/unicalc/calc.py | 133 + example/yply/README | 41 + example/yply/ylex.py | 119 + example/yply/yparse.py | 244 ++ example/yply/yply.py | 51 + ply/__init__.py | 5 + ply/cpp.py | 917 +++++++ ply/ctokens.py | 133 + ply/lex.py | 1100 ++++++++ ply/yacc.py | 3502 ++++++++++++++++++++++++++ ply/ygen.py | 74 + setup.cfg | 5 + setup.py | 31 + test/README | 7 + 
test/calclex.py | 49 + test/cleanup.sh | 4 + test/lex_closure.py | 54 + test/lex_doc1.py | 26 + test/lex_dup1.py | 29 + test/lex_dup2.py | 33 + test/lex_dup3.py | 31 + test/lex_empty.py | 20 + test/lex_error1.py | 24 + test/lex_error2.py | 26 + test/lex_error3.py | 27 + test/lex_error4.py | 27 + test/lex_hedit.py | 47 + test/lex_ignore.py | 31 + test/lex_ignore2.py | 29 + test/lex_literal1.py | 25 + test/lex_literal2.py | 25 + test/lex_literal3.py | 26 + test/lex_many_tokens.py | 27 + test/lex_module.py | 10 + test/lex_module_import.py | 42 + test/lex_object.py | 55 + test/lex_opt_alias.py | 54 + test/lex_optimize.py | 50 + test/lex_optimize2.py | 50 + test/lex_optimize3.py | 52 + test/lex_re1.py | 27 + test/lex_re2.py | 27 + test/lex_re3.py | 29 + test/lex_rule1.py | 27 + test/lex_rule2.py | 29 + test/lex_rule3.py | 27 + test/lex_state1.py | 40 + test/lex_state2.py | 40 + test/lex_state3.py | 42 + test/lex_state4.py | 41 + test/lex_state5.py | 40 + test/lex_state_noerror.py | 39 + test/lex_state_norule.py | 40 + test/lex_state_try.py | 45 + test/lex_token1.py | 19 + test/lex_token2.py | 22 + test/lex_token3.py | 24 + test/lex_token4.py | 26 + test/lex_token5.py | 31 + test/lex_token_dup.py | 29 + test/pkg_test1/__init__.py | 9 + test/pkg_test1/parsing/__init__.py | 0 test/pkg_test1/parsing/calclex.py | 47 + test/pkg_test1/parsing/calcparse.py | 66 + test/pkg_test2/__init__.py | 9 + test/pkg_test2/parsing/__init__.py | 0 test/pkg_test2/parsing/calclex.py | 47 + test/pkg_test2/parsing/calcparse.py | 66 + test/pkg_test3/__init__.py | 9 + test/pkg_test3/generated/__init__.py | 0 test/pkg_test3/parsing/__init__.py | 0 test/pkg_test3/parsing/calclex.py | 47 + test/pkg_test3/parsing/calcparse.py | 66 + test/pkg_test4/__init__.py | 25 + test/pkg_test4/parsing/__init__.py | 0 test/pkg_test4/parsing/calclex.py | 47 + test/pkg_test4/parsing/calcparse.py | 66 + test/pkg_test5/__init__.py | 9 + test/pkg_test5/parsing/__init__.py | 0 test/pkg_test5/parsing/calclex.py | 48 + 
test/pkg_test5/parsing/calcparse.py | 67 + test/pkg_test6/__init__.py | 9 + test/pkg_test6/parsing/__init__.py | 0 test/pkg_test6/parsing/calclex.py | 48 + test/pkg_test6/parsing/calcparse.py | 33 + test/pkg_test6/parsing/expression.py | 31 + test/pkg_test6/parsing/statement.py | 9 + test/testlex.py | 660 +++++ test/testyacc.py | 452 ++++ test/yacc_badargs.py | 68 + test/yacc_badid.py | 77 + test/yacc_badprec.py | 64 + test/yacc_badprec2.py | 68 + test/yacc_badprec3.py | 68 + test/yacc_badrule.py | 68 + test/yacc_badtok.py | 68 + test/yacc_dup.py | 68 + test/yacc_error1.py | 68 + test/yacc_error2.py | 68 + test/yacc_error3.py | 67 + test/yacc_error4.py | 72 + test/yacc_error5.py | 94 + test/yacc_error6.py | 80 + test/yacc_error7.py | 80 + test/yacc_inf.py | 56 + test/yacc_literal.py | 69 + test/yacc_misplaced.py | 68 + test/yacc_missing1.py | 68 + test/yacc_nested.py | 33 + test/yacc_nodoc.py | 67 + test/yacc_noerror.py | 66 + test/yacc_nop.py | 68 + test/yacc_notfunc.py | 66 + test/yacc_notok.py | 67 + test/yacc_prec1.py | 68 + test/yacc_rr.py | 72 + test/yacc_rr_unused.py | 30 + test/yacc_simple.py | 68 + test/yacc_sr.py | 63 + test/yacc_term1.py | 68 + test/yacc_unicode_literals.py | 70 + test/yacc_unused.py | 77 + test/yacc_unused_rule.py | 72 + test/yacc_uprec.py | 63 + test/yacc_uprec2.py | 63 + 170 files changed, 22929 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 ANNOUNCE create mode 100644 CHANGES create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 TODO create mode 100644 doc/internal.html create mode 100644 doc/makedoc.py create mode 100644 doc/ply.html create mode 100644 example/BASIC/README create mode 100644 example/BASIC/basic.py create mode 100644 example/BASIC/basiclex.py create mode 100644 example/BASIC/basiclog.py create mode 100644 example/BASIC/basinterp.py create mode 100644 example/BASIC/basparse.py create mode 100644 example/BASIC/dim.bas create mode 100644 
example/BASIC/func.bas create mode 100644 example/BASIC/gcd.bas create mode 100644 example/BASIC/gosub.bas create mode 100644 example/BASIC/hello.bas create mode 100644 example/BASIC/linear.bas create mode 100644 example/BASIC/maxsin.bas create mode 100644 example/BASIC/powers.bas create mode 100644 example/BASIC/rand.bas create mode 100644 example/BASIC/sales.bas create mode 100644 example/BASIC/sears.bas create mode 100644 example/BASIC/sqrt1.bas create mode 100644 example/BASIC/sqrt2.bas create mode 100644 example/GardenSnake/GardenSnake.py create mode 100644 example/GardenSnake/README create mode 100644 example/README create mode 100644 example/ansic/README create mode 100644 example/ansic/clex.py create mode 100644 example/ansic/cparse.py create mode 100644 example/calc/calc.py create mode 100644 example/calcdebug/calc.py create mode 100644 example/calceof/calc.py create mode 100755 example/classcalc/calc.py create mode 100755 example/cleanup.sh create mode 100644 example/closurecalc/calc.py create mode 100644 example/hedit/hedit.py create mode 100755 example/newclasscalc/calc.py create mode 100644 example/optcalc/README create mode 100644 example/optcalc/calc.py create mode 100644 example/unicalc/calc.py create mode 100644 example/yply/README create mode 100644 example/yply/ylex.py create mode 100644 example/yply/yparse.py create mode 100755 example/yply/yply.py create mode 100644 ply/__init__.py create mode 100644 ply/cpp.py create mode 100644 ply/ctokens.py create mode 100644 ply/lex.py create mode 100644 ply/yacc.py create mode 100644 ply/ygen.py create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test/README create mode 100644 test/calclex.py create mode 100755 test/cleanup.sh create mode 100644 test/lex_closure.py create mode 100644 test/lex_doc1.py create mode 100644 test/lex_dup1.py create mode 100644 test/lex_dup2.py create mode 100644 test/lex_dup3.py create mode 100644 test/lex_empty.py create mode 100644 test/lex_error1.py 
create mode 100644 test/lex_error2.py create mode 100644 test/lex_error3.py create mode 100644 test/lex_error4.py create mode 100644 test/lex_hedit.py create mode 100644 test/lex_ignore.py create mode 100644 test/lex_ignore2.py create mode 100644 test/lex_literal1.py create mode 100644 test/lex_literal2.py create mode 100644 test/lex_literal3.py create mode 100644 test/lex_many_tokens.py create mode 100644 test/lex_module.py create mode 100644 test/lex_module_import.py create mode 100644 test/lex_object.py create mode 100644 test/lex_opt_alias.py create mode 100644 test/lex_optimize.py create mode 100644 test/lex_optimize2.py create mode 100644 test/lex_optimize3.py create mode 100644 test/lex_re1.py create mode 100644 test/lex_re2.py create mode 100644 test/lex_re3.py create mode 100644 test/lex_rule1.py create mode 100644 test/lex_rule2.py create mode 100644 test/lex_rule3.py create mode 100644 test/lex_state1.py create mode 100644 test/lex_state2.py create mode 100644 test/lex_state3.py create mode 100644 test/lex_state4.py create mode 100644 test/lex_state5.py create mode 100644 test/lex_state_noerror.py create mode 100644 test/lex_state_norule.py create mode 100644 test/lex_state_try.py create mode 100644 test/lex_token1.py create mode 100644 test/lex_token2.py create mode 100644 test/lex_token3.py create mode 100644 test/lex_token4.py create mode 100644 test/lex_token5.py create mode 100644 test/lex_token_dup.py create mode 100644 test/pkg_test1/__init__.py create mode 100644 test/pkg_test1/parsing/__init__.py create mode 100644 test/pkg_test1/parsing/calclex.py create mode 100644 test/pkg_test1/parsing/calcparse.py create mode 100644 test/pkg_test2/__init__.py create mode 100644 test/pkg_test2/parsing/__init__.py create mode 100644 test/pkg_test2/parsing/calclex.py create mode 100644 test/pkg_test2/parsing/calcparse.py create mode 100644 test/pkg_test3/__init__.py create mode 100644 test/pkg_test3/generated/__init__.py create mode 100644 
test/pkg_test3/parsing/__init__.py create mode 100644 test/pkg_test3/parsing/calclex.py create mode 100644 test/pkg_test3/parsing/calcparse.py create mode 100644 test/pkg_test4/__init__.py create mode 100644 test/pkg_test4/parsing/__init__.py create mode 100644 test/pkg_test4/parsing/calclex.py create mode 100644 test/pkg_test4/parsing/calcparse.py create mode 100644 test/pkg_test5/__init__.py create mode 100644 test/pkg_test5/parsing/__init__.py create mode 100644 test/pkg_test5/parsing/calclex.py create mode 100644 test/pkg_test5/parsing/calcparse.py create mode 100644 test/pkg_test6/__init__.py create mode 100644 test/pkg_test6/parsing/__init__.py create mode 100644 test/pkg_test6/parsing/calclex.py create mode 100644 test/pkg_test6/parsing/calcparse.py create mode 100644 test/pkg_test6/parsing/expression.py create mode 100644 test/pkg_test6/parsing/statement.py create mode 100755 test/testlex.py create mode 100644 test/testyacc.py create mode 100644 test/yacc_badargs.py create mode 100644 test/yacc_badid.py create mode 100644 test/yacc_badprec.py create mode 100644 test/yacc_badprec2.py create mode 100644 test/yacc_badprec3.py create mode 100644 test/yacc_badrule.py create mode 100644 test/yacc_badtok.py create mode 100644 test/yacc_dup.py create mode 100644 test/yacc_error1.py create mode 100644 test/yacc_error2.py create mode 100644 test/yacc_error3.py create mode 100644 test/yacc_error4.py create mode 100644 test/yacc_error5.py create mode 100644 test/yacc_error6.py create mode 100644 test/yacc_error7.py create mode 100644 test/yacc_inf.py create mode 100644 test/yacc_literal.py create mode 100644 test/yacc_misplaced.py create mode 100644 test/yacc_missing1.py create mode 100644 test/yacc_nested.py create mode 100644 test/yacc_nodoc.py create mode 100644 test/yacc_noerror.py create mode 100644 test/yacc_nop.py create mode 100644 test/yacc_notfunc.py create mode 100644 test/yacc_notok.py create mode 100644 test/yacc_prec1.py create mode 100644 test/yacc_rr.py 
create mode 100644 test/yacc_rr_unused.py create mode 100644 test/yacc_simple.py create mode 100644 test/yacc_sr.py create mode 100644 test/yacc_term1.py create mode 100644 test/yacc_unicode_literals.py create mode 100644 test/yacc_unused.py create mode 100644 test/yacc_unused_rule.py create mode 100644 test/yacc_uprec.py create mode 100644 test/yacc_uprec2.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..bd46d6e7a --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.pyc +*.pyo +__pycache__ +*.out +*.dif +*~ +/dist +/build +/*.egg-info diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..874e3dfef --- /dev/null +++ b/.travis.yml @@ -0,0 +1,10 @@ +language: python +python: + - "2.6" + - "2.7" + - "3.2" + - "3.3" + - "3.4" +install: + - "pip install . " +script: "cd test && python testlex.py && python testyacc.py" diff --git a/ANNOUNCE b/ANNOUNCE new file mode 100644 index 000000000..79ffebd38 --- /dev/null +++ b/ANNOUNCE @@ -0,0 +1,40 @@ +October 7, 2016 + + Announcing : PLY-3.10 (Python Lex-Yacc) + + http://www.dabeaz.com/ply + +I'm pleased to announce PLY-3.10--a pure Python implementation of the +common parsing tools lex and yacc. PLY-3.10 is a minor bug fix +release. It supports both Python 2 and Python 3. + +If you are new to PLY, here are a few highlights: + +- PLY is closely modeled after traditional lex/yacc. If you know how + to use these or similar tools in other languages, you will find + PLY to be comparable. + +- PLY provides very extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + +- PLY provides full support for empty productions, error recovery, + precedence rules, and ambiguous grammars. 
+ +- Parsing is based on LR-parsing which is fast, memory efficient, + better suited to large grammars, and which has a number of nice + properties when dealing with syntax errors and other parsing + problems. Currently, PLY can build its parsing tables using + either SLR or LALR(1) algorithms. + +More information about PLY can be obtained on the PLY webpage at: + + http://www.dabeaz.com/ply + +PLY is freely available. + +Cheers, + +David Beazley (http://www.dabeaz.com) \ No newline at end of file diff --git a/CHANGES b/CHANGES new file mode 100644 index 000000000..a974a7cef --- /dev/null +++ b/CHANGES @@ -0,0 +1,1390 @@ +Version 3.10 +--------------------- +10/07/16: beazley + Fixed Issue #101: Incorrect shift-reduce conflict resolution with + precedence specifier. + + PLY was incorrectly resolving shift-reduce conflicts in certain + cases. For example, in the example/calc/calc.py example, you + could trigger it doing this: + + calc > -3 - 4 + 1 (correct answer should be -7) + calc > + + Issue and suggested patch contributed by https://github.com/RomaVis + +Version 3.9 +--------------------- +08/30/16: beazley + Exposed the parser state number as the parser.state attribute + in productions and error functions. For example: + + def p_somerule(p): + ''' + rule : A B C + ''' + print('State:', p.parser.state) + + May address issue #65 (publish current state in error callback). + +08/30/16: beazley + Fixed Issue #88. Python3 compatibility with ply/cpp. + +08/30/16: beazley + Fixed Issue #93. Ply can crash if SyntaxError is raised inside + a production. Not actually sure if the original implementation + worked as documented at all. Yacc has been modified to follow + the spec as outlined in the CHANGES noted for 11/27/07 below. + +08/30/16: beazley + Fixed Issue #97. Failure with code validation when the original + source files aren't present. Validation step now ignores + the missing file. + +08/30/16: beazley + Minor fixes to version numbers. 
+ +Version 3.8 +--------------------- +10/02/15: beazley + Fixed issues related to Python 3.5. Patch contributed by Barry Warsaw. + +Version 3.7 +--------------------- +08/25/15: beazley + Fixed problems when reading table files from pickled data. + +05/07/15: beazley + Fixed regression in handling of table modules if specified as module + objects. See https://github.com/dabeaz/ply/issues/63 + +Version 3.6 +--------------------- +04/25/15: beazley + If PLY is unable to create the 'parser.out' or 'parsetab.py' files due + to permission issues, it now just issues a warning message and + continues to operate. This could happen if a module using PLY + is installed in a funny way where tables have to be regenerated, but + for whatever reason, the user doesn't have write permission on + the directory where PLY wants to put them. + +04/24/15: beazley + Fixed some issues related to use of packages and table file + modules. Just to emphasize, PLY now generates its special + files such as 'parsetab.py' and 'lextab.py' in the *SAME* + directory as the source file that uses lex() and yacc(). + + If for some reason, you want to change the name of the table + module, use the tabmodule and lextab options: + + lexer = lex.lex(lextab='spamlextab') + parser = yacc.yacc(tabmodule='spamparsetab') + + If you specify a simple name as shown, the module will still be + created in the same directory as the file invoking lex() or yacc(). + If you want the table files to be placed into a different package, + then give a fully qualified package name. For example: + + lexer = lex.lex(lextab='pkgname.files.lextab') + parser = yacc.yacc(tabmodule='pkgname.files.parsetab') + + For this to work, 'pkgname.files' must already exist as a valid + Python package (i.e., the directories must already exist and be + set up with the proper __init__.py files, etc.). + +Version 3.5 +--------------------- +04/21/15: beazley + Added support for defaulted_states in the parser. 
A + defaulted_state is a state where the only legal action is a + reduction of a single grammar rule across all valid input + tokens. For such states, the rule is reduced and the + reading of the next lookahead token is delayed until it is + actually needed at a later point in time. + + This delay in consuming the next lookahead token is a + potentially important feature in advanced parsing + applications that require tight interaction between the + lexer and the parser. For example, a grammar rule can + modify the lexer state upon reduction and have such changes + take effect before the next input token is read. + + *** POTENTIAL INCOMPATIBILITY *** + One potential danger of defaulted_states is that syntax + errors might be deferred to a later point of processing + than where they were detected in past versions of PLY. + Thus, it's possible that your error handling could change + slightly on the same inputs. defaulted_states do not change + the overall parsing of the input (i.e., the same grammar is + accepted). + + If for some reason, you need to disable defaulted states, + you can do this: + + parser = yacc.yacc() + parser.defaulted_states = {} + +04/21/15: beazley + Fixed debug logging in the parser. It wasn't properly reporting goto states + on grammar rule reductions. + +04/20/15: beazley + Added actions to be defined to character literals (Issue #32). For example: + + literals = [ '{', '}' ] + + def t_lbrace(t): + r'\{' + # Some action + t.type = '{' + return t + + def t_rbrace(t): + r'\}' + # Some action + t.type = '}' + return t + +04/19/15: beazley + Import of the 'parsetab.py' file is now constrained to only consider the + directory specified by the outputdir argument to yacc(). If not supplied, + the import will only consider the directory in which the grammar is defined. + This should greatly reduce problems with the wrong parsetab.py file being + imported by mistake. For example, if it's found somewhere else on the path + by accident.
+ + *** POTENTIAL INCOMPATIBILITY *** It's possible that this might break some + packaging/deployment setup if PLY was instructed to place its parsetab.py + in a different location. You'll have to specify a proper outputdir= argument + to yacc() to fix this if needed. + +04/19/15: beazley + Changed default output directory to be the same as that in which the + yacc grammar is defined. If your grammar is in a file 'calc.py', + then the parsetab.py and parser.out files should be generated in the + same directory as that file. The destination directory can be changed + using the outputdir= argument to yacc(). + +04/19/15: beazley + Changed the parsetab.py file signature slightly so that the parsetab won't + regenerate if created on a different major version of Python (ie., a + parsetab created on Python 2 will work with Python 3). + +04/16/15: beazley + Fixed Issue #44 call_errorfunc() should return the result of errorfunc() + +04/16/15: beazley + Support for versions of Python <2.7 is officially dropped. PLY may work, but + the unit tests requires Python 2.7 or newer. + +04/16/15: beazley + Fixed bug related to calling yacc(start=...). PLY wasn't regenerating the + table file correctly for this case. + +04/16/15: beazley + Added skipped tests for PyPy and Java. Related to use of Python's -O option. + +05/29/13: beazley + Added filter to make unit tests pass under 'python -3'. + Reported by Neil Muller. + +05/29/13: beazley + Fixed CPP_INTEGER regex in ply/cpp.py (Issue 21). + Reported by @vbraun. + +05/29/13: beazley + Fixed yacc validation bugs when from __future__ import unicode_literals + is being used. Reported by Kenn Knowles. + +05/29/13: beazley + Added support for Travis-CI. Contributed by Kenn Knowles. + +05/29/13: beazley + Added a .gitignore file. Suggested by Kenn Knowles. + +05/29/13: beazley + Fixed validation problems for source files that include a + different source code encoding specifier. Fix relies on + the inspect module. 
Should work on Python 2.6 and newer. + Not sure about older versions of Python. + Contributed by Michael Droettboom + +05/21/13: beazley + Fixed unit tests for yacc to eliminate random failures due to dict hash value + randomization in Python 3.3 + Reported by Arfrever + +10/15/12: beazley + Fixed comment whitespace processing bugs in ply/cpp.py. + Reported by Alexei Pososin. + +10/15/12: beazley + Fixed token names in ply/ctokens.py to match rule names. + Reported by Alexei Pososin. + +04/26/12: beazley + Changes to functions available in panic mode error recovery. In previous versions + of PLY, the following global functions were available for use in the p_error() rule: + + yacc.errok() # Reset error state + yacc.token() # Get the next token + yacc.restart() # Reset the parsing stack + + The use of global variables was problematic for code involving multiple parsers + and frankly was a poor design overall. These functions have been moved to methods + of the parser instance created by the yacc() function. You should write code like + this: + + def p_error(p): + ... + parser.errok() + + parser = yacc.yacc() + + *** POTENTIAL INCOMPATIBILITY *** The original global functions now issue a + DeprecationWarning. + +04/19/12: beazley + Fixed some problems with line and position tracking and the use of error + symbols. If you have a grammar rule involving an error rule like this: + + def p_assignment_bad(p): + '''assignment : location EQUALS error SEMI''' + ... + + You can now do line and position tracking on the error token. For example: + + def p_assignment_bad(p): + '''assignment : location EQUALS error SEMI''' + start_line = p.lineno(3) + start_pos = p.lexpos(3) + + If the tracking=True option is supplied to parse(), you can additionally get + spans: + + def p_assignment_bad(p): + '''assignment : location EQUALS error SEMI''' + start_line, end_line = p.linespan(3) + start_pos, end_pos = p.lexspan(3) + + Note that error handling is still a hairy thing in PLY.
This won't work + unless your lexer is providing accurate information. Please report bugs. + Suggested by a bug reported by Davis Herring. + +04/18/12: beazley + Change to doc string handling in lex module. Regex patterns are now first + pulled from a function's .regex attribute. If that doesn't exist, then + .doc is checked as a fallback. The @TOKEN decorator now sets the .regex + attribute of a function instead of its doc string. + Changed suggested by Kristoffer Ellersgaard Koch. + +04/18/12: beazley + Fixed issue #1: Fixed _tabversion. It should use __tabversion__ instead of __version__ + Reported by Daniele Tricoli + +04/18/12: beazley + Fixed issue #8: Literals empty list causes IndexError + Reported by Walter Nissen. + +04/18/12: beazley + Fixed issue #12: Typo in code snippet in documentation + Reported by florianschanda. + +04/18/12: beazley + Fixed issue #10: Correctly escape t_XOREQUAL pattern. + Reported by Andy Kittner. + +Version 3.4 +--------------------- +02/17/11: beazley + Minor patch to make cpp.py compatible with Python 3. Note: This + is an experimental file not currently used by the rest of PLY. + +02/17/11: beazley + Fixed setup.py trove classifiers to properly list PLY as + Python 3 compatible. + +01/02/11: beazley + Migration of repository to github. + +Version 3.3 +----------------------------- +08/25/09: beazley + Fixed issue 15 related to the set_lineno() method in yacc. Reported by + mdsherry. + +08/25/09: beazley + Fixed a bug related to regular expression compilation flags not being + properly stored in lextab.py files created by the lexer when running + in optimize mode. Reported by Bruce Frederiksen. + + +Version 3.2 +----------------------------- +03/24/09: beazley + Added an extra check to not print duplicated warning messages + about reduce/reduce conflicts. + +03/24/09: beazley + Switched PLY over to a BSD-license. + +03/23/09: beazley + Performance optimization. Discovered a few places to make + speedups in LR table generation. 
+ +03/23/09: beazley + New warning message. PLY now warns about rules never + reduced due to reduce/reduce conflicts. Suggested by + Bruce Frederiksen. + +03/23/09: beazley + Some clean-up of warning messages related to reduce/reduce errors. + +03/23/09: beazley + Added a new picklefile option to yacc() to write the parsing + tables to a filename using the pickle module. Here is how + it works: + + yacc(picklefile="parsetab.p") + + This option can be used if the normal parsetab.py file is + extremely large. For example, on jython, it is impossible + to read parsing tables if the parsetab.py exceeds a certain + threshold. + + The filename supplied to the picklefile option is opened + relative to the current working directory of the Python + interpreter. If you need to refer to the file elsewhere, + you will need to supply an absolute or relative path. + + For maximum portability, the pickle file is written + using protocol 0. + +03/13/09: beazley + Fixed a bug in parser.out generation where the rule numbers + were off by one. + +03/13/09: beazley + Fixed a string formatting bug with one of the error messages. + Reported by Richard Reitmeyer + +Version 3.1 +----------------------------- +02/28/09: beazley + Fixed broken start argument to yacc(). PLY-3.0 broke this + feature by accident. + +02/28/09: beazley + Fixed debugging output. yacc() no longer reports shift/reduce + or reduce/reduce conflicts if debugging is turned off. This + restores similar behavior in PLY-2.5. Reported by Andrew Waters. + +Version 3.0 +----------------------------- +02/03/09: beazley + Fixed missing lexer attribute on certain tokens when + invoking the parser p_error() function. Reported by + Bart Whiteley. + +02/02/09: beazley + The lex() command now does all error-reporting and diagnostics + using the logging module interface. Pass in a Logger object + using the errorlog parameter to specify a different logger.
+ +02/02/09: beazley + Refactored ply.lex to use a more object-oriented and organized + approach to collecting lexer information. + +02/01/09: beazley + Removed the nowarn option from lex(). All output is controlled + by passing in a logger object. Just pass in a logger with a high + level setting to suppress output. This argument was never + documented to begin with so hopefully no one was relying upon it. + +02/01/09: beazley + Discovered and removed a dead if-statement in the lexer. This + resulted in a 6-7% speedup in lexing when I tested it. + +01/13/09: beazley + Minor change to the procedure for signalling a syntax error in a + production rule. A normal SyntaxError exception should be raised + instead of yacc.SyntaxError. + +01/13/09: beazley + Added a new method p.set_lineno(n,lineno) that can be used to set the + line number of symbol n in grammar rules. This simplifies manual + tracking of line numbers. + +01/11/09: beazley + Vastly improved debugging support for yacc.parse(). Instead of passing + debug as an integer, you can supply a Logging object (see the logging + module). Messages will be generated at the ERROR, INFO, and DEBUG + logging levels, each level providing progressively more information. + The debugging trace also shows states, grammar rule, values passed + into grammar rules, and the result of each reduction. + +01/09/09: beazley + The yacc() command now does all error-reporting and diagnostics using + the interface of the logging module. Use the errorlog parameter to + specify a logging object for error messages. Use the debuglog parameter + to specify a logging object for the 'parser.out' output. + +01/09/09: beazley + *HUGE* refactoring of the ply.yacc() implementation. The high-level + user interface is backwards compatible, but the internals are completely + reorganized into classes. No more global variables. The internals + are also more extensible.
For example, you can use the classes to + construct a LALR(1) parser in an entirely different manner than + what is currently the case. Documentation is forthcoming. + +01/07/09: beazley + Various cleanup and refactoring of yacc internals. + +01/06/09: beazley + Fixed a bug with precedence assignment. yacc was assigning the precedence + of each rule based on the left-most token, when in fact, it should have been + using the right-most token. Reported by Bruce Frederiksen. + +11/27/08: beazley + Numerous changes to support Python 3.0 including removal of deprecated + statements (e.g., has_key) and the addition of compatibility code + to emulate features from Python 2 that have been removed, but which + are needed. Fixed the unit testing suite to work with Python 3.0. + The code should be backwards compatible with Python 2. + +11/26/08: beazley + Loosened the rules on what kind of objects can be passed in as the + "module" parameter to lex() and yacc(). Previously, you could only use + a module or an instance. Now, PLY just uses dir() to get a list of + symbols on whatever the object is without regard for its type. + +11/26/08: beazley + Changed all except: statements to be compatible with Python2.x/3.x syntax. + +11/26/08: beazley + Changed all raise Exception, value statements to raise Exception(value) for + forward compatibility. + +11/26/08: beazley + Removed all print statements from lex and yacc, using sys.stdout and sys.stderr + directly. Preparation for Python 3.0 support. + +11/04/08: beazley + Fixed a bug with referring to symbols on the parsing stack using negative + indices. + +05/29/08: beazley + Completely revamped the testing system to use the unittest module for everything. + Added additional tests to cover new errors/warnings. + +Version 2.5 +----------------------------- +05/28/08: beazley + Fixed a bug with writing lex-tables in optimized mode and start states. + Reported by Kevin Henry.
+ +Version 2.4 +----------------------------- +05/04/08: beazley + A version number is now embedded in the table file signature so that + yacc can more gracefully accommodate changes to the output format + in the future. + +05/04/08: beazley + Removed undocumented .pushback() method on grammar productions. I'm + not sure this ever worked and can't recall ever using it. Might have + been an abandoned idea that never really got fleshed out. This + feature was never described or tested so removing it is hopefully + harmless. + +05/04/08: beazley + Added extra error checking to yacc() to detect precedence rules defined + for undefined terminal symbols. This allows yacc() to detect a potential + problem that can be really tricky to debug if no warning message or error + message is generated about it. + +05/04/08: beazley + lex() now has an outputdir that can specify the output directory for + tables when running in optimize mode. For example: + + lexer = lex.lex(optimize=True, lextab="ltab", outputdir="foo/bar") + + The behavior of specifying a table module and output directory are + more aligned with the behavior of yacc(). + +05/04/08: beazley + [Issue 9] + Fixed filename bug when specifying the modulename in lex() and yacc(). + If you specified options such as the following: + + parser = yacc.yacc(tabmodule="foo.bar.parsetab",outputdir="foo/bar") + + yacc would create a file "foo.bar.parsetab.py" in the given directory. + Now, it simply generates a file "parsetab.py" in that directory. + Bug reported by cptbinho. + +05/04/08: beazley + Slight modification to lex() and yacc() to allow their table files + to be loaded from a previously loaded module. This might make + it easier to load the parsing tables from a complicated package + structure. For example: + + import foo.bar.spam.parsetab as parsetab + parser = yacc.yacc(tabmodule=parsetab) + + Note: lex and yacc will never regenerate the table file if used + in this form---you will get a warning message instead.
+ This idea suggested by Brian Clapper. + + +04/28/08: beazley + Fixed a bug with p_error() functions not being picked up correctly + when running in yacc(optimize=1) mode. Patch contributed by + Bart Whiteley. + +02/28/08: beazley + Fixed a bug with 'nonassoc' precedence rules. Basically the + non-precedence was being ignored and not producing the correct + run-time behavior in the parser. + +02/16/08: beazley + Slight relaxation of what the input() method to a lexer will + accept as a string. Instead of testing the input to see + if the input is a string or unicode string, it checks to see + if the input object looks like it contains string data. + This change makes it possible to pass string-like objects + in as input. For example, the object returned by mmap. + + import mmap, os + data = mmap.mmap(os.open(filename,os.O_RDONLY), + os.path.getsize(filename), + access=mmap.ACCESS_READ) + lexer.input(data) + + +11/29/07: beazley + Modification of ply.lex to allow token functions to be aliased. + This is subtle, but it makes it easier to create libraries and + to reuse token specifications. For example, suppose you defined + a function like this: + + def number(t): + r'\d+' + t.value = int(t.value) + return t + + This change would allow you to define a token rule as follows: + + t_NUMBER = number + + In this case, the token type will be set to 'NUMBER' and use + the associated number() function to process tokens. + +11/28/07: beazley + Slight modification to lex and yacc to grab symbols from both + the local and global dictionaries of the caller. This + modification allows lexers and parsers to be defined using + inner functions and closures. + +11/28/07: beazley + Performance optimization: The lexer.lexmatch and t.lexer + attributes are no longer set for lexer tokens that are not + defined by functions. The only normal use of these attributes + would be in lexer rules that need to perform some kind of + special processing.
Thus, it doesn't make any sense to set + them on every token. + + *** POTENTIAL INCOMPATIBILITY *** This might break code + that is mucking around with internal lexer state in some + sort of magical way. + +11/27/07: beazley + Added the ability to put the parser into error-handling mode + from within a normal production. To do this, simply raise + a yacc.SyntaxError exception like this: + + def p_some_production(p): + 'some_production : prod1 prod2' + ... + raise yacc.SyntaxError # Signal an error + + A number of things happen after this occurs: + + - The last symbol shifted onto the symbol stack is discarded + and parser state backed up to what it was before the + rule reduction. + + - The current lookahead symbol is saved and replaced by + the 'error' symbol. + + - The parser enters error recovery mode where it tries + to either reduce the 'error' rule or it starts + discarding items off of the stack until the parser + resets. + + When an error is manually set, the parser does *not* call + the p_error() function (if any is defined). + *** NEW FEATURE *** Suggested on the mailing list + +11/27/07: beazley + Fixed structure bug in examples/ansic. Reported by Dion Blazakis. + +11/27/07: beazley + Fixed a bug in the lexer related to start conditions and ignored + token rules. If a rule was defined that changed state, but + returned no token, the lexer could be left in an inconsistent + state. Reported by + +11/27/07: beazley + Modified setup.py to support Python Eggs. Patch contributed by + Simon Cross. + +11/09/07: beazley + Fixed a bug in error handling in yacc. If a syntax error occurred and the + parser rolled the entire parse stack back, the parser would be left in an + inconsistent state that would cause it to trigger incorrect actions on + subsequent input. Reported by Ton Biegstraaten, Justin King, and others. + +11/09/07: beazley + Fixed a bug when passing empty input strings to yacc.parse().
This + would result in an error message about "No input given". Reported + by Andrew Dalke. + +Version 2.3 +----------------------------- +02/20/07: beazley + Fixed a bug with character literals if the literal '.' appeared as the + last symbol of a grammar rule. Reported by Ales Smrcka. + +02/19/07: beazley + Warning messages are now redirected to stderr instead of being printed + to standard output. + +02/19/07: beazley + Added a warning message to lex.py if it detects a literal backslash + character inside the t_ignore declaration. This is to help + problems that might occur if someone accidentally defines t_ignore + as a Python raw string. For example: + + t_ignore = r' \t' + + The idea for this is from an email I received from David Cimimi who + reported bizarre behavior in lexing as a result of defining t_ignore + as a raw string by accident. + +02/18/07: beazley + Performance improvements. Made some changes to the internal + table organization and LR parser to improve parsing performance. + +02/18/07: beazley + Automatic tracking of line number and position information must now be + enabled by a special flag to parse(). For example: + + yacc.parse(data,tracking=True) + + In many applications, it's just not that important to have the + parser automatically track all line numbers. By making this an + optional feature, it allows the parser to run significantly faster + (more than a 20% speed increase in many cases). Note: positional + information is always available for raw tokens---this change only + applies to positional information associated with nonterminal + grammar symbols. + *** POTENTIAL INCOMPATIBILITY *** + +02/18/07: beazley + Yacc no longer supports extended slices of grammar productions. + However, it does support regular slices. 
For example: + + def p_foo(p): + '''foo: a b c d e''' + p[0] = p[1:3] + + This change is a performance improvement to the parser--it streamlines + normal access to the grammar values since slices are now handled in + a __getslice__() method as opposed to __getitem__(). + +02/12/07: beazley + Fixed a bug in the handling of token names when combined with + start conditions. Bug reported by Todd O'Bryan. + +Version 2.2 +------------------------------ +11/01/06: beazley + Added lexpos() and lexspan() methods to grammar symbols. These + mirror the same functionality of lineno() and linespan(). For + example: + + def p_expr(p): + 'expr : expr PLUS expr' + p.lexpos(1) # Lexing position of left-hand-expression + p.lexpos(2) # Lexing position of PLUS + start,end = p.lexspan(3) # Lexing range of right hand expression + +11/01/06: beazley + Minor change to error handling. The recommended way to skip characters + in the input is to use t.lexer.skip() as shown here: + + def t_error(t): + print "Illegal character '%s'" % t.value[0] + t.lexer.skip(1) + + The old approach of just using t.skip(1) will still work, but won't + be documented. + +10/31/06: beazley + Discarded tokens can now be specified as simple strings instead of + functions. To do this, simply include the text "ignore_" in the + token declaration. For example: + + t_ignore_cppcomment = r'//.*' + + Previously, this had to be done with a function. For example: + + def t_ignore_cppcomment(t): + r'//.*' + pass + + If start conditions/states are being used, state names should appear + before the "ignore_" text. + +10/19/06: beazley + The Lex module now provides support for flex-style start conditions + as described at http://www.gnu.org/software/flex/manual/html_chapter/flex_11.html. + Please refer to this document to understand this change note. Refer to + the PLY documentation for PLY-specific explanation of how this works.
+ + To use start conditions, you first need to declare a set of states in + your lexer file: + + states = ( + ('foo','exclusive'), + ('bar','inclusive') + ) + + This serves the same role as the %s and %x specifiers in flex. + + Once a state has been declared, tokens for that state can be + declared by defining rules of the form t_state_TOK. For example: + + t_PLUS = '\+' # Rule defined in INITIAL state + t_foo_NUM = '\d+' # Rule defined in foo state + t_bar_NUM = '\d+' # Rule defined in bar state + + t_foo_bar_NUM = '\d+' # Rule defined in both foo and bar + t_ANY_NUM = '\d+' # Rule defined in all states + + In addition to defining tokens for each state, the t_ignore and t_error + specifications can be customized for specific states. For example: + + t_foo_ignore = " " # Ignored characters for foo state + def t_bar_error(t): + # Handle errors in bar state + + With token rules, the following methods can be used to change states + + def t_TOKNAME(t): + t.lexer.begin('foo') # Begin state 'foo' + t.lexer.push_state('foo') # Begin state 'foo', push old state + # onto a stack + t.lexer.pop_state() # Restore previous state + t.lexer.current_state() # Returns name of current state + + These methods mirror the BEGIN(), yy_push_state(), yy_pop_state(), and + yy_top_state() functions in flex. + + The use of start states can be used as one way to write sub-lexers. + For example, the lexer or parser might instruct the lexer to start + generating a different set of tokens depending on the context. + + example/yply/ylex.py shows the use of start states to grab C/C++ + code fragments out of traditional yacc specification files. + + *** NEW FEATURE *** Suggested by Daniel Larraz with whom I also + discussed various aspects of the design. + +10/19/06: beazley + Minor change to the way in which yacc.py was reporting shift/reduce + conflicts.
Although the underlying LALR(1) algorithm was correct, + PLY was under-reporting the number of conflicts compared to yacc/bison + when precedence rules were in effect. This change should make PLY + report the same number of conflicts as yacc. + +10/19/06: beazley + Modified yacc so that grammar rules could also include the '-' + character. For example: + + def p_expr_list(p): + 'expression-list : expression-list expression' + + Suggested by Oldrich Jedlicka. + +10/18/06: beazley + Attribute lexer.lexmatch added so that token rules can access the re + match object that was generated. For example: + + def t_FOO(t): + r'some regex' + m = t.lexer.lexmatch + # Do something with m + + + This may be useful if you want to access named groups specified within + the regex for a specific token. Suggested by Oldrich Jedlicka. + +10/16/06: beazley + Changed the error message that results if an illegal character + is encountered and no default error function is defined in lex. + The exception is now more informative about the actual cause of + the error. + +Version 2.1 +------------------------------ +10/02/06: beazley + The last Lexer object built by lex() can be found in lex.lexer. + The last Parser object built by yacc() can be found in yacc.parser. + +10/02/06: beazley + New example added: examples/yply + + This example uses PLY to convert Unix-yacc specification files to + PLY programs with the same grammar. This may be useful if you + want to convert a grammar from bison/yacc to use with PLY. + +10/02/06: beazley + Added support for a start symbol to be specified in the yacc + input file itself. Just do this: + + start = 'name' + + where 'name' matches some grammar rule. For example: + + def p_name(p): + 'name : A B C' + ... + + This mirrors the functionality of the yacc %start specifier. + +09/30/06: beazley + Some new examples added.: + + examples/GardenSnake : A simple indentation based language similar + to Python. Shows how you might handle + whitespace. 
Contributed by Andrew Dalke. + + examples/BASIC : An implementation of 1964 Dartmouth BASIC. + Contributed by Dave against his better + judgement. + +09/28/06: beazley + Minor patch to allow named groups to be used in lex regular + expression rules. For example: + + t_QSTRING = r'''(?P<quote>['"]).*?(?P=quote)''' + + Patch submitted by Adam Ring. + +09/28/06: beazley + LALR(1) is now the default parsing method. To use SLR, use + yacc.yacc(method="SLR"). Note: there is no performance impact + on parsing when using LALR(1) instead of SLR. However, constructing + the parsing tables will take a little longer. + +09/26/06: beazley + Change to line number tracking. To modify line numbers, modify + the line number of the lexer itself. For example: + + def t_NEWLINE(t): + r'\n' + t.lexer.lineno += 1 + + This modification is both cleanup and a performance optimization. + In past versions, lex was monitoring every token for changes in + the line number. This extra processing is unnecessary for a vast + majority of tokens. Thus, this new approach cleans it up a bit. + + *** POTENTIAL INCOMPATIBILITY *** + You will need to change code in your lexer that updates the line + number. For example, "t.lineno += 1" becomes "t.lexer.lineno += 1" + +09/26/06: beazley + Added the lexing position to tokens as an attribute lexpos. This + is the raw index into the input text at which a token appears. + This information can be used to compute column numbers and other + details (e.g., scan backwards from lexpos to the first newline + to get a column position). + +09/25/06: beazley + Changed the name of the __copy__() method on the Lexer class + to clone(). This is used to clone a Lexer object (e.g., if + you're running different lexers at the same time). + +09/21/06: beazley + Limitations related to the use of the re module have been eliminated. + Several users reported problems with regular expressions exceeding + more than 100 named groups.
To solve this, lex.py is now capable + of automatically splitting its master regular expression into + smaller expressions as needed. This should, in theory, make it + possible to specify an arbitrarily large number of tokens. + +09/21/06: beazley + Improved error checking in lex.py. Rules that match the empty string + are now rejected (otherwise they cause the lexer to enter an infinite + loop). An extra check for rules containing '#' has also been added. + Since lex compiles regular expressions in verbose mode, '#' is interpreted + as a regex comment, it is critical to use '\#' instead. + +09/18/06: beazley + Added a @TOKEN decorator function to lex.py that can be used to + define token rules where the documentation string might be computed + in some way. + + digit = r'([0-9])' + nondigit = r'([_A-Za-z])' + identifier = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)' + + from ply.lex import TOKEN + + @TOKEN(identifier) + def t_ID(t): + # Do whatever + + The @TOKEN decorator merely sets the documentation string of the + associated token function as needed for lex to work. + + Note: An alternative solution is the following: + + def t_ID(t): + # Do whatever + + t_ID.__doc__ = identifier + + Note: Decorators require the use of Python 2.4 or later. If compatibility + with old versions is needed, use the latter solution. + + The need for this feature was suggested by Cem Karan. + +09/14/06: beazley + Support for single-character literal tokens has been added to yacc. + These literals must be enclosed in quotes. For example: + + def p_expr(p): + "expr : expr '+' expr" + ... + + def p_expr(p): + 'expr : expr "-" expr' + ... + + In addition to this, it is necessary to tell the lexer module about + literal characters. This is done by defining the variable 'literals' + as a list of characters. This should be defined in the module that + invokes the lex.lex() function.
For example: + + literals = ['+','-','*','/','(',')','='] + + or simply + + literals = '+-*/()=' + + It is important to note that literals can only be a single character. + When the lexer fails to match a token using its normal regular expression + rules, it will check the current character against the literal list. + If found, it will be returned with a token type set to match the literal + character. Otherwise, an illegal character will be signalled. + + +09/14/06: beazley + Modified PLY to install itself as a proper Python package called 'ply'. + This will make it a little more friendly to other modules. This + changes the usage of PLY only slightly. Just do this to import the + modules + + import ply.lex as lex + import ply.yacc as yacc + + Alternatively, you can do this: + + from ply import * + + Which imports both the lex and yacc modules. + Change suggested by Lee June. + +09/13/06: beazley + Changed the handling of negative indices when used in production rules. + A negative production index now accesses already parsed symbols on the + parsing stack. For example, + + def p_foo(p): + "foo: A B C D" + print p[1] # Value of 'A' symbol + print p[2] # Value of 'B' symbol + print p[-1] # Value of whatever symbol appears before A + # on the parsing stack. + + p[0] = some_val # Sets the value of the 'foo' grammar symbol + + This behavior makes it easier to work with embedded actions within the + parsing rules. For example, in C-yacc, it is possible to write code like + this: + + bar: A { printf("seen an A = %d\n", $1); } B { do_stuff; } + + In this example, the printf() code executes immediately after A has been + parsed. Within the embedded action code, $1 refers to the A symbol on + the stack.
+ + To perform this equivalent action in PLY, you need to write a pair + of rules like this: + + def p_bar(p): + "bar : A seen_A B" + do_stuff + + def p_seen_A(p): + "seen_A :" + print "seen an A =", p[-1] + + The second rule "seen_A" is merely an empty production which should be + reduced as soon as A is parsed in the "bar" rule above. The use + of the negative index p[-1] is used to access whatever symbol appeared + before the seen_A symbol. + + This feature also makes it possible to support inherited attributes. + For example: + + def p_decl(p): + "decl : scope name" + + def p_scope(p): + """scope : GLOBAL + | LOCAL""" + p[0] = p[1] + + def p_name(p): + "name : ID" + if p[-1] == "GLOBAL": + # ... + elif p[-1] == "LOCAL": + #... + + In this case, the name rule is inheriting an attribute from the + scope declaration that precedes it. + + *** POTENTIAL INCOMPATIBILITY *** + If you are currently using negative indices within existing grammar rules, + your code will break. This should be extremely rare, if not non-existent, in + most cases. The argument to various grammar rules is usually not + processed in the same way as a list of items. + +Version 2.0 +------------------------------ +09/07/06: beazley + Major cleanup and refactoring of the LR table generation code. Both SLR + and LALR(1) table generation is now performed by the same code base with + only minor extensions for extra LALR(1) processing. + +09/07/06: beazley + Completely reimplemented the entire LALR(1) parsing engine to use the + DeRemer and Pennello algorithm for calculating lookahead sets. This + significantly improves the performance of generating LALR(1) tables + and has the added feature of actually working correctly! If you + experienced weird behavior with LALR(1) in prior releases, this should + hopefully resolve all of those problems. Many thanks to + Andrew Waters and Markus Schoepflin for submitting bug reports + and helping me test out the revised LALR(1) support.
+ +Version 1.8 +------------------------------ +08/02/06: beazley + Fixed a problem related to the handling of default actions in LALR(1) + parsing. If you experienced subtle and/or bizarre behavior when trying + to use the LALR(1) engine, this may correct those problems. Patch + contributed by Russ Cox. Note: This patch has been superseded by + revisions for LALR(1) parsing in Ply-2.0. + +08/02/06: beazley + Added support for slicing of productions in yacc. + Patch contributed by Patrick Mezard. + +Version 1.7 +------------------------------ +03/02/06: beazley + Fixed infinite recursion problem in the ReduceToTerminals() function that + would sometimes come up in LALR(1) table generation. Reported by + Markus Schoepflin. + +03/01/06: beazley + Added "reflags" argument to lex(). For example: + + lex.lex(reflags=re.UNICODE) + + This can be used to specify optional flags to the re.compile() function + used inside the lexer. This may be necessary for special situations such + as processing Unicode (e.g., if you want escapes like \w and \b to consult + the Unicode character property database). The need for this was suggested by + Andreas Jung. + +03/01/06: beazley + Fixed a bug with an uninitialized variable on repeated instantiations of parser + objects when the write_tables=0 argument was used. Reported by Michael Brown. + +03/01/06: beazley + Modified lex.py to accept Unicode strings both as the regular expressions for + tokens and as input. Hopefully this is the only change needed for Unicode support. + Patch contributed by Johan Dahl. + +03/01/06: beazley + Modified the class-based interface to work with new-style or old-style classes. + Patch contributed by Michael Brown (although I tweaked it slightly so it would work + with older versions of Python). + +Version 1.6 +------------------------------ +05/27/05: beazley + Incorporated patch contributed by Christopher Stawarz to fix an extremely + devious bug in LALR(1) parser generation.
This patch should fix problems + numerous people reported with LALR parsing. + +05/27/05: beazley + Fixed problem with lex.py copy constructor. Reported by Dave Aitel, Aaron Lav, + and Thad Austin. + +05/27/05: beazley + Added outputdir option to yacc() to control output directory. Contributed + by Christopher Stawarz. + +05/27/05: beazley + Added rununit.py test script to run tests using the Python unittest module. + Contributed by Miki Tebeka. + +Version 1.5 +------------------------------ +05/26/04: beazley + Major enhancement. LALR(1) parsing support is now working. + This feature was implemented by Elias Ioup (ezioup@alumni.uchicago.edu) + and optimized by David Beazley. To use LALR(1) parsing do + the following: + + yacc.yacc(method="LALR") + + Computing LALR(1) parsing tables takes about twice as long as + the default SLR method. However, LALR(1) allows you to handle + more complex grammars. For example, the ANSI C grammar + (in example/ansic) has 13 shift-reduce conflicts with SLR, but + only has 1 shift-reduce conflict with LALR(1). + +05/20/04: beazley + Added a __len__ method to parser production lists. Can + be used in parser rules like this: + + def p_somerule(p): + """a : B C D + | E F""" + if (len(p) == 3): + # Must have been first rule + elif (len(p) == 2): + # Must be second rule + + Suggested by Joshua Gerth and others. + +Version 1.4 +------------------------------ +04/23/04: beazley + Incorporated a variety of patches contributed by Eric Raymond. + These include: + + 0. Cleans up some comments so they don't wrap on an 80-column display. + 1. Directs compiler errors to stderr where they belong. + 2. Implements and documents automatic line counting when \n is ignored. + 3. Changes the way progress messages are dumped when debugging is on. + The new format is both less verbose and conveys more information than + the old, including shift and reduce actions. + +04/23/04: beazley + Added a Python setup.py file to simplify installation.
Contributed + by Adam Kerrison. + +04/23/04: beazley + Added patches contributed by Adam Kerrison. + + - Some output is now only shown when debugging is enabled. This + means that PLY will be completely silent when not in debugging mode. + + - An optional parameter "write_tables" can be passed to yacc() to + control whether or not parsing tables are written. By default, + it is true, but it can be turned off if you don't want the yacc + table file. Note: disabling this will cause yacc() to regenerate + the parsing table each time. + +04/23/04: beazley + Added patches contributed by David McNab. This patch adds two + features: + + - The parser can be supplied as a class instead of a module. + For an example of this, see the example/classcalc directory. + + - Debugging output can be directed to a filename of the user's + choice. Use + + yacc(debugfile="somefile.out") + + +Version 1.3 +------------------------------ +12/10/02: jmdyck + Various minor adjustments to the code that Dave checked in today. + Updated test/yacc_{inf,unused}.exp to reflect today's changes. + +12/10/02: beazley + Incorporated a variety of minor bug fixes to empty production + handling and infinite recursion checking. Contributed by + Michael Dyck. + +12/10/02: beazley + Removed bogus recover() method call in yacc.restart() + +Version 1.2 +------------------------------ +11/27/02: beazley + Lexer and parser objects are now available as an attribute + of tokens and slices respectively. For example: + + def t_NUMBER(t): + r'\d+' + print t.lexer + + def p_expr_plus(t): + 'expr: expr PLUS expr' + print t.lexer + print t.parser + + This can be used for state management (if needed). + +10/31/02: beazley + Modified yacc.py to work with Python optimize mode. To make + this work, you need to use + + yacc.yacc(optimize=1) + + Furthermore, you need to first run Python in normal mode + to generate the necessary parsetab.py files. After that, + you can use python -O or python -OO.
+ + Note: optimized mode turns off a lot of error checking. + Only use when you are sure that your grammar is working. + Make sure parsetab.py is up to date! + +10/30/02: beazley + Added cloning of Lexer objects. For example: + + import copy + l = lex.lex() + lc = copy.copy(l) + + l.input("Some text") + lc.input("Some other text") + ... + + This might be useful if the same "lexer" is meant to + be used in different contexts---or if multiple lexers + are running concurrently. + +10/30/02: beazley + Fixed subtle bug with first set computation and empty productions. + Patch submitted by Michael Dyck. + +10/30/02: beazley + Fixed error messages to use "filename:line: message" instead + of "filename:line. message". This makes error reporting more + friendly to emacs. Patch submitted by François Pinard. + +10/30/02: beazley + Improvements to parser.out file. Terminals and nonterminals + are sorted instead of being printed in random order. + Patch submitted by François Pinard. + +10/30/02: beazley + Improvements to parser.out file output. Rules are now printed + in a way that's easier to understand. Contributed by Russ Cox. + +10/30/02: beazley + Added 'nonassoc' associativity support. This can be used + to disable the chaining of operators like a < b < c. + To use, simply specify 'nonassoc' in the precedence table + + precedence = ( + ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator + ) + + Patch contributed by Russ Cox. + +10/30/02: beazley + Modified the lexer to provide optional support for Python -O and -OO + modes. To make this work, Python *first* needs to be run in + unoptimized mode. This reads the lexing information and creates a + file "lextab.py". Then, run lex like this: + + # module foo.py + ... + ... 
+ lex.lex(optimize=1) + + Once the lextab file has been created, subsequent calls to + lex.lex() will read data from the lextab file instead of using + introspection. In optimized mode (-O, -OO) everything should + work normally despite the loss of doc strings. + + To change the name of the file 'lextab.py' use the following: + + lex.lex(lextab="footab") + + (this creates a file footab.py) + + +Version 1.1 October 25, 2001 +------------------------------ + +10/25/01: beazley + Modified the table generator to produce much more compact data. + This should greatly reduce the size of the parsetab.py[c] file. + Caveat: the tables still need to be constructed so a little more + work is done in parsetab on import. + +10/25/01: beazley + There may be a possible bug in the cycle detector that reports errors + about infinite recursion. I'm having a little trouble tracking it + down, but if you get this problem, you can disable the cycle + detector as follows: + + yacc.yacc(check_recursion = 0) + +10/25/01: beazley + Fixed a bug in lex.py that sometimes caused illegal characters to be + reported incorrectly. Reported by Sverre Jørgensen. + +7/8/01 : beazley + Added a reference to the underlying lexer object when tokens are handled by + functions. The lexer is available as the 'lexer' attribute. This + was added to provide better lexing support for languages such as Fortran + where certain types of tokens can't be conveniently expressed as regular + expressions (and where the tokenizing function may want to perform a + little backtracking). Suggested by Pearu Peterson. + +6/20/01 : beazley + Modified yacc() function so that an optional starting symbol can be specified. + For example: + + yacc.yacc(start="statement") + + Normally yacc always treats the first production rule as the starting symbol. + However, if you are debugging your grammar it may be useful to specify + an alternative starting symbol. Idea suggested by Rich Salz. 
+ +Version 1.0 June 18, 2001 +-------------------------- +Initial public offering + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..0d37431b0 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +recursive-include example * +recursive-include doc * +recursive-include test * +include ANNOUNCE +include README.md +include CHANGES +include TODO +global-exclude *.pyc diff --git a/README.md b/README.md new file mode 100644 index 000000000..75635832b --- /dev/null +++ b/README.md @@ -0,0 +1,273 @@ +PLY (Python Lex-Yacc) Version 3.10 + +Copyright (C) 2001-2016 +David M. Beazley (Dabeaz LLC) +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the David Beazley or Dabeaz LLC may be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Introduction +============ + +PLY is a 100% Python implementation of the common parsing tools lex +and yacc. Here are a few highlights: + + - PLY is very closely modeled after traditional lex/yacc. + If you know how to use these tools in C, you will find PLY + to be similar. + + - PLY provides *very* extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + + - PLY provides full support for empty productions, error recovery, + precedence specifiers, and moderately ambiguous grammars. + + - Parsing is based on LR-parsing which is fast, memory efficient, + better suited to large grammars, and which has a number of nice + properties when dealing with syntax errors and other parsing problems. + Currently, PLY builds its parsing tables using the LALR(1) + algorithm used in yacc. + + - PLY uses Python introspection features to build lexers and parsers. + This greatly simplifies the task of parser construction since it reduces + the number of files and eliminates the need to run a separate lex/yacc + tool before running your program. + + - PLY can be used to build parsers for "real" programming languages. 
+ Although it is not ultra-fast due to its Python implementation, + PLY can be used to parse grammars consisting of several hundred + rules (as might be found for a language like C). The lexer and LR + parser are also reasonably efficient when parsing typically + sized programs. People have used PLY to build parsers for + C, C++, ADA, and other real programming languages. + +How to Use +========== + +PLY consists of two files : lex.py and yacc.py. These are contained +within the 'ply' directory which may also be used as a Python package. +To use PLY, simply copy the 'ply' directory to your project and import +lex and yacc from the associated 'ply' package. For example: + + import ply.lex as lex + import ply.yacc as yacc + +Alternatively, you can copy just the files lex.py and yacc.py +individually and use them as modules. For example: + + import lex + import yacc + +The file setup.py can be used to install ply using distutils. + +The file doc/ply.html contains complete documentation on how to use +the system. + +The example directory contains several different examples including a +PLY specification for ANSI C as given in K&R 2nd Ed. + +A simple example is found at the end of this document + +Requirements +============ +PLY requires the use of Python 2.6 or greater. However, you should +use the latest Python release if possible. It should work on just +about any platform. PLY has been tested with both CPython and Jython. +It also seems to work with IronPython. + +Resources +========= +More information about PLY can be obtained on the PLY webpage at: + + http://www.dabeaz.com/ply + +For a detailed overview of parsing theory, consult the excellent +book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and +Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown +may also be useful. 
+ +The GitHub page for PLY can be found at: + + https://github.com/dabeaz/ply + +An old and relatively inactive discussion group for PLY is found at: + + http://groups.google.com/group/ply-hack + +Acknowledgments +=============== +A special thanks is in order for all of the students in CS326 who +suffered through about 25 different versions of these tools :-). + +The CHANGES file acknowledges those who have contributed patches. + +Elias Ioup did the first implementation of LALR(1) parsing in PLY-1.x. +Andrew Waters and Markus Schoepflin were instrumental in reporting bugs +and testing a revised LALR(1) implementation for PLY-2.0. + +Special Note for PLY-3.0 +======================== +PLY-3.0 is the first PLY release to support Python 3. However, backwards +compatibility with Python 2.6 is still preserved. PLY provides dual +Python 2/3 compatibility by restricting its implementation to a common +subset of basic language features. You should not convert PLY using +2to3--it is not necessary and may in fact break the implementation. + +Example +======= + +Here is a simple example showing a PLY implementation of a calculator +with variables. + + # ----------------------------------------------------------------------------- + # calc.py + # + # A simple calculator with variables. 
+ # ----------------------------------------------------------------------------- + + tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + + # Tokens + + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_EQUALS = r'=' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Ignored characters + t_ignore = " \t" + + def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + import ply.lex as lex + lex.lex() + + # Precedence rules for the arithmetic operators + precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + + # dictionary of names (for storing variables) + names = { } + + def p_statement_assign(p): + 'statement : NAME EQUALS expression' + names[p[1]] = p[3] + + def p_statement_expr(p): + 'statement : expression' + print(p[1]) + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if p[2] == '+' : p[0] = p[1] + p[3] + elif p[2] == '-': p[0] = p[1] - p[3] + elif p[2] == '*': p[0] = p[1] * p[3] + elif p[2] == '/': p[0] = p[1] / p[3] + + def p_expression_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = p[2] + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = p[1] + + def p_expression_name(p): + 'expression : NAME' + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + def p_error(p): + print("Syntax error at '%s'" % p.value) + + import ply.yacc as yacc + yacc.yacc() + + while True: + try: + s = raw_input('calc > ') # use input() on Python 3 + except EOFError: + 
+ break + yacc.parse(s) + + +Bug Reports and Patches +======================= +My goal with PLY is to simply have a decent lex/yacc implementation +for Python. As a general rule, I don't spend huge amounts of time +working on it unless I receive very specific bug reports and/or +patches to fix problems. I also try to incorporate submitted feature +requests and enhancements into each new version. Please visit the PLY +github page at https://github.com/dabeaz/ply to submit issues and pull +requests. To contact me about bugs and/or new features, please send +email to dave@dabeaz.com. + +-- Dave + + + + + + + + + diff --git a/TODO b/TODO new file mode 100644 index 000000000..f4800aacf --- /dev/null +++ b/TODO @@ -0,0 +1,16 @@ +The PLY to-do list: + +1. Finish writing the C Preprocessor module. Started in the + file ply/cpp.py + +2. Create and document libraries of useful tokens. + +3. Expand the examples/yply tool that parses bison/yacc + files. + +4. Think of various diabolical things to do with the + new yacc internals. For example, it is now possible + to specify grammars using completely different schemes + than the reflection approach used by PLY. + + diff --git a/doc/internal.html b/doc/internal.html new file mode 100644 index 000000000..f73bc43f2 --- /dev/null +++ b/doc/internal.html @@ -0,0 +1,874 @@ + + +PLY Internals + + + +

PLY Internals

+ + +David M. Beazley
+dave@dabeaz.com
+
+ +

+PLY Version: 3.10 +

+ + +

+ + + +

1. Introduction

+ + +This document describes classes and functions that make up the internal +operation of PLY. Using this programming interface, it is possible to +manually build a parser using a different interface specification +than what PLY normally uses. For example, you could build a grammar +from information parsed in a completely different input format. Some of +these objects may be useful for building more advanced parsing engines +such as GLR. + +

+It should be stressed that using PLY at this level is not for the +faint of heart. Generally, it's assumed that you know a bit of +the underlying compiler theory and how an LR parser is put together. + +

2. Grammar Class

+ + +The file ply.yacc defines a class Grammar that +is used to hold and manipulate information about a grammar +specification. It encapsulates the same basic information +about a grammar that is put into a YACC file including +the list of tokens, precedence rules, and grammar rules. +Various operations are provided to perform different validations +on the grammar. In addition, there are operations to compute +the first and follow sets that are needed by the various table +generation algorithms. + +

+Grammar(terminals) + +

+Creates a new grammar object. terminals is a list of strings +specifying the terminals for the grammar. An instance g of +Grammar has the following methods: +
+ +

+g.set_precedence(term,assoc,level) +

+Sets the precedence level and associativity for a given terminal term. +assoc is one of 'right', +'left', or 'nonassoc' and level is a positive integer. The higher +the value of level, the higher the precedence. Here is an example of typical +precedence settings: + +
+g.set_precedence('PLUS',  'left',1)
+g.set_precedence('MINUS', 'left',1)
+g.set_precedence('TIMES', 'left',2)
+g.set_precedence('DIVIDE','left',2)
+g.set_precedence('UMINUS','left',3)
+
+ +This method must be called prior to adding any productions to the +grammar with g.add_production(). The precedence of individual grammar +rules is determined by the precedence of the right-most terminal. + +
+

+g.add_production(name,syms,func=None,file='',line=0) +

+Adds a new grammar rule. name is the name of the rule, +syms is a list of symbols making up the right hand +side of the rule, func is the function to call when +reducing the rule. file and line specify +the filename and line number of the rule and are used for +generating error messages. + +

+The list of symbols in syms may include character +literals and %prec specifiers. Here are some +examples: + +

+g.add_production('expr',['expr','PLUS','term'],func,file,line)
+g.add_production('expr',['expr','"+"','term'],func,file,line)
+g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line)
+
+ +

+If any kind of error is detected, a GrammarError exception +is raised with a message indicating the reason for the failure. +

+ +

+g.set_start(start=None) +

+Sets the starting rule for the grammar. start is a string +specifying the name of the start rule. If start is omitted, +the first grammar rule added with add_production() is taken to be +the starting rule. This method must always be called after all +productions have been added. +
+ +

+g.find_unreachable() +

+Diagnostic function. Returns a list of all unreachable non-terminals +defined in the grammar. This is used to identify inactive parts of +the grammar specification. +
+ +

+g.infinite_cycle() +

+Diagnostic function. Returns a list of all non-terminals in the +grammar that result in an infinite cycle. This condition occurs if +there is no way for a grammar rule to expand to a string containing +only terminal symbols. +
+ +

+g.undefined_symbols() +

+Diagnostic function. Returns a list of tuples (name, prod) +corresponding to undefined symbols in the grammar. name is the +name of the undefined symbol and prod is an instance of +Production which has information about the production rule +where the undefined symbol was used. +
+ +

+g.unused_terminals() +

+Diagnostic function. Returns a list of terminals that were defined, +but never used in the grammar. +
+ +

+g.unused_rules() +

+Diagnostic function. Returns a list of Production instances +corresponding to production rules that were defined in the grammar, +but never used anywhere. This is slightly different +than find_unreachable(). +
+ +

+g.unused_precedence() +

+Diagnostic function. Returns a list of tuples (term, assoc) +corresponding to precedence rules that were set, but never used in the +grammar. term is the terminal name and assoc is the +precedence associativity (e.g., 'left', 'right', +or 'nonassoc'). +
+ +

+g.compute_first() +

+Compute all of the first sets for all symbols in the grammar. Returns a dictionary +mapping symbol names to a list of all first symbols. +
+ +

+g.compute_follow() +

+Compute all of the follow sets for all non-terminals in the grammar. +The follow set is the set of all possible symbols that might follow a +given non-terminal. Returns a dictionary mapping non-terminal names +to a list of symbols. +
+ +

+g.build_lritems() +

+Calculates all of the LR items for all productions in the grammar. This +step is required before using the grammar for any kind of table generation. +See the section on LR items below. +
+ +

+The following attributes are set by the above methods and may be useful +in code that works with the grammar. All of these attributes should be +assumed to be read-only. Changing their values directly will likely +break the grammar. + +

+g.Productions +

+A list of all productions added. The first entry is reserved for +a production representing the starting rule. The objects in this list +are instances of the Production class, described shortly. +
+ +

+g.Prodnames +

+A dictionary mapping the names of nonterminals to a list of all +productions of that nonterminal. +
+ +

+g.Terminals +

+A dictionary mapping the names of terminals to a list of the +production numbers where they are used. +
+ +

+g.Nonterminals +

+A dictionary mapping the names of nonterminals to a list of the +production numbers where they are used. +
+ +

+g.First +

+A dictionary representing the first sets for all grammar symbols. This is +computed and returned by the compute_first() method. +
+ +

+g.Follow +

+A dictionary representing the follow sets for all grammar rules. This is +computed and returned by the compute_follow() method. +
+ +

+g.Start +

+Starting symbol for the grammar. Set by the set_start() method. +
+ +For the purposes of debugging, a Grammar object supports the __len__() and +__getitem__() special methods. Accessing g[n] returns the nth production +from the grammar. + + +

3. Productions

+ + +Grammar objects store grammar rules as instances of a Production class. This +class has no public constructor--you should only create productions by calling Grammar.add_production(). +The following attributes are available on a Production instance p. + +

+p.name +

+The name of the production. For a grammar rule such as A : B C D, this is 'A'. +
+ +

+p.prod +

+A tuple of symbols making up the right-hand side of the production. For a grammar rule such as A : B C D, this is ('B','C','D'). +
+ +

+p.number +

+Production number. An integer containing the index of the production in the grammar's Productions list. +
+ +

+p.func +

+The name of the reduction function associated with the production. +This is the function that will execute when reducing the entire +grammar rule during parsing. +
+ +

+p.callable +

+The callable object associated with the name in p.func. This is None +unless the production has been bound using bind(). +
+ +

+p.file +

+Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. +
+ +

+p.lineno +

+Line number associated with the production. Typically this is the line number in p.file where the production was defined. Used for error messages. +
+ +

+p.prec +

+Precedence and associativity associated with the production. This is a tuple (assoc,level) where +assoc is one of 'left','right', or 'nonassoc' and level is +an integer. This value is determined by the precedence of the right-most terminal symbol in the production +or by use of the %prec specifier when adding the production. +
+ +

+p.usyms +

+A list of all unique symbols found in the production. +
+ +

+p.lr_items +

+A list of all LR items for this production. This attribute only has a meaningful value if the +Grammar.build_lritems() method has been called. The items in this list are +instances of LRItem described below. +
+ +

+p.lr_next +

+The head of a linked-list representation of the LR items in p.lr_items. +This attribute only has a meaningful value if the Grammar.build_lritems() +method has been called. Each LRItem instance has a lr_next attribute +to move to the next item. The list is terminated by None. +
+ +

+p.bind(dict) +

+Binds the production function name in p.func to a callable object in +dict. This operation is typically carried out in the last step +prior to running the parsing engine and is needed since parsing tables are typically +read from files which only include the function names, not the functions themselves. +
+ +

+Production objects support +the __len__(), __getitem__(), and __str__() +special methods. +len(p) returns the number of symbols in p.prod +and p[n] is the same as p.prod[n]. + +

4. LRItems

+ + +The construction of parsing tables in an LR-based parser generator is primarily +done over a set of "LR Items". An LR item represents a stage of parsing one +of the grammar rules. To compute the LR items, it is first necessary to +call Grammar.build_lritems(). Once this step is complete, all of the productions +in the grammar will have their LR items attached to them. + +

+Here is an interactive example that shows what LR items look like if you +interactively experiment. In this example, g is a Grammar +object. + +

+
+>>> g.build_lritems()
+>>> p = g[1]
+>>> p
+Production(statement -> ID = expr)
+>>>
+
+
+ +In the above code, p represents the first grammar rule. In +this case, a rule 'statement -> ID = expr'. + +

+Now, let's look at the LR items for p. + +

+
+>>> p.lr_items
+[LRItem(statement -> . ID = expr), 
+ LRItem(statement -> ID . = expr), 
+ LRItem(statement -> ID = . expr), 
+ LRItem(statement -> ID = expr .)]
+>>>
+
+
+ +In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot +is advanced by one symbol. It is only when the dot reaches the very end that a production +is successfully parsed. + +

+An instance lr of LRItem has the following +attributes that hold information related to that specific stage of +parsing. + +

+lr.name +

+The name of the grammar rule. For example, 'statement' in the above example. +
+ +

+lr.prod +

+A tuple of symbols representing the right-hand side of the production, including the +special '.' character. For example, ('ID','.','=','expr'). +
+ +

+lr.number +

+An integer representing the production number in the grammar. +
+ +

+lr.usyms +

+A set of unique symbols in the production. Inherited from the original Production instance. +
+ +

+lr.lr_index +

+An integer representing the position of the dot (.). You should never use lr.prod.index() +to search for it--the result will be wrong if the grammar happens to also use (.) as a character +literal. +
+ +

+lr.lr_after +

+A list of all productions that can legally appear immediately to the right of the +dot (.). This list contains Production instances. This attribute +represents all of the possible branches a parse can take from the current position. +For example, suppose that lr represents a stage immediately before +an expression like this: + +
+>>> lr
+LRItem(statement -> ID = . expr)
+>>>
+
+ +Then, the value of lr.lr_after might look like this, showing all productions that +can legally appear next: + +
+>>> lr.lr_after
+[Production(expr -> expr PLUS expr), 
+ Production(expr -> expr MINUS expr), 
+ Production(expr -> expr TIMES expr), 
+ Production(expr -> expr DIVIDE expr), 
+ Production(expr -> MINUS expr), 
+ Production(expr -> LPAREN expr RPAREN), 
+ Production(expr -> NUMBER), 
+ Production(expr -> ID)]
+>>>
+
+ +
+ +

+lr.lr_before +

+The grammar symbol that appears immediately before the dot (.) or None if +at the beginning of the parse. +
+ +

+lr.lr_next +

+A link to the next LR item, representing the next stage of the parse. None if lr +is the last LR item. +
+ +LRItem instances also support the __len__() and __getitem__() special methods. +len(lr) returns the number of items in lr.prod including the dot (.). lr[n] +returns lr.prod[n]. + +

+It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +

5. LRTable

+ + +The LRTable class is used to represent LR parsing table data. This +minimally includes the production list, action table, and goto table. + +

+LRTable() +

+Create an empty LRTable object. This object contains only the information needed to +run an LR parser. +
+ +An instance lrtab of LRTable has the following methods: + +

+lrtab.read_table(module) +

+Populates the LR table with information from the module specified in module. +module is either a module object already loaded with import or +the name of a Python module. If it's a string containing a module name, it is +loaded and parsing data is extracted. Returns the signature value that was used +when initially writing the tables. Raises a VersionError exception if +the module was created using an incompatible version of PLY. +
+ +

+lrtab.bind_callables(dict) +

+This binds all of the function names used in productions to callable objects +found in the dictionary dict. During table generation and when reading +LR tables from files, PLY only uses the names of action functions such as 'p_expr', +'p_statement', etc. In order to actually run the parser, these names +have to be bound to callable objects. This method is always called prior to +running a parser. +
+ +After lrtab has been populated, the following attributes are defined. + +

+lrtab.lr_method +

+The LR parsing method used (e.g., 'LALR') +
+ + +

+lrtab.lr_productions +

+The production list. If the parsing tables have been newly +constructed, this will be a list of Production instances. If +the parsing tables have been read from a file, it's a list +of MiniProduction instances. This, together +with lr_action and lr_goto contain all of the +information needed by the LR parsing engine. +
+ +

+lrtab.lr_action +

+The LR action dictionary that implements the underlying state machine. +The keys of this dictionary are the LR states. +
+ +

+lrtab.lr_goto +

+The LR goto table that contains information about grammar rule reductions. +
+ + +

6. LRGeneratedTable

+ + +The LRGeneratedTable class represents constructed LR parsing tables on a +grammar. It is a subclass of LRTable. + +

+LRGeneratedTable(grammar, method='LALR',log=None) +

+Create the LR parsing tables on a grammar. grammar is an instance of Grammar, +method is a string with the parsing method ('SLR' or 'LALR'), and +log is a logger object used to write debugging information. The debugging information +written to log is the same as what appears in the parser.out file created +by yacc. By supplying a custom logger with a different message format, it is possible to get +more information (e.g., the line number in yacc.py used for issuing each line of +output in the log). The result is an instance of LRGeneratedTable. +
+ +

+An instance lr of LRGeneratedTable has the following attributes. + +

+lr.grammar +

+A link to the Grammar object used to construct the parsing tables. +
+ +

+lr.lr_method +

+The LR parsing method used (e.g., 'LALR') +
+ + +

+lr.lr_productions +

+A reference to grammar.Productions. This, together with lr_action and lr_goto +contain all of the information needed by the LR parsing engine. +
+ +

+lr.lr_action +

+The LR action dictionary that implements the underlying state machine. The keys of this dictionary are +the LR states. +
+ +

+lr.lr_goto +

+The LR goto table that contains information about grammar rule reductions. +
+ +

+lr.sr_conflicts +

+A list of tuples (state,token,resolution) identifying all shift/reduce conflicts. state is the LR state +number where the conflict occurred, token is the token causing the conflict, and resolution is +a string describing the resolution taken. resolution is either 'shift' or 'reduce'. +
+ +

+lr.rr_conflicts +

+A list of tuples (state,rule,rejected) identifying all reduce/reduce conflicts. state is the +LR state number where the conflict occurred, rule is the production rule that was selected +and rejected is the production rule that was rejected. Both rule and rejected are +instances of Production. They can be inspected to provide the user with more information. +
+ +

+There are two public methods of LRGeneratedTable. + +

+lr.write_table(modulename,outputdir="",signature="") +

+Writes the LR parsing table information to a Python module. modulename is a string +specifying the name of a module such as "parsetab". outputdir is the name of a +directory where the module should be created. signature is a string representing a +grammar signature that's written into the output file. This can be used to detect when +the data stored in a module file is out-of-sync with the grammar specification (and that +the tables need to be regenerated). If modulename is a string "parsetab", +this function creates a file called parsetab.py. If the module name represents a +package such as "foo.bar.parsetab", then only the last component, "parsetab" is +used. +
+ + +

7. LRParser

+ + +The LRParser class implements the low-level LR parsing engine. + + +

+LRParser(lrtab, error_func) +

+Create an LRParser. lrtab is an instance of LRTable +containing the LR production and state tables. error_func is the +error function to invoke in the event of a parsing error. +
+ +An instance p of LRParser has the following methods: + +

+p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None) +

+Run the parser. input is a string, which if supplied is fed into the +lexer using its input() method. lexer is an instance of the +Lexer class to use for tokenizing. If not supplied, the last lexer +created with the lex module is used. debug is a boolean flag +that enables debugging. tracking is a boolean flag that tells the +parser to perform additional line number tracking. tokenfunc is a callable +function that returns the next token. If supplied, the parser will use it to get +all tokens. +
+ +

+p.restart() +

+Resets the parser state for a parse already in progress. +
+ +

8. ParserReflect

+ + +

+The ParserReflect class is used to collect parser specification data +from a Python module or object. This class is what collects all of the +p_rule() functions in a PLY file, performs basic error checking, +and collects all of the needed information to build a grammar. Most of the +high-level PLY interface as used by the yacc() function is actually +implemented by this class. + +

+ParserReflect(pdict, log=None) +

+Creates a ParserReflect instance. pdict is a dictionary +containing parser specification data. This dictionary typically corresponds +to the module or class dictionary of code that implements a PLY parser. +log is a logger instance that will be used to report error +messages. +
+ +An instance p of ParserReflect has the following methods: + +

+p.get_all() +

+Collect and store all required parsing information. +
+ +

+p.validate_all() +

+Validate all of the collected parsing information. This is a separate step +from p.get_all() as a performance optimization. In order to +reduce parser start-up time, a parser can elect to only validate the +parsing data when regenerating the parsing tables. The validation +step tries to collect as much information as possible rather than +raising an exception at the first sign of trouble. The attribute +p.error is set if there are any validation errors. The +value of this attribute is also returned. +
+ +

+p.signature() +

+Compute a signature representing the contents of the collected parsing +data. The signature value should change if anything in the parser +specification has changed in a way that would justify parser table +regeneration. This method can be called after p.get_all(), +but before p.validate_all(). +
+ +The following attributes are set in the process of collecting data: + +

+p.start +

+The grammar start symbol, if any. Taken from pdict['start']. +
+ +

+p.error_func +

+The error handling function or None. Taken from pdict['p_error']. +
+ +

+p.tokens +

+The token list. Taken from pdict['tokens']. +
+ +

+p.prec +

+The precedence specifier. Taken from pdict['precedence']. +
+ +

+p.preclist +

+A parsed version of the precedence specified. A list of tuples of the form +(token,assoc,level) where token is the terminal symbol, +assoc is the associativity (e.g., 'left') and level +is a numeric precedence level. +
+ +

+p.grammar +

+A list of tuples (name, rules) representing the grammar rules. name is the +name of a Python function or method in pdict that starts with "p_". +rules is a list of tuples (filename,line,prodname,syms) representing +the grammar rules found in the documentation string of that function. filename and line contain location +information that can be used for debugging. prodname is the name of the +production. syms is the right-hand side of the production. If you have a +function like this + +
+def p_expr(p):
+    '''expr : expr PLUS expr
+            | expr MINUS expr
+            | expr TIMES expr
+            | expr DIVIDE expr'''
+
+ +then the corresponding entry in p.grammar might look like this: + +
+('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']),
+             ('calc.py',11,'expr', ['expr','MINUS','expr']),
+             ('calc.py',12,'expr', ['expr','TIMES','expr']),
+             ('calc.py',13,'expr', ['expr','DIVIDE','expr'])
+           ])
+
+
+ +

+p.pfuncs +

+A sorted list of tuples (line, file, name, doc) representing all of +the p_ functions found. line and file give location +information. name is the name of the function. doc is the +documentation string. This list is sorted in ascending order by line number. +
+ +

+p.files +

+A dictionary holding all of the source filenames that were encountered +while collecting parser information. Only the keys of this dictionary have +any meaning. +
+ +

+p.error +

+An attribute that indicates whether or not any critical errors +occurred in validation. If this is set, it means that some kind +of problem was detected and that no further processing should be +performed. +
+ + +

9. High-level operation

+ + +Using all of the above classes requires some attention to detail. The yacc() +function carries out a very specific sequence of operations to create a grammar. +This same sequence should be emulated if you build an alternative PLY interface. + +
    +
  1. A ParserReflect object is created and raw grammar specification data is +collected. +
  2. A Grammar object is created and populated with information +from the specification data. +
  3. A LRGenerator object is created to run the LALR algorithm over +the Grammar object. +
  4. Productions in the LRGenerator are bound to callables using the bind_callables() +method. +
  5. A LRParser object is created from the information in the +LRGenerator object. +
+ + + + + + + + + + diff --git a/doc/makedoc.py b/doc/makedoc.py new file mode 100644 index 000000000..415a53aa0 --- /dev/null +++ b/doc/makedoc.py @@ -0,0 +1,194 @@ +#!/usr/local/bin/python + +############################################################################### +# Takes a chapter as input and adds internal links and numbering to all +# of the H1, H2, H3, H4 and H5 sections. +# +# Every heading HTML tag (H1, H2 etc) is given an autogenerated name to link +# to. However, if the name is not an autogenerated name from a previous run, +# it will be kept. If it is autogenerated, it might change on subsequent runs +# of this program. Thus if you want to create links to one of the headings, +# then change the heading link name to something that does not look like an +# autogenerated link name. +############################################################################### + +import sys +import re +import string + +############################################################################### +# Functions +############################################################################### + +# Regexs for +alink = re.compile(r"", re.IGNORECASE) +heading = re.compile(r"(_nn\d)", re.IGNORECASE) + +def getheadingname(m): + autogeneratedheading = True; + if m.group(1) != None: + amatch = alink.match(m.group(1)) + if amatch: + # A non-autogenerated heading - keep it + headingname = amatch.group(1) + autogeneratedheading = heading.match(headingname) + if autogeneratedheading: + # The heading name was either non-existent or autogenerated, + # We can create a new heading / change the existing heading + headingname = "%s_nn%d" % (filenamebase, nameindex) + return headingname + +############################################################################### +# Main program +############################################################################### + +if len(sys.argv) != 2: + print "usage: makedoc.py filename" + sys.exit(1) + +filename = sys.argv[1] +filenamebase = 
string.split(filename,".")[0] + +section = 0 +subsection = 0 +subsubsection = 0 +subsubsubsection = 0 +nameindex = 0 + +name = "" + +# Regexs for

,...

sections + +h1 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) +h2 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) +h3 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) +h4 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) +h5 = re.compile(r".*?
()*[\d\.\s]*(.*?)
", re.IGNORECASE) + +data = open(filename).read() # Read data +open(filename+".bak","w").write(data) # Make backup + +lines = data.splitlines() +result = [ ] # This is the result of postprocessing the file +index = "\n
\n" # index contains the index for adding at the top of the file. Also printed to stdout. + +skip = 0 +skipspace = 0 + +for s in lines: + if s == "": + if not skip: + result.append("@INDEX@") + skip = 1 + else: + skip = 0 + continue; + if skip: + continue + + if not s and skipspace: + continue + + if skipspace: + result.append("") + result.append("") + skipspace = 0 + + m = h2.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + section += 1 + headingname = getheadingname(m) + result.append("""

%d. %s

""" % (headingname,section, prevheadingtext)) + + if subsubsubsection: + index += "\n" + if subsubsection: + index += "\n" + if subsection: + index += "\n" + if section == 1: + index += "
    \n" + + index += """
  • %s\n""" % (headingname,prevheadingtext) + subsection = 0 + subsubsection = 0 + subsubsubsection = 0 + skipspace = 1 + continue + m = h3.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsection += 1 + headingname = getheadingname(m) + result.append("""

    %d.%d %s

    """ % (headingname,section, subsection, prevheadingtext)) + + if subsubsubsection: + index += "
\n" + if subsubsection: + index += "\n" + if subsection == 1: + index += "
    \n" + + index += """
  • %s\n""" % (headingname,prevheadingtext) + subsubsection = 0 + skipspace = 1 + continue + m = h4.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsubsection += 1 + subsubsubsection = 0 + headingname = getheadingname(m) + result.append("""

    %d.%d.%d %s

    """ % (headingname,section, subsection, subsubsection, prevheadingtext)) + + if subsubsubsection: + index += "
\n" + if subsubsection == 1: + index += "
    \n" + + index += """
  • %s\n""" % (headingname,prevheadingtext) + skipspace = 1 + continue + m = h5.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsubsubsection += 1 + headingname = getheadingname(m) + result.append("""
    %d.%d.%d.%d %s
    """ % (headingname,section, subsection, subsubsection, subsubsubsection, prevheadingtext)) + + if subsubsubsection == 1: + index += "
      \n" + + index += """
    • %s\n""" % (headingname,prevheadingtext) + skipspace = 1 + continue + + result.append(s) + +if subsubsubsection: + index += "
    \n" + +if subsubsection: + index += "
\n" + +if subsection: + index += "\n" + +if section: + index += "\n" + +index += "
\n\n" + +data = "\n".join(result) + +data = data.replace("@INDEX@",index) + "\n"; + +# Write the file back out +open(filename,"w").write(data) + + diff --git a/doc/ply.html b/doc/ply.html new file mode 100644 index 000000000..0b232b2c6 --- /dev/null +++ b/doc/ply.html @@ -0,0 +1,3496 @@ + + +PLY (Python Lex-Yacc) + + + +

PLY (Python Lex-Yacc)

+ + +David M. Beazley
+dave@dabeaz.com
+
+ +

+PLY Version: 3.10 +

+ + +

+ + + + + + + +

1. Preface and Requirements

+ + +

+This document provides an overview of lexing and parsing with PLY. +Given the intrinsic complexity of parsing, I would strongly advise +that you read (or at least skim) this entire document before jumping +into a big development project with PLY. +

+ +

+PLY-3.5 is compatible with both Python 2 and Python 3. If you are using +Python 2, you have to use Python 2.6 or newer. +

+ +

2. Introduction

+ + +PLY is a pure-Python implementation of the popular compiler +construction tools lex and yacc. The main goal of PLY is to stay +fairly faithful to the way in which traditional lex/yacc tools work. +This includes supporting LALR(1) parsing as well as providing +extensive input validation, error reporting, and diagnostics. Thus, +if you've used yacc in another programming language, it should be +relatively straightforward to use PLY. + +

+Early versions of PLY were developed to support an Introduction to +Compilers Course I taught in 2001 at the University of Chicago. +Since PLY was primarily developed as an instructional tool, you will +find it to be fairly picky about token and grammar rule +specification. In part, this +added formality is meant to catch common programming mistakes made by +novice users. However, advanced users will also find such features to +be useful when building complicated grammars for real programming +languages. It should also be noted that PLY does not provide much in +the way of bells and whistles (e.g., automatic construction of +abstract syntax trees, tree traversal, etc.). Nor would I consider it +to be a parsing framework. Instead, you will find a bare-bones, yet +fully capable lex/yacc implementation written entirely in Python. + +

+The rest of this document assumes that you are somewhat familiar with +parsing theory, syntax directed translation, and the use of compiler +construction tools such as lex and yacc in other programming +languages. If you are unfamiliar with these topics, you will probably +want to consult an introductory text such as "Compilers: Principles, +Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex +and Yacc" by John Levine may also be handy. In fact, the O'Reilly book can be +used as a reference for PLY as the concepts are virtually identical. + +

3. PLY Overview

+ + +

+PLY consists of two separate modules: lex.py and +yacc.py, both of which are found in a Python package +called ply. The lex.py module is used to break input text into a +collection of tokens specified by a collection of regular expression +rules. yacc.py is used to recognize language syntax that has +been specified in the form of a context free grammar. +

+ +

+The two tools are meant to work together. Specifically, +lex.py provides an external interface in the form of a +token() function that returns the next valid token on the +input stream. yacc.py calls this repeatedly to retrieve +tokens and invoke grammar rules. The output of yacc.py is +often an Abstract Syntax Tree (AST). However, this is entirely up to +the user. If desired, yacc.py can also be used to implement +simple one-pass compilers. + +

+Like its Unix counterpart, yacc.py provides most of the +features you expect including extensive error checking, grammar +validation, support for empty productions, error tokens, and ambiguity +resolution via precedence rules. In fact, almost everything that is possible in traditional yacc +should be supported in PLY. + +

+The primary difference between +yacc.py and Unix yacc is that yacc.py +doesn't involve a separate code-generation process. +Instead, PLY relies on reflection (introspection) +to build its lexers and parsers. Unlike traditional lex/yacc which +require a special input file that is converted into a separate source +file, the specifications given to PLY are valid Python +programs. This means that there are no extra source files nor is +there a special compiler construction step (e.g., running yacc to +generate Python code for the compiler). Since the generation of the +parsing tables is relatively expensive, PLY caches the results and +saves them to a file. If no changes are detected in the input source, +the tables are read from the cache. Otherwise, they are regenerated. + +

4. Lex

+ + +lex.py is used to tokenize an input string. For example, suppose +you're writing a programming language and a user supplied the following input string: + +
+
+x = 3 + 42 * (s - t)
+
+
+ +A tokenizer splits the string into individual tokens + +
+
+'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')'
+
+
+ +Tokens are usually given names to indicate what they are. For example: + +
+
+'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES',
+'LPAREN','ID','MINUS','ID','RPAREN'
+
+
+ +More specifically, the input is broken into pairs of token types and values. For example: + +
+
+('ID','x'), ('EQUALS','='), ('NUMBER','3'), 
+('PLUS','+'), ('NUMBER','42'), ('TIMES','*'),
+('LPAREN','('), ('ID','s'), ('MINUS','-'),
+('ID','t'), ('RPAREN',')')
+
+
+ +The identification of tokens is typically done by writing a series of regular expression +rules. The next section shows how this is done using lex.py. + +

4.1 Lex Example

+ + +The following example shows how lex.py is used to write a simple tokenizer. + +
+
+# ------------------------------------------------------------
+# calclex.py
+#
+# tokenizer for a simple expression evaluator for
+# numbers and +,-,*,/
+# ------------------------------------------------------------
+import ply.lex as lex
+
+# List of token names.   This is always required
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+
+# Regular expression rules for simple tokens
+t_PLUS    = r'\+'
+t_MINUS   = r'-'
+t_TIMES   = r'\*'
+t_DIVIDE  = r'/'
+t_LPAREN  = r'\('
+t_RPAREN  = r'\)'
+
+# A regular expression rule with some action code
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)    
+    return t
+
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+# A string containing ignored characters (spaces and tabs)
+t_ignore  = ' \t'
+
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+
+# Build the lexer
+lexer = lex.lex()
+
+
+
+To use the lexer, you first need to feed it some input text using +its input() method. After that, repeated calls +to token() produce tokens. The following code shows how this +works: + +
+
+
+# Test it out
+data = '''
+3 + 4 * 10
+  + -20 *2
+'''
+
+# Give the lexer some input
+lexer.input(data)
+
+# Tokenize
+while True:
+    tok = lexer.token()
+    if not tok: 
+        break      # No more input
+    print(tok)
+
+
+ +When executed, the example will produce the following output: + +
+
+$ python example.py
+LexToken(NUMBER,3,2,1)
+LexToken(PLUS,'+',2,3)
+LexToken(NUMBER,4,2,5)
+LexToken(TIMES,'*',2,7)
+LexToken(NUMBER,10,2,10)
+LexToken(PLUS,'+',3,14)
+LexToken(MINUS,'-',3,16)
+LexToken(NUMBER,20,3,18)
+LexToken(TIMES,'*',3,20)
+LexToken(NUMBER,2,3,21)
+
+
+ +Lexers also support the iteration protocol. So, you can write the above loop as follows: + +
+
+for tok in lexer:
+    print(tok)
+
+
+ +The tokens returned by lexer.token() are instances +of LexToken. This object has +attributes tok.type, tok.value, +tok.lineno, and tok.lexpos. The following code shows an example of +accessing these attributes: + +
+
+# Tokenize
+while True:
+    tok = lexer.token()
+    if not tok: 
+        break      # No more input
+    print(tok.type, tok.value, tok.lineno, tok.lexpos)
+
+
+ + +The tok.type and tok.value attributes contain the +type and value of the token itself. +tok.lineno and tok.lexpos contain information about +the location of the token. tok.lexpos is the index of the +token relative to the start of the input text. + +

4.2 The tokens list

+ + +

+All lexers must provide a list tokens that defines all of the possible token +names that can be produced by the lexer. This list is always required +and is used to perform a variety of validation checks. The tokens list is also used by the +yacc.py module to identify terminals. +

+ +

+In the example, the following code specified the token names: + +

+
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+
+
+ +

4.3 Specification of tokens

+ + +Each token is specified by writing a regular expression rule compatible with Python's re module. Each of these rules +is defined by making a declaration with a special prefix t_ to indicate that it +defines a token. For simple tokens, the regular expression can +be specified as strings such as this (note: Python raw strings are used since they are the +most convenient way to write regular expression strings): + +
+
+t_PLUS = r'\+'
+
+
+ +In this case, the name following the t_ must exactly match one of the +names supplied in tokens. If some kind of action needs to be performed, +a token rule can be specified as a function. For example, this rule matches numbers and +converts the string into a Python integer. + +
+
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)
+    return t
+
+
+ +When a function is used, the regular expression rule is specified in the function documentation string. +The function always takes a single argument which is an instance of +LexToken. This object has attributes of t.type which is the token type (as a string), +t.value which is the lexeme (the actual text matched), t.lineno which is the current line number, and t.lexpos which +is the position of the token relative to the beginning of the input text. +By default, t.type is set to the name following the t_ prefix. The action +function can modify the contents of the LexToken object as appropriate. However, +when it is done, the resulting token should be returned. If no value is returned by the action +function, the token is simply discarded and the next token read. + +

+Internally, lex.py uses the re module to do its pattern matching. Patterns are compiled +using the re.VERBOSE flag which can be used to help readability. However, be aware that unescaped +whitespace is ignored and comments are allowed in this mode. If your pattern involves whitespace, make sure you +use \s. If you need to match the # character, use [#]. +

+ +

+When building the master regular expression, +rules are added in the following order: +

+ +

+

    +
  1. All tokens defined by functions are added in the same order as they appear in the lexer file. +
  2. Tokens defined by strings are added next by sorting them in order of decreasing regular expression length (longer expressions +are added first). +
+

+Without this ordering, it can be difficult to correctly match certain types of tokens. For example, if you +wanted to have separate tokens for "=" and "==", you need to make sure that "==" is checked first. By sorting regular +expressions in order of decreasing length, this problem is solved for rules defined as strings. For functions, +the order can be explicitly controlled since rules appearing first are checked first. + +

+To handle reserved words, you should write a single rule to match an +identifier and do a special name lookup in a function like this: + +

+
+reserved = {
+   'if' : 'IF',
+   'then' : 'THEN',
+   'else' : 'ELSE',
+   'while' : 'WHILE',
+   ...
+}
+
+tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values())
+
+def t_ID(t):
+    r'[a-zA-Z_][a-zA-Z_0-9]*'
+    t.type = reserved.get(t.value,'ID')    # Check for reserved words
+    return t
+
+
+ +This approach greatly reduces the number of regular expression rules and is likely to make things a little faster. + +

+Note: You should avoid writing individual rules for reserved words. For example, if you write rules like this, + +

+
+t_FOR   = r'for'
+t_PRINT = r'print'
+
+
+ +those rules will be triggered for identifiers that include those words as a prefix such as "forget" or "printed". This is probably not +what you want. + +

4.4 Token values

+ + +When tokens are returned by lex, they have a value that is stored in the value attribute. Normally, the value is the text +that was matched. However, the value can be assigned to any Python object. For instance, when lexing identifiers, you may +want to return both the identifier name and information from some sort of symbol table. To do this, you might write a rule like this: + +
+
+def t_ID(t):
+    ...
+    # Look up symbol table information and return a tuple
+    t.value = (t.value, symbol_lookup(t.value))
+    ...
+    return t
+
+
+ +It is important to note that storing data in other attribute names is not recommended. The yacc.py module only exposes the +contents of the value attribute. Thus, accessing other attributes may be unnecessarily awkward. If you +need to store multiple values on a token, assign a tuple, dictionary, or instance to value. + +

4.5 Discarded tokens

+ + +To discard a token, such as a comment, simply define a token rule that returns no value. For example: + +
+
+def t_COMMENT(t):
+    r'\#.*'
+    pass
+    # No return value. Token discarded
+
+
+ +Alternatively, you can include the prefix "ignore_" in the token declaration to force a token to be ignored. For example: + +
+
+t_ignore_COMMENT = r'\#.*'
+
+
+ +Be advised that if you are ignoring many different kinds of text, you may still want to use functions since these provide more precise +control over the order in which regular expressions are matched (i.e., functions are matched in order of specification whereas strings are +sorted by regular expression length). + +

4.6 Line numbers and positional information

+ + +

By default, lex.py knows nothing about line numbers. This is because lex.py doesn't know anything +about what constitutes a "line" of input (e.g., the newline character or even if the input is textual data). +To update this information, you need to write a special rule. In the example, the t_newline() rule shows how to do this. + +

+
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+
+Within the rule, the lineno attribute of the underlying lexer t.lexer is updated. +After the line number is updated, the token is simply discarded since nothing is returned. + +

+lex.py does not perform any kind of automatic column tracking. However, it does record positional +information related to each token in the lexpos attribute. Using this, it is usually possible to compute +column information as a separate step. For instance, just count backwards until you reach a newline. + +

+
+# Compute column. 
+#     input is the input text string
+#     token is a token instance
+def find_column(input,token):
+    last_cr = input.rfind('\n',0,token.lexpos)
+    if last_cr < 0:
+	last_cr = 0
+    column = (token.lexpos - last_cr) + 1
+    return column
+
+
+ +Since column information is often only useful in the context of error handling, calculating the column +position can be performed when needed as opposed to doing it for each token. + +

4.7 Ignored characters

+ + +

+The special t_ignore rule is reserved by lex.py for characters +that should be completely ignored in the input stream. +Usually this is used to skip over whitespace and other non-essential characters. +Although it is possible to define a regular expression rule for whitespace in a manner +similar to t_newline(), the use of t_ignore provides substantially better +lexing performance because it is handled as a special case and is checked in a much +more efficient manner than the normal regular expression rules. +

+ +

+The characters given in t_ignore are not ignored when such characters are part of +other regular expression patterns. For example, if you had a rule to capture quoted text, +that pattern can include the ignored characters (which will be captured in the normal way). The +main purpose of t_ignore is to ignore whitespace and other padding between the +tokens that you actually want to parse. +

+ +

4.8 Literal characters

+ + +

+Literal characters can be specified by defining a variable literals in your lexing module. For example: + +

+
+literals = [ '+','-','*','/' ]
+
+
+ +or alternatively + +
+
+literals = "+-*/"
+
+
+ +A literal character is simply a single character that is returned "as is" when encountered by the lexer. Literals are checked +after all of the defined regular expression rules. Thus, if a rule starts with one of the literal characters, it will always +take precedence. + +

+When a literal token is returned, both its type and value attributes are set to the character itself. For example, '+'. +

+ +

+It's possible to write token functions that perform additional actions +when literals are matched. However, you'll need to set the token type +appropriately. For example: +

+ +
+
+literals = [ '{', '}' ]
+
+def t_lbrace(t):
+    r'\{'
+    t.type = '{'      # Set token type to the expected literal
+    return t
+
+def t_rbrace(t):
+    r'\}'
+    t.type = '}'      # Set token type to the expected literal
+    return t
+
+
+ +

4.9 Error handling

+ + +

+The t_error() +function is used to handle lexing errors that occur when illegal +characters are detected. In this case, the t.value attribute contains the +rest of the input string that has not been tokenized. In the example, the error function +was defined as follows: + +

+
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+
+
+ +In this case, we simply print the offending character and skip ahead one character by calling t.lexer.skip(1). + +

4.10 EOF Handling

+ + +

+The t_eof() function is used to handle an end-of-file (EOF) condition in the input. As input, it +receives a token type 'eof' with the lineno and lexpos attributes set appropriately. +The main use of this function is to provide more input to the lexer so that it can continue to parse. Here is an +example of how this works: +

+ +
+
+# EOF handling rule
+def t_eof(t):
+    # Get more input (Example)
+    more = raw_input('... ')
+    if more:
+        self.lexer.input(more)
+        return self.lexer.token()
+    return None
+
+
+ +

+The EOF function should return the next available token (by calling self.lexer.token()) or None to +indicate no more data. Be aware that setting more input with the self.lexer.input() method does +NOT reset the lexer state or the lineno attribute used for position tracking. The lexpos +attribute is reset so be aware of that if you're using it in error reporting. +

+ +

4.11 Building and using the lexer

+ + +

+To build the lexer, the function lex.lex() is used. For example:

+ +
+
+lexer = lex.lex()
+
+
+ +

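This function +uses Python reflection (or introspection) to read the regular expression rules +out of the calling context and build the lexer. Once the lexer has been built, two methods can +be used to control the lexer: input(), which supplies the input text to be tokenized, and +token(), which returns the next token (or None when there is no more input). +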

+ + +

4.12 The @TOKEN decorator

+ + +In some applications, you may want to build tokens from a series of +more complex regular expression rules. For example: + +
+
+digit            = r'([0-9])'
+nondigit         = r'([_A-Za-z])'
+identifier       = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)'        
+
+def t_ID(t):
+    # want docstring to be identifier above. ?????
+    ...
+
+
+ +In this case, we want the regular expression rule for ID to be one of the variables above. However, there is no +way to directly specify this using a normal documentation string. To solve this problem, you can use the @TOKEN +decorator. For example: + +
+
+from ply.lex import TOKEN
+
+@TOKEN(identifier)
+def t_ID(t):
+    ...
+
+
+ +

+This will attach identifier to the docstring for t_ID() allowing lex.py to work normally. +

+ +

4.13 Optimized mode

+ + +For improved performance, it may be desirable to use Python's +optimized mode (e.g., running Python with the -O +option). However, doing so causes Python to ignore documentation +strings. This presents special problems for lex.py. To +handle this case, you can create your lexer using +the optimize option as follows: + +
+
+lexer = lex.lex(optimize=1)
+
+
+ +Next, run Python in its normal operating mode. When you do +this, lex.py will write a file called lextab.py in +the same directory as the module containing the lexer specification. +This file contains all of the regular +expression rules and tables used during lexing. On subsequent +executions, +lextab.py will simply be imported to build the lexer. This +approach substantially improves the startup time of the lexer and it +works in Python's optimized mode. + +

+To change the name of the lexer-generated module, use the lextab keyword argument. For example: +

+ +
+
+lexer = lex.lex(optimize=1,lextab="footab")
+
+
+ +When running in optimized mode, it is important to note that lex disables most error checking. Thus, this is really only recommended +if you're sure everything is working correctly and you're ready to start releasing production code. + +

4.14 Debugging

+ + +For the purpose of debugging, you can run lex() in a debugging mode as follows: + +
+
+lexer = lex.lex(debug=1)
+
+
+ +

+This will produce various sorts of debugging information including all of the added rules, +the master regular expressions used by the lexer, and tokens generated during lexing. +

+ +

+In addition, lex.py comes with a simple main function which +will either tokenize input read from standard input or from a file specified +on the command line. To use it, simply put this in your lexer: +

+ +
+
+if __name__ == '__main__':
+     lex.runmain()
+
+
+ +Please refer to the "Debugging" section near the end for some more advanced details +of debugging. + +

4.15 Alternative specification of lexers

+ + +As shown in the example, lexers are specified all within one Python module. If you want to +put token rules in a different module from the one in which you invoke lex(), use the +module keyword argument. + +

+For example, you might have a dedicated module that just contains +the token rules: + +

+
+# module: tokrules.py
+# This module just contains the lexing rules
+
+# List of token names.   This is always required
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+
+# Regular expression rules for simple tokens
+t_PLUS    = r'\+'
+t_MINUS   = r'-'
+t_TIMES   = r'\*'
+t_DIVIDE  = r'/'
+t_LPAREN  = r'\('
+t_RPAREN  = r'\)'
+
+# A regular expression rule with some action code
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)    
+    return t
+
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+# A string containing ignored characters (spaces and tabs)
+t_ignore  = ' \t'
+
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+
+
+ +Now, if you wanted to build a tokenizer from these rules from within a different module, you would do the following (shown for Python interactive mode): + +
+
+>>> import tokrules
+>>> lexer = lex.lex(module=tokrules)
+>>> lexer.input("3 + 4")
+>>> lexer.token()
+LexToken(NUMBER,3,1,0)
+>>> lexer.token()
+LexToken(PLUS,'+',1,2)
+>>> lexer.token()
+LexToken(NUMBER,4,1,4)
+>>> lexer.token()
+None
+>>>
+
+
+ +The module option can also be used to define lexers from instances of a class. For example: + +
+
+import ply.lex as lex
+
+class MyLexer(object):
+    # List of token names.   This is always required
+    tokens = (
+       'NUMBER',
+       'PLUS',
+       'MINUS',
+       'TIMES',
+       'DIVIDE',
+       'LPAREN',
+       'RPAREN',
+    )
+
+    # Regular expression rules for simple tokens
+    t_PLUS    = r'\+'
+    t_MINUS   = r'-'
+    t_TIMES   = r'\*'
+    t_DIVIDE  = r'/'
+    t_LPAREN  = r'\('
+    t_RPAREN  = r'\)'
+
+    # A regular expression rule with some action code
+    # Note addition of self parameter since we're in a class
+    def t_NUMBER(self,t):
+        r'\d+'
+        t.value = int(t.value)    
+        return t
+
+    # Define a rule so we can track line numbers
+    def t_newline(self,t):
+        r'\n+'
+        t.lexer.lineno += len(t.value)
+
+    # A string containing ignored characters (spaces and tabs)
+    t_ignore  = ' \t'
+
+    # Error handling rule
+    def t_error(self,t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+    # Build the lexer
+    def build(self,**kwargs):
+        self.lexer = lex.lex(module=self, **kwargs)
+    
+    # Test it output
+    def test(self,data):
+        self.lexer.input(data)
+        while True:
+             tok = self.lexer.token()
+             if not tok: 
+                 break
+             print(tok)
+
+# Build the lexer and try it out
+m = MyLexer()
+m.build()           # Build the lexer
+m.test("3 + 4")     # Test it
+
+
+ + + +When building a lexer from a class, you should construct the lexer from +an instance of the class, not the class object itself. This is because +PLY only works properly if the lexer actions are defined by bound-methods. + +

+When using the module option to lex(), PLY collects symbols +from the underlying object using the dir() function. There is no +direct access to the __dict__ attribute of the object supplied as a +module value.

+ +

+Finally, if you want to keep things nicely encapsulated, but don't want to use a +full-fledged class definition, lexers can be defined using closures. For example: + +

+
+import ply.lex as lex
+
+# List of token names.   This is always required
+tokens = (
+  'NUMBER',
+  'PLUS',
+  'MINUS',
+  'TIMES',
+  'DIVIDE',
+  'LPAREN',
+  'RPAREN',
+)
+
+def MyLexer():
+    # Regular expression rules for simple tokens
+    t_PLUS    = r'\+'
+    t_MINUS   = r'-'
+    t_TIMES   = r'\*'
+    t_DIVIDE  = r'/'
+    t_LPAREN  = r'\('
+    t_RPAREN  = r'\)'
+
+    # A regular expression rule with some action code
+    def t_NUMBER(t):
+        r'\d+'
+        t.value = int(t.value)    
+        return t
+
+    # Define a rule so we can track line numbers
+    def t_newline(t):
+        r'\n+'
+        t.lexer.lineno += len(t.value)
+
+    # A string containing ignored characters (spaces and tabs)
+    t_ignore  = ' \t'
+
+    # Error handling rule
+    def t_error(t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+    # Build the lexer from my environment and return it    
+    return lex.lex()
+
+
+ +

+Important note: If you are defining a lexer using a class or closure, be aware that PLY still requires you to only +define a single lexer per module (source file). There are extensive validation/error checking parts of PLY that +may falsely report error messages if you don't follow this rule. +

+ +

4.16 Maintaining state

+ + +In your lexer, you may want to maintain a variety of state +information. This might include mode settings, symbol tables, and +other details. As an example, suppose that you wanted to keep +track of how many NUMBER tokens had been encountered. + +

+One way to do this is to keep a set of global variables in the module +where you created the lexer. For example: + +

+
+num_count = 0
+def t_NUMBER(t):
+    r'\d+'
+    global num_count
+    num_count += 1
+    t.value = int(t.value)    
+    return t
+
+
+ +If you don't like the use of a global variable, another place to store +information is inside the Lexer object created by lex(). +To do this, you can use the lexer attribute of tokens passed to +the various rules. For example: + +
+
+def t_NUMBER(t):
+    r'\d+'
+    t.lexer.num_count += 1     # Note use of lexer attribute
+    t.value = int(t.value)    
+    return t
+
+lexer = lex.lex()
+lexer.num_count = 0            # Set the initial count
+
+
+ +This latter approach has the advantage of being simple and working +correctly in applications where multiple instantiations of a given +lexer exist in the same application. However, this might also feel +like a gross violation of encapsulation to OO purists. +Just to put your mind at some ease, all +internal attributes of the lexer (with the exception of lineno) have names that are prefixed +by lex (e.g., lexdata,lexpos, etc.). Thus, +it is perfectly safe to store attributes in the lexer that +don't have names starting with that prefix or a name that conflicts with one of the +predefined methods (e.g., input(), token(), etc.). + +

+If you don't like assigning values on the lexer object, you can define your lexer as a class as +shown in the previous section: + +

+
+class MyLexer:
+    ...
+    def t_NUMBER(self,t):
+        r'\d+'
+        self.num_count += 1
+        t.value = int(t.value)    
+        return t
+
+    def build(self, **kwargs):
+        self.lexer = lex.lex(object=self,**kwargs)
+
+    def __init__(self):
+        self.num_count = 0
+
+
+ +The class approach may be the easiest to manage if your application is +going to be creating multiple instances of the same lexer and you need +to manage a lot of state. + +

+State can also be managed through closures. For example, in Python 3: + +

+
+def MyLexer():
+    num_count = 0
+    ...
+    def t_NUMBER(t):
+        r'\d+'
+        nonlocal num_count
+        num_count += 1
+        t.value = int(t.value)    
+        return t
+    ...
+
+
+ +

4.17 Lexer cloning

+ + +

+If necessary, a lexer object can be duplicated by invoking its clone() method. For example: + +

+
+lexer = lex.lex()
+...
+newlexer = lexer.clone()
+
+
+ +When a lexer is cloned, the copy is exactly identical to the original lexer +including any input text and internal state. However, the clone allows a +different set of input text to be supplied which may be processed separately. +This may be useful in situations when you are writing a parser/compiler that +involves recursive or reentrant processing. For instance, if you +needed to scan ahead in the input for some reason, you could create a +clone and use it to look ahead. Or, if you were implementing some kind of preprocessor, +cloned lexers could be used to handle different input files. + +

+Creating a clone is different than calling lex.lex() in that +PLY doesn't regenerate any of the internal tables or regular expressions. + +

+Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this: + +

+
+m = MyLexer()
+a = lex.lex(object=m)      # Create a lexer
+
+b = a.clone()              # Clone the lexer
+
+
+ +Then both a and b are going to be bound to the same +object m and any changes to m will be reflected in both lexers. It's +important to emphasize that clone() is only meant to create a new lexer +that reuses the regular expressions and environment of another lexer. If you +need to make a totally new copy of a lexer, then call lex() again. + +

4.18 Internal lexer state

+ + +A Lexer object lexer has a number of internal attributes that may be useful in certain +situations. + +

+lexer.lexpos +

+This attribute is an integer that contains the current position within the input text. If you modify +the value, it will change the result of the next call to token(). Within token rule functions, this points +to the first character after the matched text. If the value is modified within a rule, the next returned token will be +matched at the new position. +
+ +

+lexer.lineno +

+The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute +exists---it never sets, updates, or performs any processing with it. If you want to track line numbers, +you will need to add code yourself (see the section on line numbers and positional information). +
+ +

+lexer.lexdata +

+The current input text stored in the lexer. This is the string passed with the input() method. It +would probably be a bad idea to modify this unless you really know what you're doing. +
+ +

+lexer.lexmatch +

+This is the raw Match object returned by the Python re.match() function (used internally by PLY) for the +current token. If you have written a regular expression that contains named groups, you can use this to retrieve those values. +Note: This attribute is only updated when tokens are defined and processed by functions. +
+ +

4.19 Conditional lexing and start conditions

+ + +In advanced parsing applications, it may be useful to have different +lexing states. For instance, you may want the occurrence of a certain +token or syntactic construct to trigger a different kind of lexing. +PLY supports a feature that allows the underlying lexer to be put into +a series of different states. Each state can have its own tokens, +lexing rules, and so forth. The implementation is based largely on +the "start condition" feature of GNU flex. Details of this can be found +at http://flex.sourceforge.net/manual/Start-Conditions.html. + +

+To define a new lexing state, it must first be declared. This is done by including a "states" declaration in your +lex file. For example: + +

+
+states = (
+   ('foo','exclusive'),
+   ('bar','inclusive'),
+)
+
+
+ +This declaration declares two states, 'foo' +and 'bar'. States may be of two types: 'exclusive' +and 'inclusive'. An exclusive state completely overrides the +default behavior of the lexer. That is, lex will only return tokens +and apply rules defined specifically for that state. An inclusive +state adds additional tokens and rules to the default set of rules. +Thus, lex will return the tokens defined by default in addition +to those defined for the inclusive state. + +

+Once a state has been declared, tokens and rules are declared by including the +state name in token/rule declaration. For example: + +

+
+t_foo_NUMBER = r'\d+'                      # Token 'NUMBER' in state 'foo'        
+t_bar_ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'   # Token 'ID' in state 'bar'
+
+def t_foo_newline(t):
+    r'\n'
+    t.lexer.lineno += 1
+
+
+ +A token can be declared in multiple states by including multiple state names in the declaration. For example: + +
+
+t_foo_bar_NUMBER = r'\d+'         # Defines token 'NUMBER' in both state 'foo' and 'bar'
+
+
+ +Alternatively, a token can be declared in all states by using 'ANY' in the name. + +
+
+t_ANY_NUMBER = r'\d+'         # Defines a token 'NUMBER' in all states
+
+
+ +If no state name is supplied, as is normally the case, the token is associated with a special state 'INITIAL'. For example, +these two declarations are identical: + +
+
+t_NUMBER = r'\d+'
+t_INITIAL_NUMBER = r'\d+'
+
+
+ +

+States are also associated with the special t_ignore, t_error(), and t_eof() declarations. For example, if a state treats +these differently, you can declare:

+ +
+
+t_foo_ignore = " \t\n"       # Ignored characters for state 'foo'
+
+def t_bar_error(t):          # Special error handler for state 'bar'
+    pass 
+
+
+ +By default, lexing operates in the 'INITIAL' state. This state includes all of the normally defined tokens. +For users who aren't using different states, this fact is completely transparent. If, during lexing or parsing, you want to change +the lexing state, use the begin() method. For example: + +
+
+def t_begin_foo(t):
+    r'start_foo'
+    t.lexer.begin('foo')             # Starts 'foo' state
+
+
+ +To get out of a state, you use begin() to switch back to the initial state. For example: + +
+
+def t_foo_end(t):
+    r'end_foo'
+    t.lexer.begin('INITIAL')        # Back to the initial state
+
+
+ +The management of states can also be done with a stack. For example: + +
+
+def t_begin_foo(t):
+    r'start_foo'
+    t.lexer.push_state('foo')             # Starts 'foo' state
+
+def t_foo_end(t):
+    r'end_foo'
+    t.lexer.pop_state()                   # Back to the previous state
+
+
+ +

+The use of a stack would be useful in situations where there are many ways of entering a new lexing state and you merely want to go back +to the previous state afterwards. + +

+An example might help clarify. Suppose you were writing a parser and you wanted to grab sections of arbitrary C code enclosed by +curly braces. That is, whenever you encounter a starting brace '{', you want to read all of the enclosed code up to the ending brace '}' +and return it as a string. Doing this with a normal regular expression rule is nearly (if not actually) impossible. This is because braces can +be nested and can be included in comments and strings. Thus, simply matching up to the first matching '}' character isn't good enough. Here is how +you might use lexer states to do this: + +

+
+# Declare the state
+states = (
+  ('ccode','exclusive'),
+)
+
+# Match the first {. Enter ccode state.
+def t_ccode(t):
+    r'\{'
+    t.lexer.code_start = t.lexer.lexpos        # Record the starting position
+    t.lexer.level = 1                          # Initial brace level
+    t.lexer.begin('ccode')                     # Enter 'ccode' state
+
+# Rules for the ccode state
+def t_ccode_lbrace(t):     
+    r'\{'
+    t.lexer.level +=1                
+
+def t_ccode_rbrace(t):
+    r'\}'
+    t.lexer.level -=1
+
+    # If closing brace, return the code fragment
+    if t.lexer.level == 0:
+         t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
+         t.type = "CCODE"
+         t.lexer.lineno += t.value.count('\n')
+         t.lexer.begin('INITIAL')           
+         return t
+
+# C or C++ comment (ignore)    
+def t_ccode_comment(t):
+    r'(/\*(.|\n)*?\*/)|(//.*)'
+    pass
+
+# C string
+def t_ccode_string(t):
+   r'\"([^\\\n]|(\\.))*?\"'
+
+# C character literal
+def t_ccode_char(t):
+   r'\'([^\\\n]|(\\.))*?\''
+
+# Any sequence of non-whitespace characters (not braces, strings)
+def t_ccode_nonspace(t):
+   r'[^\s\{\}\'\"]+'
+
+# Ignored characters (whitespace)
+t_ccode_ignore = " \t\n"
+
+# For bad characters, we just skip over it
+def t_ccode_error(t):
+    t.lexer.skip(1)
+
+
+ +In this example, the occurrence of the first '{' causes the lexer to record the starting position and enter a new state 'ccode'. A collection of rules then match +various parts of the input that follow (comments, strings, etc.). All of these rules merely discard the token (by not returning a value). +However, if the closing right brace is encountered, the rule t_ccode_rbrace collects all of the code (using the earlier recorded starting +position), stores it, and returns a token 'CCODE' containing all of that text. When returning the token, the lexing state is restored back to its +initial state. + +

4.20 Miscellaneous Issues

+ + +

+

  • The lexer requires input to be supplied as a single input string. Since most machines have more than enough memory, this +rarely presents a performance concern. However, it means that the lexer currently can't be used with streaming data +such as open files or sockets. This limitation is primarily a side-effect of using the re module. You might be +able to work around this by implementing an appropriate def t_eof() end-of-file handling rule. The main complication +here is that you'll probably need to ensure that data is fed to the lexer in a way so that it doesn't split in the middle +of a token.

    + +

    +

  • The lexer should work properly with both Unicode strings given as token and pattern matching rules as +well as for input text. + +

    +

  • If you need to supply optional flags to the re.compile() function, use the reflags option to lex. For example: + +
    +
    +lex.lex(reflags=re.UNICODE)
    +
    +
    + +

    +

  • Since the lexer is written entirely in Python, its performance is +largely determined by that of the Python re module. Although +the lexer has been written to be as efficient as possible, it's not +blazingly fast when used on very large input files. If +performance is a concern, you might consider upgrading to the most +recent version of Python, creating a hand-written lexer, or offloading +the lexer into a C extension module. + +

    +If you are going to create a hand-written lexer and you plan to use it with yacc.py, +it only needs to conform to the following requirements: + +

      +
    • It must provide a token() method that returns the next token or None if no more +tokens are available. +
    • The token() method must return an object tok that has type and value attributes. If +line number tracking is being used, then the token should also define a lineno attribute. +
    + +

    5. Parsing basics

    + + +yacc.py is used to parse language syntax. Before showing an +example, there are a few important bits of background that must be +mentioned. First, syntax is usually specified in terms of a BNF grammar. +For example, if you wanted to parse +simple arithmetic expressions, you might first write an unambiguous +grammar specification like this: + +
    +
     
    +expression : expression + term
    +           | expression - term
    +           | term
    +
    +term       : term * factor
    +           | term / factor
    +           | factor
    +
    +factor     : NUMBER
    +           | ( expression )
    +
    +
    + +In the grammar, symbols such as NUMBER, +, -, *, and / are known +as terminals and correspond to raw input tokens. Identifiers such as term and factor refer to +grammar rules comprised of a collection of terminals and other rules. These identifiers are known as non-terminals. +

    + +The semantic behavior of a language is often specified using a +technique known as syntax directed translation. In syntax directed +translation, attributes are attached to each symbol in a given grammar +rule along with an action. Whenever a particular grammar rule is +recognized, the action describes what to do. For example, given the +expression grammar above, you might write the specification for a +simple calculator like this: + +

    +
     
    +Grammar                             Action
    +--------------------------------    -------------------------------------------- 
    +expression0 : expression1 + term    expression0.val = expression1.val + term.val
    +            | expression1 - term    expression0.val = expression1.val - term.val
    +            | term                  expression0.val = term.val
    +
    +term0       : term1 * factor        term0.val = term1.val * factor.val
    +            | term1 / factor        term0.val = term1.val / factor.val
    +            | factor                term0.val = factor.val
    +
    +factor      : NUMBER                factor.val = int(NUMBER.lexval)
    +            | ( expression )        factor.val = expression.val
    +
    +
    + +A good way to think about syntax directed translation is to +view each symbol in the grammar as a kind of object. Associated +with each symbol is a value representing its "state" (for example, the +val attribute above). Semantic +actions are then expressed as a collection of functions or methods +that operate on the symbols and associated values. + +

    +Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a +bottom up technique that tries to recognize the right-hand-side of various grammar rules. +Whenever a valid right-hand-side is found in the input, the appropriate action code is triggered and the +grammar symbols are replaced by the grammar symbol on the left-hand-side. + +

    +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns that +match one of the grammar rules. +The details of the algorithm can be found in a compiler textbook, but the +following example illustrates the steps that are performed if you +wanted to parse the expression +3 + 5 * (10 - 20) using the grammar defined above. In the example, +the special symbol $ represents the end of input. + + +

    +
    +Step Symbol Stack           Input Tokens            Action
    +---- ---------------------  ---------------------   -------------------------------
    +1                           3 + 5 * ( 10 - 20 )$    Shift 3
    +2    3                        + 5 * ( 10 - 20 )$    Reduce factor : NUMBER
    +3    factor                   + 5 * ( 10 - 20 )$    Reduce term   : factor
    +4    term                     + 5 * ( 10 - 20 )$    Reduce expr : term
    +5    expr                     + 5 * ( 10 - 20 )$    Shift +
    +6    expr +                     5 * ( 10 - 20 )$    Shift 5
    +7    expr + 5                     * ( 10 - 20 )$    Reduce factor : NUMBER
    +8    expr + factor                * ( 10 - 20 )$    Reduce term   : factor
    +9    expr + term                  * ( 10 - 20 )$    Shift *
    +10   expr + term *                  ( 10 - 20 )$    Shift (
    +11   expr + term * (                  10 - 20 )$    Shift 10
    +12   expr + term * ( 10                  - 20 )$    Reduce factor : NUMBER
    +13   expr + term * ( factor              - 20 )$    Reduce term : factor
    +14   expr + term * ( term                - 20 )$    Reduce expr : term
    +15   expr + term * ( expr                - 20 )$    Shift -
    +16   expr + term * ( expr -                20 )$    Shift 20
    +17   expr + term * ( expr - 20                )$    Reduce factor : NUMBER
    +18   expr + term * ( expr - factor            )$    Reduce term : factor
    +19   expr + term * ( expr - term              )$    Reduce expr : expr - term
    +20   expr + term * ( expr                     )$    Shift )
    +21   expr + term * ( expr )                    $    Reduce factor : (expr)
    +22   expr + term * factor                      $    Reduce term : term * factor
    +23   expr + term                               $    Reduce expr : expr + term
    +24   expr                                      $    Reduce expr
    +25                                             $    Success!
    +
    +
    + +When parsing the expression, an underlying state machine and the +current input token determine what happens next. If the next token +looks like part of a valid grammar rule (based on other items on the +stack), it is generally shifted onto the stack. If the top of the +stack contains a valid right-hand-side of a grammar rule, it is +usually "reduced" and the symbols replaced with the symbol on the +left-hand-side. When this reduction occurs, the appropriate action is +triggered (if defined). If the input token can't be shifted and the +top of stack doesn't match any grammar rules, a syntax error has +occurred and the parser must take some kind of recovery step (or bail +out). A parse is only successful if the parser reaches a state where +the symbol stack is empty and there are no more input tokens. + +

    +It is important to note that the underlying implementation is built +around a large finite-state machine that is encoded in a collection of +tables. The construction of these tables is non-trivial and +beyond the scope of this discussion. However, subtle details of this +process explain why, in the example above, the parser chooses to shift +a token onto the stack in step 9 rather than reducing the +rule expr : expr + term. + +

    6. Yacc

    + + +The ply.yacc module implements the parsing component of PLY. +The name "yacc" stands for "Yet Another Compiler Compiler" and is +borrowed from the Unix tool of the same name. + +

    6.1 An example

    + + +Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. Here is +how you would do it with yacc.py: + +
    +
    +# Yacc example
    +
    +import ply.yacc as yacc
    +
    +# Get the token map from the lexer.  This is required.
    +from calclex import tokens
    +
    +def p_expression_plus(p):
    +    'expression : expression PLUS term'
    +    p[0] = p[1] + p[3]
    +
    +def p_expression_minus(p):
    +    'expression : expression MINUS term'
    +    p[0] = p[1] - p[3]
    +
    +def p_expression_term(p):
    +    'expression : term'
    +    p[0] = p[1]
    +
    +def p_term_times(p):
    +    'term : term TIMES factor'
    +    p[0] = p[1] * p[3]
    +
    +def p_term_div(p):
    +    'term : term DIVIDE factor'
    +    p[0] = p[1] / p[3]
    +
    +def p_term_factor(p):
    +    'term : factor'
    +    p[0] = p[1]
    +
    +def p_factor_num(p):
    +    'factor : NUMBER'
    +    p[0] = p[1]
    +
    +def p_factor_expr(p):
    +    'factor : LPAREN expression RPAREN'
    +    p[0] = p[2]
    +
    +# Error rule for syntax errors
    +def p_error(p):
    +    print("Syntax error in input!")
    +
    +# Build the parser
    +parser = yacc.yacc()
    +
    +while True:
    +   try:
    +       s = raw_input('calc > ')
    +   except EOFError:
    +       break
    +   if not s: continue
    +   result = parser.parse(s)
    +   print(result)
    +
    +
    + +In this example, each grammar rule is defined by a Python function +where the docstring to that function contains the appropriate +context-free grammar specification. The statements that make up the +function body implement the semantic actions of the rule. Each function +accepts a single argument p that is a sequence containing the +values of each grammar symbol in the corresponding rule. The values +of p[i] are mapped to grammar symbols as shown here: + +
    +
    +def p_expression_plus(p):
    +    'expression : expression PLUS term'
    +    #   ^            ^        ^    ^
    +    #  p[0]         p[1]     p[2] p[3]
    +
    +    p[0] = p[1] + p[3]
    +
    +
    + +

    +For tokens, the "value" of the corresponding p[i] is the +same as the p.value attribute assigned in the lexer +module. For non-terminals, the value is determined by whatever is +placed in p[0] when rules are reduced. This value can be +anything at all. However, it is probably most common for the value to be +a simple Python type, a tuple, or an instance. In this example, we +are relying on the fact that the NUMBER token stores an +integer value in its value field. All of the other rules simply +perform various types of integer operations and propagate the result. +

    + +

    +Note: The use of negative indices has a special meaning in +yacc---specifically, p[-1] does not have the same value +as p[3] in this example. Please see the section on "Embedded +Actions" for further details. +

    + +

    +The first rule defined in the yacc specification determines the +starting grammar symbol (in this case, a rule for expression +appears first). Whenever the starting rule is reduced by the parser +and no more input is available, parsing stops and the final value is +returned (this value will be whatever the top-most rule placed +in p[0]). Note: an alternative starting symbol can be +specified using the start keyword argument to +yacc(). + +

    The p_error(p) rule is defined to catch syntax errors. +See the error handling section below for more detail. + +

    +To build the parser, call the yacc.yacc() function. This +function looks at the module and attempts to construct all of the LR +parsing tables for the grammar you have specified. The first +time yacc.yacc() is invoked, you will get a message such as +this: + +

    +
    +$ python calcparse.py
    +Generating LALR tables
    +calc > 
    +
    +
    + +

    +Since table construction is relatively expensive (especially for large +grammars), the resulting parsing table is written to +a file called parsetab.py. In addition, a +debugging file called parser.out is created. On subsequent +executions, yacc will reload the table from +parsetab.py unless it has detected a change in the underlying +grammar (in which case the tables and parsetab.py file are +regenerated). Both of these files are written to the same directory +as the module in which the parser is specified. +The name of the parsetab module can be changed using the +tabmodule keyword argument to yacc(). For example: +

    + +
    +
    +parser = yacc.yacc(tabmodule='fooparsetab')
    +
    +
    + +

    +If any errors are detected in your grammar specification, yacc.py will produce +diagnostic messages and possibly raise an exception. Some of the errors that can be detected include: + +

      +
    • Duplicated function names (if more than one rule function has the same name in the grammar file). +
    • Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars. +
    • Badly specified grammar rules. +
    • Infinite recursion (rules that can never terminate). +
    • Unused rules and tokens +
    • Undefined rules and tokens +
    + +The next few sections discuss grammar specification in more detail. + +

    +The final part of the example shows how to actually run the parser +created by +yacc(). To run the parser, you simply have to call +the parse() method with a string of input text. This will run all +of the grammar rules and return the result of the entire parse. The +returned result is the value assigned to p[0] in the starting +grammar rule. + +

    6.2 Combining Grammar Rule Functions

    + + +When grammar rules are similar, they can be combined into a single function. +For example, consider the two rules in our earlier example: + +
    +
    +def p_expression_plus(p):
    +    'expression : expression PLUS term'
    +    p[0] = p[1] + p[3]
    +
    +def p_expression_minus(p):
    +    'expression : expression MINUS term'
    +    p[0] = p[1] - p[3]
    +
    +
    + +Instead of writing two functions, you might write a single function like this: + +
    +
    +def p_expression(p):
    +    '''expression : expression PLUS term
    +                  | expression MINUS term'''
    +    if p[2] == '+':
    +        p[0] = p[1] + p[3]
    +    elif p[2] == '-':
    +        p[0] = p[1] - p[3]
    +
    +
    + +In general, the doc string for any given function can contain multiple grammar rules. So, it would +have also been legal (although possibly confusing) to write this: + +
    +
    +def p_binary_operators(p):
    +    '''expression : expression PLUS term
    +                  | expression MINUS term
    +       term       : term TIMES factor
    +                  | term DIVIDE factor'''
    +    if p[2] == '+':
    +        p[0] = p[1] + p[3]
    +    elif p[2] == '-':
    +        p[0] = p[1] - p[3]
    +    elif p[2] == '*':
    +        p[0] = p[1] * p[3]
    +    elif p[2] == '/':
    +        p[0] = p[1] / p[3]
    +
    +
    + +When combining grammar rules into a single function, it is usually a good idea for all of the rules to have +a similar structure (e.g., the same number of terms). Otherwise, the corresponding action code may be more +complicated than necessary. However, it is possible to handle simple cases using len(). For example: + +
    +
    +def p_expressions(p):
    +    '''expression : expression MINUS expression
    +                  | MINUS expression'''
    +    if (len(p) == 4):
    +        p[0] = p[1] - p[3]
    +    elif (len(p) == 3):
    +        p[0] = -p[2]
    +
    +
    + +If parsing performance is a concern, you should resist the urge to put +too much conditional processing into a single grammar rule as shown in +these examples. When you add checks to see which grammar rule is +being handled, you are actually duplicating the work that the parser +has already performed (i.e., the parser already knows exactly what rule it +matched). You can eliminate this overhead by using a +separate p_rule() function for each grammar rule. + +

    6.3 Character Literals

    + + +If desired, a grammar may contain tokens defined as single character literals. For example: + +
    +
    +def p_binary_operators(p):
    +    '''expression : expression '+' term
    +                  | expression '-' term
    +       term       : term '*' factor
    +                  | term '/' factor'''
    +    if p[2] == '+':
    +        p[0] = p[1] + p[3]
    +    elif p[2] == '-':
    +        p[0] = p[1] - p[3]
    +    elif p[2] == '*':
    +        p[0] = p[1] * p[3]
    +    elif p[2] == '/':
    +        p[0] = p[1] / p[3]
    +
    +
    + +A character literal must be enclosed in quotes such as '+'. In addition, if literals are used, they must be declared in the +corresponding lex file through the use of a special literals declaration. + +
    +
    +# Literals.  Should be placed in module given to lex()
    +literals = ['+','-','*','/' ]
    +
    +
    + +Character literals are limited to a single character. Thus, it is not legal to specify literals such as '<=' or '=='. For this, use +the normal lexing rules (e.g., define a rule such as t_EQ = r'=='). + +

    6.4 Empty Productions

    + + +yacc.py can handle empty productions by defining a rule like this: + +
    +
    +def p_empty(p):
    +    'empty :'
    +    pass
    +
    +
    + +Now to use the empty production, simply use 'empty' as a symbol. For example: + +
    +
    +def p_optitem(p):
    +    '''optitem : item
    +               | empty'''
    +    ...
    +
    +
    + +Note: You can write empty rules anywhere by simply specifying an empty +right hand side. However, I personally find that writing an "empty" +rule and using "empty" to denote an empty production is easier to read +and more clearly states your intentions. + +

    6.5 Changing the starting symbol

    + + +Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule). To change this, simply +supply a start specifier in your file. For example: + +
    +
    +start = 'foo'
    +
    +def p_bar(p):
    +    'bar : A B'
    +
    +# This is the starting rule due to the start specifier above
    +def p_foo(p):
    +    'foo : bar X'
    +...
    +
    +
    + +The use of a start specifier may be useful during debugging +since you can use it to have yacc build a subset of a larger grammar. +For this purpose, it is also possible to specify a starting symbol as +an argument to yacc(). For example: + +
    +
    +parser = yacc.yacc(start='foo')
    +
    +
    + +

    6.6 Dealing With Ambiguous Grammars

    + + +The expression grammar given in the earlier example has been written +in a special format to eliminate ambiguity. However, in many +situations, it is extremely difficult or awkward to write grammars in +this format. A much more natural way to express the grammar is in a +more compact form like this: + +
    +
    +expression : expression PLUS expression
    +           | expression MINUS expression
    +           | expression TIMES expression
    +           | expression DIVIDE expression
    +           | LPAREN expression RPAREN
    +           | NUMBER
    +
    +
    + +Unfortunately, this grammar specification is ambiguous. For example, +if you are parsing the string "3 * 4 + 5", there is no way to tell how +the operators are supposed to be grouped. For example, does the +expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? + +

    +When an ambiguous grammar is given to yacc.py it will print +messages about "shift/reduce conflicts" or "reduce/reduce conflicts". +A shift/reduce conflict is caused when the parser generator can't +decide whether or not to reduce a rule or shift a symbol on the +parsing stack. For example, consider the string "3 * 4 + 5" and the +internal parsing stack: + +

    +
    +Step Symbol Stack           Input Tokens            Action
    +---- ---------------------  ---------------------   -------------------------------
    +1    $                                3 * 4 + 5$    Shift 3
    +2    $ 3                                * 4 + 5$    Reduce : expression : NUMBER
    +3    $ expr                             * 4 + 5$    Shift *
    +4    $ expr *                             4 + 5$    Shift 4
    +5    $ expr * 4                             + 5$    Reduce: expression : NUMBER
    +6    $ expr * expr                          + 5$    SHIFT/REDUCE CONFLICT ????
    +
    +
    + +In this case, when the parser reaches step 6, it has two options. One +is to reduce the rule expr : expr * expr on the stack. The +other option is to shift the token + on the stack. Both +options are perfectly legal from the rules of the +context-free-grammar. + +

    +By default, all shift/reduce conflicts are resolved in favor of +shifting. Therefore, in the above example, the parser will always +shift the + instead of reducing. Although this strategy +works in many cases (for example, the case of +"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, +in the above example, the decision to shift + is completely +wrong---we should have reduced expr * expr since +multiplication has higher mathematical precedence than addition. + +

    To resolve ambiguity, especially in expression +grammars, yacc.py allows individual tokens to be assigned a +precedence level and associativity. This is done by adding a variable +precedence to the grammar file like this: + +

    +
    +precedence = (
    +    ('left', 'PLUS', 'MINUS'),
    +    ('left', 'TIMES', 'DIVIDE'),
    +)
    +
    +
    + +This declaration specifies that PLUS/MINUS have the +same precedence level and are left-associative and that +TIMES/DIVIDE have the same precedence and are +left-associative. Within the precedence declaration, tokens +are ordered from lowest to highest precedence. Thus, this declaration +specifies that TIMES/DIVIDE have higher precedence +than PLUS/MINUS (since they appear later in the +precedence specification). + +

    +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get: + +

    +
    +PLUS      : level = 1,  assoc = 'left'
    +MINUS     : level = 1,  assoc = 'left'
    +TIMES     : level = 2,  assoc = 'left'
    +DIVIDE    : level = 2,  assoc = 'left'
    +
    +
    + +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. This is always +determined by looking at the precedence of the right-most terminal +symbol. For example: + +
    +
    +expression : expression PLUS expression                 # level = 1, left
    +           | expression MINUS expression                # level = 1, left
    +           | expression TIMES expression                # level = 2, left
    +           | expression DIVIDE expression               # level = 2, left
    +           | LPAREN expression RPAREN                   # level = None (not specified)
    +           | NUMBER                                     # level = None (not specified)
    +
    +
    + +When shift/reduce conflicts are encountered, the parser generator resolves the conflict by +looking at the precedence rules and associativity specifiers. + +

    +

      +
    1. If the current token has higher precedence than the rule on the stack, it is shifted. +
    2. If the grammar rule on the stack has higher precedence, the rule is reduced. +
    3. If the current token and the grammar rule have the same precedence, the +rule is reduced for left associativity, whereas the token is shifted for right associativity. +
    4. If nothing is known about the precedence, shift/reduce conflicts are resolved in +favor of shifting (the default). +
    + +For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES." + +

    +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), yacc.py will +report no errors or conflicts in the grammar (although it will print +some information in the parser.out debugging file). + +

    +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. To +deal with this, precedence rules can be given for so-called "fictitious tokens" +like this: + +

    +
    +precedence = (
    +    ('left', 'PLUS', 'MINUS'),
    +    ('left', 'TIMES', 'DIVIDE'),
    +    ('right', 'UMINUS'),            # Unary minus operator
    +)
    +
    +
    + +Now, in the grammar file, we can write our unary minus rule like this: + +
    +
    +def p_expr_uminus(p):
    +    'expression : MINUS expression %prec UMINUS'
    +    p[0] = -p[2]
    +
    +
    + +In this case, %prec UMINUS overrides the default rule precedence--setting it to that +of UMINUS in the precedence specifier. + +

    +At first, the use of UMINUS in this example may appear very confusing. +UMINUS is not an input token or a grammar rule. Instead, you should +think of it as the name of a special marker in the precedence table. When you use the %prec qualifier, you're simply +telling yacc that you want the precedence of the expression to be the same as for this special marker instead of the usual precedence. + +

    +It is also possible to specify non-associativity in the precedence table. This would +be used when you don't want operations to chain together. For example, suppose +you wanted to support comparison operators like < and > but you didn't want to allow +combinations like a < b < c. To do this, simply specify a rule like this: + +

    +
    +precedence = (
    +    ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
    +    ('left', 'PLUS', 'MINUS'),
    +    ('left', 'TIMES', 'DIVIDE'),
    +    ('right', 'UMINUS'),            # Unary minus operator
    +)
    +
    +
    + +

    +If you do this, the occurrence of input text such as a < b < c will result in a syntax error. However, simple +expressions such as a < b will still be fine. + +

    +Reduce/reduce conflicts are caused when there are multiple grammar +rules that can be applied to a given set of symbols. This kind of +conflict is almost always bad and is always resolved by picking the +rule that appears first in the grammar file. Reduce/reduce conflicts +are almost always caused when different sets of grammar rules somehow +generate the same set of symbols. For example: + +

    +
    +assignment :  ID EQUALS NUMBER
    +           |  ID EQUALS expression
    +           
    +expression : expression PLUS expression
    +           | expression MINUS expression
    +           | expression TIMES expression
    +           | expression DIVIDE expression
    +           | LPAREN expression RPAREN
    +           | NUMBER
    +
    +
    + +In this case, a reduce/reduce conflict exists between these two rules: + +
    +
    +assignment  : ID EQUALS NUMBER
    +expression  : NUMBER
    +
    +
    + +For example, if you wrote "a = 5", the parser can't figure out if this +is supposed to be reduced as assignment : ID EQUALS NUMBER or +whether it's supposed to reduce the 5 as an expression and then reduce +the rule assignment : ID EQUALS expression. + +

    +It should be noted that reduce/reduce conflicts are notoriously +difficult to spot simply by looking at the input grammar. When a +reduce/reduce conflict occurs, yacc() will try to help by +printing a warning message such as this: + +

    +
    +WARNING: 1 reduce/reduce conflict
    +WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER)
    +WARNING: rejected rule (expression -> NUMBER)
    +
    +
    + +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. To try +and figure it out, you'll probably have to look at your grammar and +the contents of the +parser.out debugging file with an appropriately high level of +caffeination. + +

    6.7 The parser.out file

    + + +Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR +parsing algorithm. To assist in debugging, yacc.py creates a debugging file called +'parser.out' when it generates the parsing table. The contents of this file look like the following: + +
    +
    +Unused terminals:
    +
    +
    +Grammar
    +
    +Rule 1     expression -> expression PLUS expression
    +Rule 2     expression -> expression MINUS expression
    +Rule 3     expression -> expression TIMES expression
    +Rule 4     expression -> expression DIVIDE expression
    +Rule 5     expression -> NUMBER
    +Rule 6     expression -> LPAREN expression RPAREN
    +
    +Terminals, with rules where they appear
    +
    +TIMES                : 3
    +error                : 
    +MINUS                : 2
    +RPAREN               : 6
    +LPAREN               : 6
    +DIVIDE               : 4
    +PLUS                 : 1
    +NUMBER               : 5
    +
    +Nonterminals, with rules where they appear
    +
    +expression           : 1 1 2 2 3 3 4 4 6 0
    +
    +
    +Parsing method: LALR
    +
    +
    +state 0
    +
    +    S' -> . expression
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 1
    +
    +    S' -> expression .
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    PLUS            shift and go to state 6
    +    MINUS           shift and go to state 5
    +    TIMES           shift and go to state 4
    +    DIVIDE          shift and go to state 7
    +
    +
    +state 2
    +
    +    expression -> LPAREN . expression RPAREN
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 3
    +
    +    expression -> NUMBER .
    +
    +    $               reduce using rule 5
    +    PLUS            reduce using rule 5
    +    MINUS           reduce using rule 5
    +    TIMES           reduce using rule 5
    +    DIVIDE          reduce using rule 5
    +    RPAREN          reduce using rule 5
    +
    +
    +state 4
    +
    +    expression -> expression TIMES . expression
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 5
    +
    +    expression -> expression MINUS . expression
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 6
    +
    +    expression -> expression PLUS . expression
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 7
    +
    +    expression -> expression DIVIDE . expression
    +    expression -> . expression PLUS expression
    +    expression -> . expression MINUS expression
    +    expression -> . expression TIMES expression
    +    expression -> . expression DIVIDE expression
    +    expression -> . NUMBER
    +    expression -> . LPAREN expression RPAREN
    +
    +    NUMBER          shift and go to state 3
    +    LPAREN          shift and go to state 2
    +
    +
    +state 8
    +
    +    expression -> LPAREN expression . RPAREN
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    RPAREN          shift and go to state 13
    +    PLUS            shift and go to state 6
    +    MINUS           shift and go to state 5
    +    TIMES           shift and go to state 4
    +    DIVIDE          shift and go to state 7
    +
    +
    +state 9
    +
    +    expression -> expression TIMES expression .
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    $               reduce using rule 3
    +    PLUS            reduce using rule 3
    +    MINUS           reduce using rule 3
    +    TIMES           reduce using rule 3
    +    DIVIDE          reduce using rule 3
    +    RPAREN          reduce using rule 3
    +
    +  ! PLUS            [ shift and go to state 6 ]
    +  ! MINUS           [ shift and go to state 5 ]
    +  ! TIMES           [ shift and go to state 4 ]
    +  ! DIVIDE          [ shift and go to state 7 ]
    +
    +state 10
    +
    +    expression -> expression MINUS expression .
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    $               reduce using rule 2
    +    PLUS            reduce using rule 2
    +    MINUS           reduce using rule 2
    +    RPAREN          reduce using rule 2
    +    TIMES           shift and go to state 4
    +    DIVIDE          shift and go to state 7
    +
    +  ! TIMES           [ reduce using rule 2 ]
    +  ! DIVIDE          [ reduce using rule 2 ]
    +  ! PLUS            [ shift and go to state 6 ]
    +  ! MINUS           [ shift and go to state 5 ]
    +
    +state 11
    +
    +    expression -> expression PLUS expression .
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    $               reduce using rule 1
    +    PLUS            reduce using rule 1
    +    MINUS           reduce using rule 1
    +    RPAREN          reduce using rule 1
    +    TIMES           shift and go to state 4
    +    DIVIDE          shift and go to state 7
    +
    +  ! TIMES           [ reduce using rule 1 ]
    +  ! DIVIDE          [ reduce using rule 1 ]
    +  ! PLUS            [ shift and go to state 6 ]
    +  ! MINUS           [ shift and go to state 5 ]
    +
    +state 12
    +
    +    expression -> expression DIVIDE expression .
    +    expression -> expression . PLUS expression
    +    expression -> expression . MINUS expression
    +    expression -> expression . TIMES expression
    +    expression -> expression . DIVIDE expression
    +
    +    $               reduce using rule 4
    +    PLUS            reduce using rule 4
    +    MINUS           reduce using rule 4
    +    TIMES           reduce using rule 4
    +    DIVIDE          reduce using rule 4
    +    RPAREN          reduce using rule 4
    +
    +  ! PLUS            [ shift and go to state 6 ]
    +  ! MINUS           [ shift and go to state 5 ]
    +  ! TIMES           [ shift and go to state 4 ]
    +  ! DIVIDE          [ shift and go to state 7 ]
    +
    +state 13
    +
    +    expression -> LPAREN expression RPAREN .
    +
    +    $               reduce using rule 6
    +    PLUS            reduce using rule 6
    +    MINUS           reduce using rule 6
    +    TIMES           reduce using rule 6
    +    DIVIDE          reduce using rule 6
    +    RPAREN          reduce using rule 6
    +
    +
    + +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules. Each state keeps track of the grammar +rules that might be in the process of being matched at that point. Within each +rule, the "." character indicates the current location of the parse +within that rule. In addition, the actions for each valid input token +are listed. When a shift/reduce or reduce/reduce conflict arises, +rules not selected are prefixed with an !. For example: + +
    +
    +  ! TIMES           [ reduce using rule 2 ]
    +  ! DIVIDE          [ reduce using rule 2 ]
    +  ! PLUS            [ shift and go to state 6 ]
    +  ! MINUS           [ shift and go to state 5 ]
    +
    +
    + +By looking at these rules (and with a little practice), you can usually track down the source +of most parsing conflicts. It should also be stressed that not all shift-reduce conflicts are +bad. However, the only way to be sure that they are resolved correctly is to look at parser.out. + +

    6.8 Syntax Error Handling

    + + +If you are creating a parser for production use, the handling of +syntax errors is important. As a general rule, you don't want a +parser to simply throw up its hands and stop at the first sign of +trouble. Instead, you want it to report the error, recover if possible, and +continue parsing so that all of the errors in the input get reported +to the user at once. This is the standard behavior found in compilers +for languages such as C, C++, and Java. + +In PLY, when a syntax error occurs during parsing, the error is immediately +detected (i.e., the parser does not read any more tokens beyond the +source of the error). However, at this point, the parser enters a +recovery mode that can be used to try and continue further parsing. +As a general rule, error recovery in LR parsers is a delicate +topic that involves ancient rituals and black-magic. The recovery mechanism +provided by yacc.py is comparable to Unix yacc so you may want to +consult a book like O'Reilly's "Lex and Yacc" for some of the finer details. + +

    +When a syntax error occurs, yacc.py performs the following steps: + +

      +
    1. On the first occurrence of an error, the user-defined p_error() function +is called with the offending token as an argument. However, if the syntax error is due to +reaching the end-of-file, p_error() is called with an + argument of None. +Afterwards, the parser enters +an "error-recovery" mode in which it will not make future calls to p_error() until it +has successfully shifted at least 3 tokens onto the parsing stack. + +

      +

    2. If no recovery action is taken in p_error(), the offending lookahead token is replaced +with a special error token. + +

      +

    3. If the offending lookahead token is already set to error, the top item of the parsing stack is +deleted. + +

      +

    4. If the entire parsing stack is unwound, the parser enters a restart state and attempts to start +parsing from its initial state. + +

      +

    5. If a grammar rule accepts error as a token, it will be +shifted onto the parsing stack. + +

      +

    6. If the top item of the parsing stack is error, lookahead tokens will be discarded until the +parser can successfully shift a new symbol or reduce a rule involving error. +
    + +

    6.8.1 Recovery and resynchronization with error rules

    + + +The most well-behaved approach for handling syntax errors is to write grammar rules that include the error +token. For example, suppose your language had a grammar rule for a print statement like this: + +
    +
    +def p_statement_print(p):
    +     'statement : PRINT expr SEMI'
    +     ...
    +
    +
    + +To account for the possibility of a bad expression, you might write an additional grammar rule like this: + +
    +
    +def p_statement_print_error(p):
    +     'statement : PRINT error SEMI'
    +     print("Syntax error in print statement. Bad expression")
    +
    +
    +
    + +In this case, the error token will match any sequence of +tokens that might appear up to the first semicolon that is +encountered. Once the semicolon is reached, the rule will be +invoked and the error token will go away. + +

    +This type of recovery is sometimes known as parser resynchronization. +The error token acts as a wildcard for any bad input text and +the token immediately following error acts as a +synchronization token. + +

    +It is important to note that the error token usually does not appear as the last token +on the right in an error rule. For example, a rule like the following is usually best avoided: + +

    +
    +def p_statement_print_error(p):
    +    'statement : PRINT error'
    +    print("Syntax error in print statement. Bad expression")
    +
    +
    + +This is because the first bad token encountered will cause the rule to +be reduced--which may make it difficult to recover if more bad tokens +immediately follow. + +

    6.8.2 Panic mode recovery

    + + +An alternative error recovery scheme is to enter a panic mode recovery in which tokens are +discarded to a point where the parser might be able to recover in some sensible manner. + +

    +Panic mode recovery is implemented entirely in the p_error() function. For example, this +function starts discarding tokens until it reaches a closing '}'. Then, it restarts the +parser in its initial state. + +

    +
    +def p_error(p):
    +    print("Whoa. You are seriously hosed.")
    +    if not p:
    +        print("End of File!")
    +        return
    +
    +    # Read ahead looking for a closing '}'
    +    while True:
    +        tok = parser.token()             # Get the next token
    +        if not tok or tok.type == 'RBRACE': 
    +            break
    +    parser.restart()
    +
    +
    + +

    +This function simply discards the bad token and tells the parser that the error was ok. + +

    +
    +def p_error(p):
    +    if p:
    +         print("Syntax error at token", p.type)
    +         # Just discard the token and tell the parser it's okay.
    +         parser.errok()
    +    else:
    +         print("Syntax error at EOF")
    +
    +
    + +

    +More information on these methods is as follows: +

    + +

    +

      +
    • parser.errok(). This resets the parser state so it doesn't think it's in error-recovery +mode. This will prevent an error token from being generated and will reset the internal +error counters so that the next syntax error will call p_error() again. + +

      +

    • parser.token(). This returns the next token on the input stream. + +

      +

    • parser.restart(). This discards the entire parsing stack and resets the parser +to its initial state. +
    + +

    +To supply the next lookahead token to the parser, p_error() can return a token. This might be +useful if trying to synchronize on special characters. For example: + +

    +
    +def p_error(p):
    +    # Read ahead looking for a terminating ";"
    +    while True:
    +        tok = parser.token()             # Get the next token
    +        if not tok or tok.type == 'SEMI': break
    +    parser.errok()
    +
    +    # Return SEMI to the parser as the next lookahead token
    +    return tok  
    +
    +
    + +

    +Keep in mind that in the above error handling functions, +parser is an instance of the parser created by +yacc(). You'll need to save this instance someplace in your +code so that you can refer to it during error handling. +

    + +

    6.8.3 Signalling an error from a production

    + + +If necessary, a production rule can manually force the parser to enter error recovery. This +is done by raising the SyntaxError exception like this: + +
    +
    +def p_production(p):
    +    'production : some production ...'
    +    raise SyntaxError
    +
    +
    + +The effect of raising SyntaxError is the same as if the last symbol shifted onto the +parsing stack was actually a syntax error. Thus, when you do this, the last symbol shifted is popped off +of the parsing stack and the current lookahead token is set to an error token. The parser +then enters error-recovery mode where it tries to reduce rules that can accept error tokens. +The steps that follow from this point are exactly the same as if a syntax error were detected and +p_error() were called. + +

    +One important aspect of manually setting an error is that the p_error() function will NOT be +called in this case. If you need to issue an error message, make sure you do it in the production that +raises SyntaxError. + +

    +Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc. + +

    6.8.4 When Do Syntax Errors Get Reported

    + + +

    +In most cases, yacc will handle errors as soon as a bad input token is +detected on the input. However, be aware that yacc may choose to +delay error handling until after it has reduced one or more grammar +rules first. This behavior might be unexpected, but it's related to +special states in the underlying parsing table known as "defaulted +states." A defaulted state is a parsing condition where the same +grammar rule will be reduced regardless of what valid token +comes next on the input. For such states, yacc chooses to go ahead +and reduce the grammar rule without reading the next input +token. If the next token is bad, yacc will eventually get around to reading it and +report a syntax error. It's just a little unusual in that you might +see some of your grammar rules firing immediately prior to the syntax +error. +

    + +

    +Usually, the delayed error reporting with defaulted states is harmless +(and there are other reasons for wanting PLY to behave in this way). +However, if you need to turn this behavior off for some reason, you +can clear the defaulted states table like this: +

    + +
    +
    +parser = yacc.yacc()
    +parser.defaulted_states = {}
    +
    +
    + +

    +Disabling defaulted states is not recommended if your grammar makes use +of embedded actions as described in Section 6.11.

    + +

    6.8.5 General comments on error handling

    + + +For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable +technique. This is because you can instrument the grammar to catch errors at selected places where it is relatively easy +to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want +to discard huge portions of the input text to find a valid restart point. + +

    6.9 Line Number and Position Tracking

    + + +Position tracking is often a tricky problem when writing compilers. +By default, PLY tracks the line number and position of all tokens. +This information is available using the following functions: + +
      +
    • p.lineno(num). Return the line number for symbol num +
    • p.lexpos(num). Return the lexing position for symbol num +
    + +For example: + +
    +
    +def p_expression(p):
    +    'expression : expression PLUS expression'
    +    line   = p.lineno(2)        # line number of the PLUS token
    +    index  = p.lexpos(2)        # Position of the PLUS token
    +
    +
    + +As an optional feature, yacc.py can automatically track line +numbers and positions for all of the grammar symbols as well. +However, this extra tracking requires extra processing and can +significantly slow down parsing. Therefore, it must be enabled by +passing the +tracking=True option to yacc.parse(). For example: + +
    +
    +yacc.parse(data,tracking=True)
    +
    +
    + +Once enabled, the lineno() and lexpos() methods work +for all grammar symbols. In addition, two additional methods can be +used: + +
      +
    • p.linespan(num). Return a tuple (startline,endline) with the starting and ending line number for symbol num. +
    • p.lexspan(num). Return a tuple (start,end) with the starting and ending positions for symbol num. +
    + +For example: + +
    +
    +def p_expression(p):
    +    'expression : expression PLUS expression'
    +    p.lineno(1)        # Line number of the left expression
    +    p.lineno(2)        # line number of the PLUS operator
    +    p.lineno(3)        # line number of the right expression
    +    ...
    +    start,end = p.linespan(3)    # Start,end lines of the right expression
    +    starti,endi = p.lexspan(3)   # Start,end positions of right expression
    +
    +
    +
    + +Note: The lexspan() function only returns the range of values up to the start of the last grammar symbol. + +

    +Although it may be convenient for PLY to track position information on +all grammar symbols, this is often unnecessary. For example, if you +are merely using line number information in an error message, you can +often just key off of a specific token in the grammar rule. For +example: + +

    +
    +def p_bad_func(p):
    +    'funccall : fname LPAREN error RPAREN'
    +    # Line number reported from LPAREN token
    +    print("Bad function call at line", p.lineno(2))
    +
    +
    + +

    +Similarly, you may get better parsing performance if you only +selectively propagate line number information where it's needed using +the p.set_lineno() method. For example: + +

    +
    +def p_fname(p):
    +    'fname : ID'
    +    p[0] = p[1]
    +    p.set_lineno(0,p.lineno(1))
    +
    +
    + +PLY doesn't retain line number information from rules that have already been +parsed. If you are building an abstract syntax tree and need to have line numbers, +you should make sure that the line numbers appear in the tree itself. + +

    6.10 AST Construction

    + + +yacc.py provides no special functions for constructing an +abstract syntax tree. However, such construction is easy enough to do +on your own. + +

    A minimal way to construct a tree is to simply create and +propagate a tuple or list in each grammar rule function. There +are many possible ways to do this, but one example would be something +like this: + +

    +
    +def p_expression_binop(p):
    +    '''expression : expression PLUS expression
    +                  | expression MINUS expression
    +                  | expression TIMES expression
    +                  | expression DIVIDE expression'''
    +
    +    p[0] = ('binary-expression',p[2],p[1],p[3])
    +
    +def p_expression_group(p):
    +    'expression : LPAREN expression RPAREN'
    +    p[0] = ('group-expression',p[2])
    +
    +def p_expression_number(p):
    +    'expression : NUMBER'
    +    p[0] = ('number-expression',p[1])
    +
    +
    + +

    +Another approach is to create a set of data structures for different +kinds of abstract syntax tree nodes and assign nodes to p[0] +in each rule. For example: + +

    +
    +class Expr: pass
    +
    +class BinOp(Expr):
    +    def __init__(self,left,op,right):
    +        self.type = "binop"
    +        self.left = left
    +        self.right = right
    +        self.op = op
    +
    +class Number(Expr):
    +    def __init__(self,value):
    +        self.type = "number"
    +        self.value = value
    +
    +def p_expression_binop(p):
    +    '''expression : expression PLUS expression
    +                  | expression MINUS expression
    +                  | expression TIMES expression
    +                  | expression DIVIDE expression'''
    +
    +    p[0] = BinOp(p[1],p[2],p[3])
    +
    +def p_expression_group(p):
    +    'expression : LPAREN expression RPAREN'
    +    p[0] = p[2]
    +
    +def p_expression_number(p):
    +    'expression : NUMBER'
    +    p[0] = Number(p[1])
    +
    +
    + +The advantage to this approach is that it may make it easier to attach more complicated +semantics, type checking, code generation, and other features to the node classes. + +

    +To simplify tree traversal, it may make sense to pick a very generic +tree structure for your parse tree nodes. For example: + +

    +
    +class Node:
    +    def __init__(self,type,children=None,leaf=None):
    +         self.type = type
    +         if children:
    +              self.children = children
    +         else:
    +              self.children = [ ]
    +         self.leaf = leaf
    +	 
    +def p_expression_binop(p):
    +    '''expression : expression PLUS expression
    +                  | expression MINUS expression
    +                  | expression TIMES expression
    +                  | expression DIVIDE expression'''
    +
    +    p[0] = Node("binop", [p[1],p[3]], p[2])
    +
    +
    + +

    6.11 Embedded Actions

    + + +The parsing technique used by yacc only allows actions to be executed at the end of a rule. For example, +suppose you have a rule like this: + +
    +
    +def p_foo(p):
    +    "foo : A B C D"
    +    print("Parsed a foo", p[1],p[2],p[3],p[4])
    +
    +
    + +

    +In this case, the supplied action code only executes after all of the +symbols A, B, C, and D have been +parsed. Sometimes, however, it is useful to execute small code +fragments during intermediate stages of parsing. For example, suppose +you wanted to perform some action immediately after A has +been parsed. To do this, write an empty rule like this: + +

    +
    +def p_foo(p):
    +    "foo : A seen_A B C D"
    +    print("Parsed a foo", p[1],p[3],p[4],p[5])
    +    print("seen_A returned", p[2])
    +
    +def p_seen_A(p):
    +    "seen_A :"
    +    print("Saw an A = ", p[-1])   # Access grammar symbol to left
    +    p[0] = some_value            # Assign value to seen_A
    +
    +
    +
    + +

    +In this example, the empty seen_A rule executes immediately +after A is shifted onto the parsing stack. Within this +rule, p[-1] refers to the symbol on the stack that appears +immediately to the left of the seen_A symbol. In this case, +it would be the value of A in the foo rule +immediately above. Like other rules, a value can be returned from an +embedded action by simply assigning it to p[0]. + +

    +The use of embedded actions can sometimes introduce extra shift/reduce conflicts. For example, +this grammar has no conflicts: + +

    +
    +def p_foo(p):
    +    """foo : abcd
    +           | abcx"""
    +
    +def p_abcd(p):
    +    "abcd : A B C D"
    +
    +def p_abcx(p):
    +    "abcx : A B C X"
    +
    +
    + +However, if you insert an embedded action into one of the rules like this, + +
    +
    +def p_foo(p):
    +    """foo : abcd
    +           | abcx"""
    +
    +def p_abcd(p):
    +    "abcd : A B C D"
    +
    +def p_abcx(p):
    +    "abcx : A B seen_AB C X"
    +
    +def p_seen_AB(p):
    +    "seen_AB :"
    +
    +
    + +an extra shift-reduce conflict will be introduced. This conflict is +caused by the fact that the same symbol C appears next in +both the abcd and abcx rules. The parser can either +shift the symbol (abcd rule) or reduce the empty +rule seen_AB (abcx rule). + +

    +A common use of embedded rules is to control other aspects of parsing +such as scoping of local variables. For example, if you were parsing C code, you might +write code like this: + +

    +
    +def p_statements_block(p):
    +    "statements : LBRACE new_scope statements RBRACE"
    +    # Action code
    +    ...
    +    pop_scope()        # Return to previous scope
    +
    +def p_new_scope(p):
    +    "new_scope :"
    +    # Create a new scope for local variables
    +    s = new_scope()
    +    push_scope(s)
    +    ...
    +
    +
    + +In this case, the embedded action new_scope executes +immediately after a LBRACE ({) symbol is parsed. +This might adjust internal symbol tables and other aspects of the +parser. Upon completion of the rule statements_block, code +might undo the operations performed in the embedded action +(e.g., pop_scope()). + +

    6.12 Miscellaneous Yacc Notes

    + + +
      + +
    • By default, yacc.py relies on lex.py for tokenizing. However, an alternative tokenizer +can be supplied as follows: + +
      +
      +result = yacc.parse(lexer=x)
      +
      +
      +in this case, x must be a Lexer object that minimally has a x.token() method for retrieving the next +token. If an input string is given to yacc.parse(), the lexer must also have an x.input() method. + +

      +

    • By default, yacc generates tables in debugging mode (which produces the parser.out file and other output). +To disable this, use + +
      +
      +parser = yacc.yacc(debug=False)
      +
      +
      + +

      +

    • To change the name of the parsetab.py file, use: + +
      +
      +parser = yacc.yacc(tabmodule="foo")
      +
      +
      + +

      +Normally, the parsetab.py file is placed into the same directory as +the module where the parser is defined. If you want it to go somewhere else, you can +give an absolute package name for tabmodule instead. In that case, the +tables will be written there. +

      + +

      +

    • To change the directory in which the parsetab.py file (and other output files) are written, use: +
      +
      +parser = yacc.yacc(tabmodule="foo",outputdir="somedirectory")
      +
      +
      + +

      +Note: Be aware that unless the directory specified is also on Python's path (sys.path), subsequent +imports of the table file will fail. As a general rule, it's better to specify a destination using the +tabmodule argument instead of directly specifying a directory using the outputdir argument. +

      + +

      +

    • To prevent yacc from generating any kind of parser table file, use: +
      +
      +parser = yacc.yacc(write_tables=False)
      +
      +
      + +Note: If you disable table generation, yacc() will regenerate the parsing tables +each time it runs (which may take a while depending on how large your grammar is). + +

      +

    • To print copious amounts of debugging during parsing, use: + +
      +
      +result = yacc.parse(debug=True)
      +
      +
      + +

      +

    • Since the generation of the LALR tables is relatively expensive, previously generated tables are +cached and reused if possible. The decision to regenerate the tables is determined by taking an MD5 +checksum of all grammar rules and precedence rules. Only in the event of a mismatch are the tables regenerated. + +

      +It should be noted that table generation is reasonably efficient, even for grammars that involve around 100 rules +and several hundred states.

    • + + +

      +

    • Since LR parsing is driven by tables, the performance of the parser is largely independent of the +size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules. +
    • +

      + +

      +

    • yacc() also allows parsers to be defined as classes and as closures (see the section on alternative specification of +lexers). However, be aware that only one parser may be defined in a single module (source file). There are various +error checks and validation steps that may issue confusing error messages if you try to define multiple parsers +in the same source file. +
    • +

      + +

      +

    • Decorators of production rules have to update the wrapped function's line number by setting wrapper.co_firstlineno = func.__code__.co_firstlineno. For example: + +
      +
      +from functools import wraps
      +from nodes import Collection
      +
      +
      +def strict(*types):
      +    def decorate(func):
      +        @wraps(func)
      +        def wrapper(p):
      +            func(p)
      +            if not isinstance(p[0], types):
      +                raise TypeError
      +
      +        wrapper.co_firstlineno = func.__code__.co_firstlineno
      +        return wrapper
      +
      +    return decorate
      +
      +@strict(Collection)
      +def p_collection(p):
      +    """
      +    collection  : sequence
      +                | map
      +    """
      +    p[0] = p[1]
      +
      +
      + +
    • +

      + + +
    +

    + + +

    7. Multiple Parsers and Lexers

    + + +In advanced parsing applications, you may want to have multiple +parsers and lexers. + +

    +As a general rule, this isn't a problem. However, to make it work, +you need to carefully make sure everything gets hooked up correctly. +First, make sure you save the objects returned by lex() and +yacc(). For example: + +

    +
    +lexer  = lex.lex()       # Return lexer object
    +parser = yacc.yacc()     # Return parser object
    +
    +
    + +Next, when parsing, make sure you give the parse() function a reference to the lexer it +should be using. For example: + +
    +
    +parser.parse(text,lexer=lexer)
    +
    +
    + +If you forget to do this, the parser will use the last lexer +created--which is not always what you want. + +

    +Within lexer and parser rule functions, these objects are also +available. In the lexer, the "lexer" attribute of a token refers to +the lexer object that triggered the rule. For example: + +

    +
    +def t_NUMBER(t):
    +   r'\d+'
    +   ...
    +   print(t.lexer)           # Show lexer object
    +
    +
    + +In the parser, the "lexer" and "parser" attributes refer to the lexer +and parser objects respectively. + +
    +
    +def p_expr_plus(p):
    +   'expr : expr PLUS expr'
    +   ...
    +   print(p.parser)          # Show parser object
    +   print(p.lexer)           # Show lexer object
    +
    +
    + +If necessary, arbitrary attributes can be attached to the lexer or parser object. +For example, if you wanted to have different parsing modes, you could attach a mode +attribute to the parser object and look at it later. + +

    8. Using Python's Optimized Mode

    + + +Because PLY uses information from doc-strings, parsing and lexing +information must be gathered while running the Python interpreter in +normal mode (i.e., not with the -O or -OO options). However, if you +specify optimized mode like this: + +
    +
    +lex.lex(optimize=1)
    +yacc.yacc(optimize=1)
    +
    +
    + +then PLY can later be used when Python runs in optimized mode. To make this work, +make sure you first run Python in normal mode. Once the lexing and parsing tables +have been generated the first time, run Python in optimized mode. PLY will use +the tables without the need for doc strings. + +

    +Beware: running PLY in optimized mode disables a lot of error +checking. You should only do this when your project has stabilized +and you don't need to do any debugging. One of the purposes of +optimized mode is to substantially decrease the startup time of +your compiler (by assuming that everything is already properly +specified and works). + +

    9. Advanced Debugging

    + + +

    +Debugging a compiler is typically not an easy task. PLY provides some +advanced diagnostic capabilities through the use of Python's +logging module. The next two sections describe this: + +

    9.1 Debugging the lex() and yacc() commands

    + + +

    +Both the lex() and yacc() commands have a debugging +mode that can be enabled using the debug flag. For example: + +

    +
    +lex.lex(debug=True)
    +yacc.yacc(debug=True)
    +
    +
    + +Normally, the output produced by debugging is routed to either +standard error or, in the case of yacc(), to a file +parser.out. This output can be more carefully controlled +by supplying a logging object. Here is an example that adds +information about where different debugging messages are coming from: + +
    +
    +# Set up a logging object
    +import logging
    +logging.basicConfig(
    +    level = logging.DEBUG,
    +    filename = "parselog.txt",
    +    filemode = "w",
    +    format = "%(filename)10s:%(lineno)4d:%(message)s"
    +)
    +log = logging.getLogger()
    +
    +lex.lex(debug=True,debuglog=log)
    +yacc.yacc(debug=True,debuglog=log)
    +
    +
    + +If you supply a custom logger, the amount of debugging +information produced can be controlled by setting the logging level. +Typically, debugging messages are either issued at the DEBUG, +INFO, or WARNING levels. + +

    +PLY's error messages and warnings are also produced using the logging +interface. This can be controlled by passing a logging object +using the errorlog parameter. + +

    +
    +lex.lex(errorlog=log)
    +yacc.yacc(errorlog=log)
    +
    +
    + +If you want to completely silence warnings, you can either pass in a +logging object with an appropriate filter level or use the NullLogger +object defined in either lex or yacc. For example: + +
    +
    +yacc.yacc(errorlog=yacc.NullLogger())
    +
    +
    + +

    9.2 Run-time Debugging

    + + +

    +To enable run-time debugging of a parser, use the debug option to parse. This +option can either be an integer (which simply turns debugging on or off) or an instance +of a logger object. For example: + +

    +
    +log = logging.getLogger()
    +parser.parse(input,debug=log)
    +
    +
    + +If a logging object is passed, you can use its filtering level to control how much +output gets generated. The INFO level is used to produce information +about rule reductions. The DEBUG level will show information about the +parsing stack, token shifts, and other details. The ERROR level shows information +related to parsing errors. + +

    +For very complicated problems, you should pass in a logging object that +redirects to a file where you can more easily inspect the output after +execution. + +

    10. Packaging Advice

    + + +

    +If you are distributing a package that makes use of PLY, you should +spend a few moments thinking about how you want to handle the files +that are automatically generated. For example, the parsetab.py +file generated by the yacc() function.

    + +

    +Starting in PLY-3.6, the table files are created in the same directory +as the file where a parser is defined. This means that the +parsetab.py file will live side-by-side with your parser +specification. In terms of packaging, this is probably the easiest and +most sane approach to manage. You don't need to give yacc() +any extra arguments and it should just "work."

    + +

    +One concern is the management of the parsetab.py file itself. +For example, should you have this file checked into version control (e.g., GitHub), +should it be included in a package distribution as a normal file, or should you +just let PLY generate it automatically for the user when they install your package? +

    + +

    +As of PLY-3.6, the parsetab.py file should be compatible across all versions +of Python including Python 2 and 3. Thus, a table file generated in Python 2 should +work fine if it's used on Python 3. Because of this, it should be relatively harmless +to distribute the parsetab.py file yourself if you need to. However, be aware +that older/newer versions of PLY may try to regenerate the file if there are future +enhancements or changes to its format. +

    + +

    +To make the generation of table files easier for the purposes of installation, you might +want to make your parser files executable using the -m option or similar. For +example: +

    + +
    +
    +# calc.py
    +...
    +...
    +def make_parser():
    +    parser = yacc.yacc()
    +    return parser
    +
    +if __name__ == '__main__':
    +    make_parser()
    +
    +
    + +

    +You can then use a command such as python -m calc to generate the tables. Alternatively, +a setup.py script can import the module and use make_parser() to create the +parsing tables. +

    + +

    +If you're willing to sacrifice a little startup time, you can also instruct PLY to never write the +tables using yacc.yacc(write_tables=False, debug=False). In this mode, PLY will regenerate +the parsing tables from scratch each time. For a small grammar, you probably won't notice. For a +large grammar, you should probably reconsider--the parsing tables are meant to dramatically speed up this process. +

    + +

    +During operation, it is normal for PLY to produce diagnostic error +messages (usually printed to standard error). These are generated +entirely using the logging module. If you want to redirect +these messages or silence them, you can provide your own logging +object to yacc(). For example: +

    + +
    +
    +import logging
    +log = logging.getLogger('ply')
    +...
    +parser = yacc.yacc(errorlog=log)
    +
    +
    + +

    11. Where to go from here?

    + + +The examples directory of the PLY distribution contains several simple examples. Please consult a +compilers textbook for the theory and underlying implementation details or LR parsing. + + + + + + + + + + diff --git a/example/BASIC/README b/example/BASIC/README new file mode 100644 index 000000000..be24a3005 --- /dev/null +++ b/example/BASIC/README @@ -0,0 +1,79 @@ +Inspired by a September 14, 2006 Salon article "Why Johnny Can't Code" by +David Brin (http://www.salon.com/tech/feature/2006/09/14/basic/index.html), +I thought that a fully working BASIC interpreter might be an interesting, +if not questionable, PLY example. Uh, okay, so maybe it's just a bad idea, +but in any case, here it is. + +In this example, you'll find a rough implementation of 1964 Dartmouth BASIC +as described in the manual at: + + http://www.bitsavers.org/pdf/dartmouth/BASIC_Oct64.pdf + +See also: + + http://en.wikipedia.org/wiki/Dartmouth_BASIC + +This dialect is downright primitive---there are no string variables +and no facilities for interactive input. Moreover, subroutines and functions +are brain-dead even more than they usually are for BASIC. Of course, +the GOTO statement is provided. + +Nevertheless, there are a few interesting aspects of this example: + + - It illustrates a fully working interpreter including lexing, parsing, + and interpretation of instructions. + + - The parser shows how to catch and report various kinds of parsing + errors in a more graceful way. + + - The example both parses files (supplied on command line) and + interactive input entered line by line. + + - It shows how you might represent parsed information. In this case, + each BASIC statement is encoded into a Python tuple containing the + statement type and parameters. These tuples are then stored in + a dictionary indexed by program line numbers. + + - Even though it's just BASIC, the parser contains more than 80 + rules and 150 parsing states. 
Thus, it's a little more meaty than + the calculator example. + +To use the example, run it as follows: + + % python basic.py hello.bas + HELLO WORLD + % + +or use it interactively: + + % python basic.py + [BASIC] 10 PRINT "HELLO WORLD" + [BASIC] 20 END + [BASIC] RUN + HELLO WORLD + [BASIC] + +The following files are defined: + + basic.py - High level script that controls everything + basiclex.py - BASIC tokenizer + basparse.py - BASIC parser + basinterp.py - BASIC interpreter that runs parsed programs. + +In addition, a number of sample BASIC programs (.bas suffix) are +provided. These were taken out of the Dartmouth manual. + +Disclaimer: I haven't spent a ton of time testing this and it's likely that +I've skimped here and there on a few finer details (e.g., strictly enforcing +variable naming rules). However, the interpreter seems to be able to run +the examples in the BASIC manual. + +Have fun! + +-Dave + + + + + + diff --git a/example/BASIC/basic.py b/example/BASIC/basic.py new file mode 100644 index 000000000..70ac9e7c7 --- /dev/null +++ b/example/BASIC/basic.py @@ -0,0 +1,65 @@ +# An implementation of Dartmouth BASIC (1964) +# + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. +# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data) + if not prog: + raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. 
+ +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: + continue + line += "\n" + prog = basparse.parse(line) + if not prog: + continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new() diff --git a/example/BASIC/basiclex.py b/example/BASIC/basiclex.py new file mode 100644 index 000000000..4151f4c34 --- /dev/null +++ b/example/BASIC/basiclex.py @@ -0,0 +1,61 @@ +# An implementation of Dartmouth BASIC (1964) + +from ply import * + +keywords = ( + 'LET', 'READ', 'DATA', 'PRINT', 'GOTO', 'IF', 'THEN', 'FOR', 'NEXT', 'TO', 'STEP', + 'END', 'STOP', 'DEF', 'GOSUB', 'DIM', 'REM', 'RETURN', 'RUN', 'LIST', 'NEW', +) + +tokens = keywords + ( + 'EQUALS', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'POWER', + 'LPAREN', 'RPAREN', 'LT', 'LE', 'GT', 'GE', 'NE', + 'COMMA', 'SEMI', 'INTEGER', 'FLOAT', 'STRING', + 'ID', 'NEWLINE' +) + +t_ignore = ' \t' + + +def t_REM(t): + r'REM .*' + return t + + +def t_ID(t): + r'[A-Z][A-Z0-9]*' + if t.value in keywords: + t.type = t.value + return t + +t_EQUALS = r'=' +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_POWER = r'\^' +t_DIVIDE = r'/' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LT = r'<' +t_LE = r'<=' +t_GT = r'>' +t_GE = r'>=' +t_NE = r'<>' +t_COMMA = r'\,' +t_SEMI = r';' +t_INTEGER = r'\d+' +t_FLOAT = r'((\d*\.\d+)(E[\+-]?\d+)?|([1-9]\d*E[\+-]?\d+))' +t_STRING = r'\".*?\"' + + +def t_NEWLINE(t): + r'\n' + t.lexer.lineno += 1 + return t + + +def t_error(t): + print("Illegal character %s" % t.value[0]) + t.lexer.skip(1) + +lex.lex(debug=0) diff --git a/example/BASIC/basiclog.py b/example/BASIC/basiclog.py new file mode 100644 index 000000000..9dcc7feda --- /dev/null +++ b/example/BASIC/basiclog.py @@ -0,0 +1,73 @@ +# An implementation of Dartmouth BASIC (1964) +# 
+ +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import logging +logging.basicConfig( + level=logging.INFO, + filename="parselog.txt", + filemode="w" +) +log = logging.getLogger() + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. +# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data, debug=log) + if not prog: + raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. + +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: + continue + line += "\n" + prog = basparse.parse(line, debug=log) + if not prog: + continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new() diff --git a/example/BASIC/basinterp.py b/example/BASIC/basinterp.py new file mode 100644 index 000000000..67762c797 --- /dev/null +++ b/example/BASIC/basinterp.py @@ -0,0 +1,496 @@ +# This file provides the runtime support for running a basic program +# Assumes the program has been parsed using basparse.py + +import sys +import math +import random + + +class BasicInterpreter: + + # Initialize the interpreter. 
prog is a dictionary + # containing (line,statement) mappings + def __init__(self, prog): + self.prog = prog + + self.functions = { # Built-in function table + 'SIN': lambda z: math.sin(self.eval(z)), + 'COS': lambda z: math.cos(self.eval(z)), + 'TAN': lambda z: math.tan(self.eval(z)), + 'ATN': lambda z: math.atan(self.eval(z)), + 'EXP': lambda z: math.exp(self.eval(z)), + 'ABS': lambda z: abs(self.eval(z)), + 'LOG': lambda z: math.log(self.eval(z)), + 'SQR': lambda z: math.sqrt(self.eval(z)), + 'INT': lambda z: int(self.eval(z)), + 'RND': lambda z: random.random() + } + + # Collect all data statements + def collect_data(self): + self.data = [] + for lineno in self.stat: + if self.prog[lineno][0] == 'DATA': + self.data = self.data + self.prog[lineno][1] + self.dc = 0 # Initialize the data counter + + # Check for end statements + def check_end(self): + has_end = 0 + for lineno in self.stat: + if self.prog[lineno][0] == 'END' and not has_end: + has_end = lineno + if not has_end: + print("NO END INSTRUCTION") + self.error = 1 + return + if has_end != lineno: + print("END IS NOT LAST") + self.error = 1 + + # Check loops + def check_loops(self): + for pc in range(len(self.stat)): + lineno = self.stat[pc] + if self.prog[lineno][0] == 'FOR': + forinst = self.prog[lineno] + loopvar = forinst[1] + for i in range(pc + 1, len(self.stat)): + if self.prog[self.stat[i]][0] == 'NEXT': + nextvar = self.prog[self.stat[i]][1] + if nextvar != loopvar: + continue + self.loopend[pc] = i + break + else: + print("FOR WITHOUT NEXT AT LINE %s" % self.stat[pc]) + self.error = 1 + + # Evaluate an expression + def eval(self, expr): + etype = expr[0] + if etype == 'NUM': + return expr[1] + elif etype == 'GROUP': + return self.eval(expr[1]) + elif etype == 'UNARY': + if expr[1] == '-': + return -self.eval(expr[2]) + elif etype == 'BINOP': + if expr[1] == '+': + return self.eval(expr[2]) + self.eval(expr[3]) + elif expr[1] == '-': + return self.eval(expr[2]) - self.eval(expr[3]) + elif expr[1] 
== '*': + return self.eval(expr[2]) * self.eval(expr[3]) + elif expr[1] == '/': + return float(self.eval(expr[2])) / self.eval(expr[3]) + elif expr[1] == '^': + return abs(self.eval(expr[2]))**self.eval(expr[3]) + elif etype == 'VAR': + var, dim1, dim2 = expr[1] + if not dim1 and not dim2: + if var in self.vars: + return self.vars[var] + else: + print("UNDEFINED VARIABLE %s AT LINE %s" % + (var, self.stat[self.pc])) + raise RuntimeError + # May be a list lookup or a function evaluation + if dim1 and not dim2: + if var in self.functions: + # A function + return self.functions[var](dim1) + else: + # A list evaluation + if var in self.lists: + dim1val = self.eval(dim1) + if dim1val < 1 or dim1val > len(self.lists[var]): + print("LIST INDEX OUT OF BOUNDS AT LINE %s" % + self.stat[self.pc]) + raise RuntimeError + return self.lists[var][dim1val - 1] + if dim1 and dim2: + if var in self.tables: + dim1val = self.eval(dim1) + dim2val = self.eval(dim2) + if dim1val < 1 or dim1val > len(self.tables[var]) or dim2val < 1 or dim2val > len(self.tables[var][0]): + print("TABLE INDEX OUT OUT BOUNDS AT LINE %s" % + self.stat[self.pc]) + raise RuntimeError + return self.tables[var][dim1val - 1][dim2val - 1] + print("UNDEFINED VARIABLE %s AT LINE %s" % + (var, self.stat[self.pc])) + raise RuntimeError + + # Evaluate a relational expression + def releval(self, expr): + etype = expr[1] + lhs = self.eval(expr[2]) + rhs = self.eval(expr[3]) + if etype == '<': + if lhs < rhs: + return 1 + else: + return 0 + + elif etype == '<=': + if lhs <= rhs: + return 1 + else: + return 0 + + elif etype == '>': + if lhs > rhs: + return 1 + else: + return 0 + + elif etype == '>=': + if lhs >= rhs: + return 1 + else: + return 0 + + elif etype == '=': + if lhs == rhs: + return 1 + else: + return 0 + + elif etype == '<>': + if lhs != rhs: + return 1 + else: + return 0 + + # Assignment + def assign(self, target, value): + var, dim1, dim2 = target + if not dim1 and not dim2: + self.vars[var] = 
self.eval(value) + elif dim1 and not dim2: + # List assignment + dim1val = self.eval(dim1) + if not var in self.lists: + self.lists[var] = [0] * 10 + + if dim1val > len(self.lists[var]): + print ("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) + raise RuntimeError + self.lists[var][dim1val - 1] = self.eval(value) + elif dim1 and dim2: + dim1val = self.eval(dim1) + dim2val = self.eval(dim2) + if not var in self.tables: + temp = [0] * 10 + v = [] + for i in range(10): + v.append(temp[:]) + self.tables[var] = v + # Variable already exists + if dim1val > len(self.tables[var]) or dim2val > len(self.tables[var][0]): + print("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) + raise RuntimeError + self.tables[var][dim1val - 1][dim2val - 1] = self.eval(value) + + # Change the current line number + def goto(self, linenum): + if not linenum in self.prog: + print("UNDEFINED LINE NUMBER %d AT LINE %d" % + (linenum, self.stat[self.pc])) + raise RuntimeError + self.pc = self.stat.index(linenum) + + # Run it + def run(self): + self.vars = {} # All variables + self.lists = {} # List variables + self.tables = {} # Tables + self.loops = [] # Currently active loops + self.loopend = {} # Mapping saying where loops end + self.gosub = None # Gosub return point (if any) + self.error = 0 # Indicates program error + + self.stat = list(self.prog) # Ordered list of all line numbers + self.stat.sort() + self.pc = 0 # Current program counter + + # Processing prior to running + + self.collect_data() # Collect all of the data statements + self.check_end() + self.check_loops() + + if self.error: + raise RuntimeError + + while 1: + line = self.stat[self.pc] + instr = self.prog[line] + + op = instr[0] + + # END and STOP statements + if op == 'END' or op == 'STOP': + break # We're done + + # GOTO statement + elif op == 'GOTO': + newline = instr[1] + self.goto(newline) + continue + + # PRINT statement + elif op == 'PRINT': + plist = instr[1] + out = "" + for label, val in plist: + if out: 
+ out += ' ' * (15 - (len(out) % 15)) + out += label + if val: + if label: + out += " " + eval = self.eval(val) + out += str(eval) + sys.stdout.write(out) + end = instr[2] + if not (end == ',' or end == ';'): + sys.stdout.write("\n") + if end == ',': + sys.stdout.write(" " * (15 - (len(out) % 15))) + if end == ';': + sys.stdout.write(" " * (3 - (len(out) % 3))) + + # LET statement + elif op == 'LET': + target = instr[1] + value = instr[2] + self.assign(target, value) + + # READ statement + elif op == 'READ': + for target in instr[1]: + if self.dc < len(self.data): + value = ('NUM', self.data[self.dc]) + self.assign(target, value) + self.dc += 1 + else: + # No more data. Program ends + return + elif op == 'IF': + relop = instr[1] + newline = instr[2] + if (self.releval(relop)): + self.goto(newline) + continue + + elif op == 'FOR': + loopvar = instr[1] + initval = instr[2] + finval = instr[3] + stepval = instr[4] + + # Check to see if this is a new loop + if not self.loops or self.loops[-1][0] != self.pc: + # Looks like a new loop. Make the initial assignment + newvalue = initval + self.assign((loopvar, None, None), initval) + if not stepval: + stepval = ('NUM', 1) + stepval = self.eval(stepval) # Evaluate step here + self.loops.append((self.pc, stepval)) + else: + # It's a repeat of the previous loop + # Update the value of the loop variable according to the + # step + stepval = ('NUM', self.loops[-1][1]) + newvalue = ( + 'BINOP', '+', ('VAR', (loopvar, None, None)), stepval) + + if self.loops[-1][1] < 0: + relop = '>=' + else: + relop = '<=' + if not self.releval(('RELOP', relop, newvalue, finval)): + # Loop is done. 
Jump to the NEXT + self.pc = self.loopend[self.pc] + self.loops.pop() + else: + self.assign((loopvar, None, None), newvalue) + + elif op == 'NEXT': + if not self.loops: + print("NEXT WITHOUT FOR AT LINE %s" % line) + return + + nextvar = instr[1] + self.pc = self.loops[-1][0] + loopinst = self.prog[self.stat[self.pc]] + forvar = loopinst[1] + if nextvar != forvar: + print("NEXT DOESN'T MATCH FOR AT LINE %s" % line) + return + continue + elif op == 'GOSUB': + newline = instr[1] + if self.gosub: + print("ALREADY IN A SUBROUTINE AT LINE %s" % line) + return + self.gosub = self.stat[self.pc] + self.goto(newline) + continue + + elif op == 'RETURN': + if not self.gosub: + print("RETURN WITHOUT A GOSUB AT LINE %s" % line) + return + self.goto(self.gosub) + self.gosub = None + + elif op == 'FUNC': + fname = instr[1] + pname = instr[2] + expr = instr[3] + + def eval_func(pvalue, name=pname, self=self, expr=expr): + self.assign((pname, None, None), pvalue) + return self.eval(expr) + self.functions[fname] = eval_func + + elif op == 'DIM': + for vname, x, y in instr[1]: + if y == 0: + # Single dimension variable + self.lists[vname] = [0] * x + else: + # Double dimension variable + temp = [0] * y + v = [] + for i in range(x): + v.append(temp[:]) + self.tables[vname] = v + + self.pc += 1 + + # Utility functions for program listing + def expr_str(self, expr): + etype = expr[0] + if etype == 'NUM': + return str(expr[1]) + elif etype == 'GROUP': + return "(%s)" % self.expr_str(expr[1]) + elif etype == 'UNARY': + if expr[1] == '-': + return "-" + str(expr[2]) + elif etype == 'BINOP': + return "%s %s %s" % (self.expr_str(expr[2]), expr[1], self.expr_str(expr[3])) + elif etype == 'VAR': + return self.var_str(expr[1]) + + def relexpr_str(self, expr): + return "%s %s %s" % (self.expr_str(expr[2]), expr[1], self.expr_str(expr[3])) + + def var_str(self, var): + varname, dim1, dim2 = var + if not dim1 and not dim2: + return varname + if dim1 and not dim2: + return "%s(%s)" % (varname, 
self.expr_str(dim1)) + return "%s(%s,%s)" % (varname, self.expr_str(dim1), self.expr_str(dim2)) + + # Create a program listing + def list(self): + stat = list(self.prog) # Ordered list of all line numbers + stat.sort() + for line in stat: + instr = self.prog[line] + op = instr[0] + if op in ['END', 'STOP', 'RETURN']: + print("%s %s" % (line, op)) + continue + elif op == 'REM': + print("%s %s" % (line, instr[1])) + elif op == 'PRINT': + _out = "%s %s " % (line, op) + first = 1 + for p in instr[1]: + if not first: + _out += ", " + if p[0] and p[1]: + _out += '"%s"%s' % (p[0], self.expr_str(p[1])) + elif p[1]: + _out += self.expr_str(p[1]) + else: + _out += '"%s"' % (p[0],) + first = 0 + if instr[2]: + _out += instr[2] + print(_out) + elif op == 'LET': + print("%s LET %s = %s" % + (line, self.var_str(instr[1]), self.expr_str(instr[2]))) + elif op == 'READ': + _out = "%s READ " % line + first = 1 + for r in instr[1]: + if not first: + _out += "," + _out += self.var_str(r) + first = 0 + print(_out) + elif op == 'IF': + print("%s IF %s THEN %d" % + (line, self.relexpr_str(instr[1]), instr[2])) + elif op == 'GOTO' or op == 'GOSUB': + print("%s %s %s" % (line, op, instr[1])) + elif op == 'FOR': + _out = "%s FOR %s = %s TO %s" % ( + line, instr[1], self.expr_str(instr[2]), self.expr_str(instr[3])) + if instr[4]: + _out += " STEP %s" % (self.expr_str(instr[4])) + print(_out) + elif op == 'NEXT': + print("%s NEXT %s" % (line, instr[1])) + elif op == 'FUNC': + print("%s DEF %s(%s) = %s" % + (line, instr[1], instr[2], self.expr_str(instr[3]))) + elif op == 'DIM': + _out = "%s DIM " % line + first = 1 + for vname, x, y in instr[1]: + if not first: + _out += "," + first = 0 + if y == 0: + _out += "%s(%d)" % (vname, x) + else: + _out += "%s(%d,%d)" % (vname, x, y) + + print(_out) + elif op == 'DATA': + _out = "%s DATA " % line + first = 1 + for v in instr[1]: + if not first: + _out += "," + first = 0 + _out += v + print(_out) + + # Erase the current program + def new(self): + 
self.prog = {} + + # Insert statements + def add_statements(self, prog): + for line, stat in prog.items(): + self.prog[line] = stat + + # Delete a statement + def del_line(self, lineno): + try: + del self.prog[lineno] + except KeyError: + pass diff --git a/example/BASIC/basparse.py b/example/BASIC/basparse.py new file mode 100644 index 000000000..d610c7d90 --- /dev/null +++ b/example/BASIC/basparse.py @@ -0,0 +1,474 @@ +# An implementation of Dartmouth BASIC (1964) +# + +from ply import * +import basiclex + +tokens = basiclex.tokens + +precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('left', 'POWER'), + ('right', 'UMINUS') +) + +# A BASIC program is a series of statements. We represent the program as a +# dictionary of tuples indexed by line number. + + +def p_program(p): + '''program : program statement + | statement''' + + if len(p) == 2 and p[1]: + p[0] = {} + line, stat = p[1] + p[0][line] = stat + elif len(p) == 3: + p[0] = p[1] + if not p[0]: + p[0] = {} + if p[2]: + line, stat = p[2] + p[0][line] = stat + +# This catch-all rule is used for any catastrophic errors. In this case, +# we simply return nothing + + +def p_program_error(p): + '''program : error''' + p[0] = None + p.parser.error = 1 + +# Format of all BASIC statements. + + +def p_statement(p): + '''statement : INTEGER command NEWLINE''' + if isinstance(p[2], str): + print("%s %s %s" % (p[2], "AT LINE", p[1])) + p[0] = None + p.parser.error = 1 + else: + lineno = int(p[1]) + p[0] = (lineno, p[2]) + +# Interactive statements. 
+ + +def p_statement_interactive(p): + '''statement : RUN NEWLINE + | LIST NEWLINE + | NEW NEWLINE''' + p[0] = (0, (p[1], 0)) + +# Blank line number + + +def p_statement_blank(p): + '''statement : INTEGER NEWLINE''' + p[0] = (0, ('BLANK', int(p[1]))) + +# Error handling for malformed statements + + +def p_statement_bad(p): + '''statement : INTEGER error NEWLINE''' + print("MALFORMED STATEMENT AT LINE %s" % p[1]) + p[0] = None + p.parser.error = 1 + +# Blank line + + +def p_statement_newline(p): + '''statement : NEWLINE''' + p[0] = None + +# LET statement + + +def p_command_let(p): + '''command : LET variable EQUALS expr''' + p[0] = ('LET', p[2], p[4]) + + +def p_command_let_bad(p): + '''command : LET variable EQUALS error''' + p[0] = "BAD EXPRESSION IN LET" + +# READ statement + + +def p_command_read(p): + '''command : READ varlist''' + p[0] = ('READ', p[2]) + + +def p_command_read_bad(p): + '''command : READ error''' + p[0] = "MALFORMED VARIABLE LIST IN READ" + +# DATA statement + + +def p_command_data(p): + '''command : DATA numlist''' + p[0] = ('DATA', p[2]) + + +def p_command_data_bad(p): + '''command : DATA error''' + p[0] = "MALFORMED NUMBER LIST IN DATA" + +# PRINT statement + + +def p_command_print(p): + '''command : PRINT plist optend''' + p[0] = ('PRINT', p[2], p[3]) + + +def p_command_print_bad(p): + '''command : PRINT error''' + p[0] = "MALFORMED PRINT STATEMENT" + +# Optional ending on PRINT. 
Either a comma (,) or semicolon (;) + + +def p_optend(p): + '''optend : COMMA + | SEMI + |''' + if len(p) == 2: + p[0] = p[1] + else: + p[0] = None + +# PRINT statement with no arguments + + +def p_command_print_empty(p): + '''command : PRINT''' + p[0] = ('PRINT', [], None) + +# GOTO statement + + +def p_command_goto(p): + '''command : GOTO INTEGER''' + p[0] = ('GOTO', int(p[2])) + + +def p_command_goto_bad(p): + '''command : GOTO error''' + p[0] = "INVALID LINE NUMBER IN GOTO" + +# IF-THEN statement + + +def p_command_if(p): + '''command : IF relexpr THEN INTEGER''' + p[0] = ('IF', p[2], int(p[4])) + + +def p_command_if_bad(p): + '''command : IF error THEN INTEGER''' + p[0] = "BAD RELATIONAL EXPRESSION" + + +def p_command_if_bad2(p): + '''command : IF relexpr THEN error''' + p[0] = "INVALID LINE NUMBER IN THEN" + +# FOR statement + + +def p_command_for(p): + '''command : FOR ID EQUALS expr TO expr optstep''' + p[0] = ('FOR', p[2], p[4], p[6], p[7]) + + +def p_command_for_bad_initial(p): + '''command : FOR ID EQUALS error TO expr optstep''' + p[0] = "BAD INITIAL VALUE IN FOR STATEMENT" + + +def p_command_for_bad_final(p): + '''command : FOR ID EQUALS expr TO error optstep''' + p[0] = "BAD FINAL VALUE IN FOR STATEMENT" + + +def p_command_for_bad_step(p): + '''command : FOR ID EQUALS expr TO expr STEP error''' + p[0] = "MALFORMED STEP IN FOR STATEMENT" + +# Optional STEP qualifier on FOR statement + + +def p_optstep(p): + '''optstep : STEP expr + | empty''' + if len(p) == 3: + p[0] = p[2] + else: + p[0] = None + +# NEXT statement + + +def p_command_next(p): + '''command : NEXT ID''' + + p[0] = ('NEXT', p[2]) + + +def p_command_next_bad(p): + '''command : NEXT error''' + p[0] = "MALFORMED NEXT" + +# END statement + + +def p_command_end(p): + '''command : END''' + p[0] = ('END',) + +# REM statement + + +def p_command_rem(p): + '''command : REM''' + p[0] = ('REM', p[1]) + +# STOP statement + + +def p_command_stop(p): + '''command : STOP''' + p[0] = ('STOP',) + +# DEF 
statement + + +def p_command_def(p): + '''command : DEF ID LPAREN ID RPAREN EQUALS expr''' + p[0] = ('FUNC', p[2], p[4], p[7]) + + +def p_command_def_bad_rhs(p): + '''command : DEF ID LPAREN ID RPAREN EQUALS error''' + p[0] = "BAD EXPRESSION IN DEF STATEMENT" + + +def p_command_def_bad_arg(p): + '''command : DEF ID LPAREN error RPAREN EQUALS expr''' + p[0] = "BAD ARGUMENT IN DEF STATEMENT" + +# GOSUB statement + + +def p_command_gosub(p): + '''command : GOSUB INTEGER''' + p[0] = ('GOSUB', int(p[2])) + + +def p_command_gosub_bad(p): + '''command : GOSUB error''' + p[0] = "INVALID LINE NUMBER IN GOSUB" + +# RETURN statement + + +def p_command_return(p): + '''command : RETURN''' + p[0] = ('RETURN',) + +# DIM statement + + +def p_command_dim(p): + '''command : DIM dimlist''' + p[0] = ('DIM', p[2]) + + +def p_command_dim_bad(p): + '''command : DIM error''' + p[0] = "MALFORMED VARIABLE LIST IN DIM" + +# List of variables supplied to DIM statement + + +def p_dimlist(p): + '''dimlist : dimlist COMMA dimitem + | dimitem''' + if len(p) == 4: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + +# DIM items + + +def p_dimitem_single(p): + '''dimitem : ID LPAREN INTEGER RPAREN''' + p[0] = (p[1], eval(p[3]), 0) + + +def p_dimitem_double(p): + '''dimitem : ID LPAREN INTEGER COMMA INTEGER RPAREN''' + p[0] = (p[1], eval(p[3]), eval(p[5])) + +# Arithmetic expressions + + +def p_expr_binary(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr + | expr POWER expr''' + + p[0] = ('BINOP', p[2], p[1], p[3]) + + +def p_expr_number(p): + '''expr : INTEGER + | FLOAT''' + p[0] = ('NUM', eval(p[1])) + + +def p_expr_variable(p): + '''expr : variable''' + p[0] = ('VAR', p[1]) + + +def p_expr_group(p): + '''expr : LPAREN expr RPAREN''' + p[0] = ('GROUP', p[2]) + + +def p_expr_unary(p): + '''expr : MINUS expr %prec UMINUS''' + p[0] = ('UNARY', '-', p[2]) + +# Relational expressions + + +def p_relexpr(p): + '''relexpr : expr LT expr + | expr LE expr + 
| expr GT expr + | expr GE expr + | expr EQUALS expr + | expr NE expr''' + p[0] = ('RELOP', p[2], p[1], p[3]) + +# Variables + + +def p_variable(p): + '''variable : ID + | ID LPAREN expr RPAREN + | ID LPAREN expr COMMA expr RPAREN''' + if len(p) == 2: + p[0] = (p[1], None, None) + elif len(p) == 5: + p[0] = (p[1], p[3], None) + else: + p[0] = (p[1], p[3], p[5]) + +# Builds a list of variable targets as a Python list + + +def p_varlist(p): + '''varlist : varlist COMMA variable + | variable''' + if len(p) > 2: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + + +# Builds a list of numbers as a Python list + +def p_numlist(p): + '''numlist : numlist COMMA number + | number''' + + if len(p) > 2: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + +# A number. May be an integer or a float + + +def p_number(p): + '''number : INTEGER + | FLOAT''' + p[0] = eval(p[1]) + +# A signed number. + + +def p_number_signed(p): + '''number : MINUS INTEGER + | MINUS FLOAT''' + p[0] = eval("-" + p[2]) + +# List of targets for a print statement +# Returns a list of tuples (label,expr) + + +def p_plist(p): + '''plist : plist COMMA pitem + | pitem''' + if len(p) > 3: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + + +def p_item_string(p): + '''pitem : STRING''' + p[0] = (p[1][1:-1], None) + + +def p_item_string_expr(p): + '''pitem : STRING expr''' + p[0] = (p[1][1:-1], p[2]) + + +def p_item_expr(p): + '''pitem : expr''' + p[0] = ("", p[1]) + +# Empty + + +def p_empty(p): + '''empty : ''' + +# Catastrophic error handler + + +def p_error(p): + if not p: + print("SYNTAX ERROR AT EOF") + +bparser = yacc.yacc() + + +def parse(data, debug=0): + bparser.error = 0 + p = bparser.parse(data, debug=debug) + if bparser.error: + return None + return p diff --git a/example/BASIC/dim.bas b/example/BASIC/dim.bas new file mode 100644 index 000000000..87bd95b32 --- /dev/null +++ b/example/BASIC/dim.bas @@ -0,0 +1,14 @@ +5 DIM A(50,15) +10 FOR I = 1 TO 50 +20 FOR J = 1 TO 15 
+30 LET A(I,J) = I + J +35 REM PRINT I,J, A(I,J) +40 NEXT J +50 NEXT I +100 FOR I = 1 TO 50 +110 FOR J = 1 TO 15 +120 PRINT A(I,J), +130 NEXT J +140 PRINT +150 NEXT I +999 END diff --git a/example/BASIC/func.bas b/example/BASIC/func.bas new file mode 100644 index 000000000..447ee16a9 --- /dev/null +++ b/example/BASIC/func.bas @@ -0,0 +1,5 @@ +10 DEF FDX(X) = 2*X +20 FOR I = 0 TO 100 +30 PRINT FDX(I) +40 NEXT I +50 END diff --git a/example/BASIC/gcd.bas b/example/BASIC/gcd.bas new file mode 100644 index 000000000..d0b774608 --- /dev/null +++ b/example/BASIC/gcd.bas @@ -0,0 +1,22 @@ +10 PRINT "A","B","C","GCD" +20 READ A,B,C +30 LET X = A +40 LET Y = B +50 GOSUB 200 +60 LET X = G +70 LET Y = C +80 GOSUB 200 +90 PRINT A, B, C, G +100 GOTO 20 +110 DATA 60, 90, 120 +120 DATA 38456, 64872, 98765 +130 DATA 32, 384, 72 +200 LET Q = INT(X/Y) +210 LET R = X - Q*Y +220 IF R = 0 THEN 300 +230 LET X = Y +240 LET Y = R +250 GOTO 200 +300 LET G = Y +310 RETURN +999 END diff --git a/example/BASIC/gosub.bas b/example/BASIC/gosub.bas new file mode 100644 index 000000000..99737b16f --- /dev/null +++ b/example/BASIC/gosub.bas @@ -0,0 +1,13 @@ +100 LET X = 3 +110 GOSUB 400 +120 PRINT U, V, W +200 LET X = 5 +210 GOSUB 400 +220 LET Z = U + 2*V + 3*W +230 PRINT Z +240 GOTO 999 +400 LET U = X*X +410 LET V = X*X*X +420 LET W = X*X*X*X + X*X*X + X*X + X +430 RETURN +999 END diff --git a/example/BASIC/hello.bas b/example/BASIC/hello.bas new file mode 100644 index 000000000..cc6f0b0b5 --- /dev/null +++ b/example/BASIC/hello.bas @@ -0,0 +1,4 @@ +5 REM HELLO WORLD PROGAM +10 PRINT "HELLO WORLD" +99 END + diff --git a/example/BASIC/linear.bas b/example/BASIC/linear.bas new file mode 100644 index 000000000..56c08220b --- /dev/null +++ b/example/BASIC/linear.bas @@ -0,0 +1,17 @@ +1 REM ::: SOLVE A SYSTEM OF LINEAR EQUATIONS +2 REM ::: A1*X1 + A2*X2 = B1 +3 REM ::: A3*X1 + A4*X2 = B2 +4 REM -------------------------------------- +10 READ A1, A2, A3, A4 +15 LET D = A1 * A4 - A3 * A2 +20 IF D = 0 THEN 
65 +30 READ B1, B2 +37 LET X1 = (B1*A4 - B2*A2) / D +42 LET X2 = (A1*B2 - A3*B1) / D +55 PRINT X1, X2 +60 GOTO 30 +65 PRINT "NO UNIQUE SOLUTION" +70 DATA 1, 2, 4 +80 DATA 2, -7, 5 +85 DATA 1, 3, 4, -7 +90 END diff --git a/example/BASIC/maxsin.bas b/example/BASIC/maxsin.bas new file mode 100644 index 000000000..b96901530 --- /dev/null +++ b/example/BASIC/maxsin.bas @@ -0,0 +1,12 @@ +5 PRINT "X VALUE", "SINE", "RESOLUTION" +10 READ D +20 LET M = -1 +30 FOR X = 0 TO 3 STEP D +40 IF SIN(X) <= M THEN 80 +50 LET X0 = X +60 LET M = SIN(X) +80 NEXT X +85 PRINT X0, M, D +90 GOTO 10 +100 DATA .1, .01, .001 +110 END diff --git a/example/BASIC/powers.bas b/example/BASIC/powers.bas new file mode 100644 index 000000000..a454dc3e2 --- /dev/null +++ b/example/BASIC/powers.bas @@ -0,0 +1,13 @@ +5 PRINT "THIS PROGRAM COMPUTES AND PRINTS THE NTH POWERS" +6 PRINT "OF THE NUMBERS LESS THAN OR EQUAL TO N FOR VARIOUS" +7 PRINT "N FROM 1 THROUGH 7" +8 PRINT +10 FOR N = 1 TO 7 +15 PRINT "N = "N +20 FOR I = 1 TO N +30 PRINT I^N, +40 NEXT I +50 PRINT +60 PRINT +70 NEXT N +80 END diff --git a/example/BASIC/rand.bas b/example/BASIC/rand.bas new file mode 100644 index 000000000..4ff7a1467 --- /dev/null +++ b/example/BASIC/rand.bas @@ -0,0 +1,4 @@ +10 FOR I = 1 TO 20 +20 PRINT INT(10*RND(0)) +30 NEXT I +40 END diff --git a/example/BASIC/sales.bas b/example/BASIC/sales.bas new file mode 100644 index 000000000..a39aefb76 --- /dev/null +++ b/example/BASIC/sales.bas @@ -0,0 +1,20 @@ +10 FOR I = 1 TO 3 +20 READ P(I) +30 NEXT I +40 FOR I = 1 TO 3 +50 FOR J = 1 TO 5 +60 READ S(I,J) +70 NEXT J +80 NEXT I +90 FOR J = 1 TO 5 +100 LET S = 0 +110 FOR I = 1 TO 3 +120 LET S = S + P(I) * S(I,J) +130 NEXT I +140 PRINT "TOTAL SALES FOR SALESMAN"J, "$"S +150 NEXT J +200 DATA 1.25, 4.30, 2.50 +210 DATA 40, 20, 37, 29, 42 +220 DATA 10, 16, 3, 21, 8 +230 DATA 35, 47, 29, 16, 33 +300 END diff --git a/example/BASIC/sears.bas b/example/BASIC/sears.bas new file mode 100644 index 000000000..5ced3974e --- /dev/null +++ 
b/example/BASIC/sears.bas @@ -0,0 +1,18 @@ +1 REM :: THIS PROGRAM COMPUTES HOW MANY TIMES YOU HAVE TO FOLD +2 REM :: A PIECE OF PAPER SO THAT IT IS TALLER THAN THE +3 REM :: SEARS TOWER. +4 REM :: S = HEIGHT OF TOWER (METERS) +5 REM :: T = THICKNESS OF PAPER (MILLIMETERS) +10 LET S = 442 +20 LET T = 0.1 +30 REM CONVERT T TO METERS +40 LET T = T * .001 +50 LET F = 1 +60 LET H = T +100 IF H > S THEN 200 +120 LET H = 2 * H +125 LET F = F + 1 +130 GOTO 100 +200 PRINT "NUMBER OF FOLDS ="F +220 PRINT "FINAL HEIGHT ="H +999 END diff --git a/example/BASIC/sqrt1.bas b/example/BASIC/sqrt1.bas new file mode 100644 index 000000000..6673a9152 --- /dev/null +++ b/example/BASIC/sqrt1.bas @@ -0,0 +1,5 @@ +10 LET X = 0 +20 LET X = X + 1 +30 PRINT X, SQR(X) +40 IF X < 100 THEN 20 +50 END diff --git a/example/BASIC/sqrt2.bas b/example/BASIC/sqrt2.bas new file mode 100644 index 000000000..862d85ef2 --- /dev/null +++ b/example/BASIC/sqrt2.bas @@ -0,0 +1,4 @@ +10 FOR X = 1 TO 100 +20 PRINT X, SQR(X) +30 NEXT X +40 END diff --git a/example/GardenSnake/GardenSnake.py b/example/GardenSnake/GardenSnake.py new file mode 100644 index 000000000..8b493b40d --- /dev/null +++ b/example/GardenSnake/GardenSnake.py @@ -0,0 +1,777 @@ +# GardenSnake - a parser generator demonstration program +# +# This implements a modified version of a subset of Python: +# - only 'def', 'return' and 'if' statements +# - 'if' only has 'then' clause (no elif nor else) +# - single-quoted strings only, content in raw format +# - numbers are decimal.Decimal instances (not integers or floats) +# - no print statment; use the built-in 'print' function +# - only < > == + - / * implemented (and unary + -) +# - assignment and tuple assignment work +# - no generators of any sort +# - no ... well, no quite a lot + +# Why? I'm thinking about a new indentation-based configuration +# language for a project and wanted to figure out how to do it. Once +# I got that working I needed a way to test it out. 
My original AST +# was dumb so I decided to target Python's AST and compile it into +# Python code. Plus, it's pretty cool that it only took a day or so +# from sitting down with Ply to having working code. + +# This uses David Beazley's Ply from http://www.dabeaz.com/ply/ + +# This work is hereby released into the Public Domain. To view a copy of +# the public domain dedication, visit +# http://creativecommons.org/licenses/publicdomain/ or send a letter to +# Creative Commons, 543 Howard Street, 5th Floor, San Francisco, +# California, 94105, USA. +# +# Portions of this work are derived from Python's Grammar definition +# and may be covered under the Python copyright and license +# +# Andrew Dalke / Dalke Scientific Software, LLC +# 30 August 2006 / Cape Town, South Africa + +# Changelog: +# 30 August - added link to CC license; removed the "swapcase" encoding + +# Modifications for inclusion in PLY distribution +import sys +sys.path.insert(0, "../..") +from ply import * + +##### Lexer ###### +#import lex +import decimal + +tokens = ( + 'DEF', + 'IF', + 'NAME', + 'NUMBER', # Python decimals + 'STRING', # single quoted strings only; syntax of raw strings + 'LPAR', + 'RPAR', + 'COLON', + 'EQ', + 'ASSIGN', + 'LT', + 'GT', + 'PLUS', + 'MINUS', + 'MULT', + 'DIV', + 'RETURN', + 'WS', + 'NEWLINE', + 'COMMA', + 'SEMICOLON', + 'INDENT', + 'DEDENT', + 'ENDMARKER', +) + +#t_NUMBER = r'\d+' +# taken from decmial.py but without the leading sign + + +def t_NUMBER(t): + r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?""" + t.value = decimal.Decimal(t.value) + return t + + +def t_STRING(t): + r"'([^\\']+|\\'|\\\\)*'" # I think this is right ... + t.value = t.value[1:-1].decode("string-escape") # .swapcase() # for fun + return t + +t_COLON = r':' +t_EQ = r'==' +t_ASSIGN = r'=' +t_LT = r'<' +t_GT = r'>' +t_PLUS = r'\+' +t_MINUS = r'-' +t_MULT = r'\*' +t_DIV = r'/' +t_COMMA = r',' +t_SEMICOLON = r';' + +# Ply nicely documented how to do this. 
+ +RESERVED = { + "def": "DEF", + "if": "IF", + "return": "RETURN", +} + + +def t_NAME(t): + r'[a-zA-Z_][a-zA-Z0-9_]*' + t.type = RESERVED.get(t.value, "NAME") + return t + +# Putting this before t_WS let it consume lines with only comments in +# them so the latter code never sees the WS part. Not consuming the +# newline. Needed for "if 1: #comment" + + +def t_comment(t): + r"[ ]*\043[^\n]*" # \043 is '#' + pass + + +# Whitespace +def t_WS(t): + r' [ ]+ ' + if t.lexer.at_line_start and t.lexer.paren_count == 0: + return t + +# Don't generate newline tokens when inside of parenthesis, eg +# a = (1, +# 2, 3) + + +def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + t.type = "NEWLINE" + if t.lexer.paren_count == 0: + return t + + +def t_LPAR(t): + r'\(' + t.lexer.paren_count += 1 + return t + + +def t_RPAR(t): + r'\)' + # check for underflow? should be the job of the parser + t.lexer.paren_count -= 1 + return t + + +def t_error(t): + raise SyntaxError("Unknown symbol %r" % (t.value[0],)) + print "Skipping", repr(t.value[0]) + t.lexer.skip(1) + +# I implemented INDENT / DEDENT generation as a post-processing filter + +# The original lex token stream contains WS and NEWLINE characters. +# WS will only occur before any other tokens on a line. + +# I have three filters. One tags tokens by adding two attributes. +# "must_indent" is True if the token must be indented from the +# previous code. The other is "at_line_start" which is True for WS +# and the first non-WS/non-NEWLINE on a line. It flags the check so +# see if the new line has changed indication level. 
+ +# Python's syntax has three INDENT states +# 0) no colon hence no need to indent +# 1) "if 1: go()" - simple statements have a COLON but no need for an indent +# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent +NO_INDENT = 0 +MAY_INDENT = 1 +MUST_INDENT = 2 + +# only care about whitespace at the start of a line + + +def track_tokens_filter(lexer, tokens): + lexer.at_line_start = at_line_start = True + indent = NO_INDENT + saw_colon = False + for token in tokens: + token.at_line_start = at_line_start + + if token.type == "COLON": + at_line_start = False + indent = MAY_INDENT + token.must_indent = False + + elif token.type == "NEWLINE": + at_line_start = True + if indent == MAY_INDENT: + indent = MUST_INDENT + token.must_indent = False + + elif token.type == "WS": + assert token.at_line_start == True + at_line_start = True + token.must_indent = False + + else: + # A real token; only indent after COLON NEWLINE + if indent == MUST_INDENT: + token.must_indent = True + else: + token.must_indent = False + at_line_start = False + indent = NO_INDENT + + yield token + lexer.at_line_start = at_line_start + + +def _new_token(type, lineno): + tok = lex.LexToken() + tok.type = type + tok.value = None + tok.lineno = lineno + return tok + +# Synthesize a DEDENT tag + + +def DEDENT(lineno): + return _new_token("DEDENT", lineno) + +# Synthesize an INDENT tag + + +def INDENT(lineno): + return _new_token("INDENT", lineno) + + +# Track the indentation level and emit the right INDENT / DEDENT events. +def indentation_filter(tokens): + # A stack of indentation levels; will never pop item 0 + levels = [0] + token = None + depth = 0 + prev_was_ws = False + for token in tokens: + # if 1: + # print "Process", token, + # if token.at_line_start: + # print "at_line_start", + # if token.must_indent: + # print "must_indent", + # print + + # WS only occurs at the start of the line + # There may be WS followed by NEWLINE so + # only track the depth here. 
Don't indent/dedent + # until there's something real. + if token.type == "WS": + assert depth == 0 + depth = len(token.value) + prev_was_ws = True + # WS tokens are never passed to the parser + continue + + if token.type == "NEWLINE": + depth = 0 + if prev_was_ws or token.at_line_start: + # ignore blank lines + continue + # pass the other cases on through + yield token + continue + + # then it must be a real token (not WS, not NEWLINE) + # which can affect the indentation level + + prev_was_ws = False + if token.must_indent: + # The current depth must be larger than the previous level + if not (depth > levels[-1]): + raise IndentationError("expected an indented block") + + levels.append(depth) + yield INDENT(token.lineno) + + elif token.at_line_start: + # Must be on the same level or one of the previous levels + if depth == levels[-1]: + # At the same level + pass + elif depth > levels[-1]: + raise IndentationError( + "indentation increase but not in new block") + else: + # Back up; but only if it matches a previous level + try: + i = levels.index(depth) + except ValueError: + raise IndentationError("inconsistent indentation") + for _ in range(i + 1, len(levels)): + yield DEDENT(token.lineno) + levels.pop() + + yield token + + ### Finished processing ### + + # Must dedent any remaining levels + if len(levels) > 1: + assert token is not None + for _ in range(1, len(levels)): + yield DEDENT(token.lineno) + + +# The top-level filter adds an ENDMARKER, if requested. +# Python's grammar uses it. 
+def filter(lexer, add_endmarker=True): + token = None + tokens = iter(lexer.token, None) + tokens = track_tokens_filter(lexer, tokens) + for token in indentation_filter(tokens): + yield token + + if add_endmarker: + lineno = 1 + if token is not None: + lineno = token.lineno + yield _new_token("ENDMARKER", lineno) + +# Combine Ply and my filters into a new lexer + + +class IndentLexer(object): + + def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0): + self.lexer = lex.lex(debug=debug, optimize=optimize, + lextab=lextab, reflags=reflags) + self.token_stream = None + + def input(self, s, add_endmarker=True): + self.lexer.paren_count = 0 + self.lexer.input(s) + self.token_stream = filter(self.lexer, add_endmarker) + + def token(self): + try: + return self.token_stream.next() + except StopIteration: + return None + +########## Parser (tokens -> AST) ###### + +# also part of Ply +#import yacc + +# I use the Python AST +from compiler import ast + +# Helper function + + +def Assign(left, right): + names = [] + if isinstance(left, ast.Name): + # Single assignment on left + return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right) + elif isinstance(left, ast.Tuple): + # List of things - make sure they are Name nodes + names = [] + for child in left.getChildren(): + if not isinstance(child, ast.Name): + raise SyntaxError("that assignment not supported") + names.append(child.name) + ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names] + return ast.Assign([ast.AssTuple(ass_list)], right) + else: + raise SyntaxError("Can't do that yet") + + +# The grammar comments come from Python's Grammar/Grammar file + +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
+# file_input: (NEWLINE | stmt)* ENDMARKER +def p_file_input_end(p): + """file_input_end : file_input ENDMARKER""" + p[0] = ast.Stmt(p[1]) + + +def p_file_input(p): + """file_input : file_input NEWLINE + | file_input stmt + | NEWLINE + | stmt""" + if isinstance(p[len(p) - 1], basestring): + if len(p) == 3: + p[0] = p[1] + else: + p[0] = [] # p == 2 --> only a blank line + else: + if len(p) == 3: + p[0] = p[1] + p[2] + else: + p[0] = p[1] + + +# funcdef: [decorators] 'def' NAME parameters ':' suite +# ignoring decorators +def p_funcdef(p): + "funcdef : DEF NAME parameters COLON suite" + p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) + +# parameters: '(' [varargslist] ')' + + +def p_parameters(p): + """parameters : LPAR RPAR + | LPAR varargslist RPAR""" + if len(p) == 3: + p[0] = [] + else: + p[0] = p[2] + + +# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | +# highly simplified +def p_varargslist(p): + """varargslist : varargslist COMMA NAME + | NAME""" + if len(p) == 4: + p[0] = p[1] + p[3] + else: + p[0] = [p[1]] + +# stmt: simple_stmt | compound_stmt + + +def p_stmt_simple(p): + """stmt : simple_stmt""" + # simple_stmt is a list + p[0] = p[1] + + +def p_stmt_compound(p): + """stmt : compound_stmt""" + p[0] = [p[1]] + +# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE + + +def p_simple_stmt(p): + """simple_stmt : small_stmts NEWLINE + | small_stmts SEMICOLON NEWLINE""" + p[0] = p[1] + + +def p_small_stmts(p): + """small_stmts : small_stmts SEMICOLON small_stmt + | small_stmt""" + if len(p) == 4: + p[0] = p[1] + [p[3]] + else: + p[0] = [p[1]] + +# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | +# import_stmt | global_stmt | exec_stmt | assert_stmt + + +def p_small_stmt(p): + """small_stmt : flow_stmt + | expr_stmt""" + p[0] = p[1] + +# expr_stmt: testlist (augassign (yield_expr|testlist) | +# ('=' (yield_expr|testlist))*) +# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | 
'^=' | +# '<<=' | '>>=' | '**=' | '//=') + + +def p_expr_stmt(p): + """expr_stmt : testlist ASSIGN testlist + | testlist """ + if len(p) == 2: + # a list of expressions + p[0] = ast.Discard(p[1]) + else: + p[0] = Assign(p[1], p[3]) + + +def p_flow_stmt(p): + "flow_stmt : return_stmt" + p[0] = p[1] + +# return_stmt: 'return' [testlist] + + +def p_return_stmt(p): + "return_stmt : RETURN testlist" + p[0] = ast.Return(p[2]) + + +def p_compound_stmt(p): + """compound_stmt : if_stmt + | funcdef""" + p[0] = p[1] + + +def p_if_stmt(p): + 'if_stmt : IF test COLON suite' + p[0] = ast.If([(p[2], p[4])], None) + + +def p_suite(p): + """suite : simple_stmt + | NEWLINE INDENT stmts DEDENT""" + if len(p) == 2: + p[0] = ast.Stmt(p[1]) + else: + p[0] = ast.Stmt(p[3]) + + +def p_stmts(p): + """stmts : stmts stmt + | stmt""" + if len(p) == 3: + p[0] = p[1] + p[2] + else: + p[0] = p[1] + +# No using Python's approach because Ply supports precedence + +# comparison: expr (comp_op expr)* +# arith_expr: term (('+'|'-') term)* +# term: factor (('*'|'/'|'%'|'//') factor)* +# factor: ('+'|'-'|'~') factor | power +# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' + + +def make_lt_compare((left, right)): + return ast.Compare(left, [('<', right), ]) + + +def make_gt_compare((left, right)): + return ast.Compare(left, [('>', right), ]) + + +def make_eq_compare((left, right)): + return ast.Compare(left, [('==', right), ]) + + +binary_ops = { + "+": ast.Add, + "-": ast.Sub, + "*": ast.Mul, + "/": ast.Div, + "<": make_lt_compare, + ">": make_gt_compare, + "==": make_eq_compare, +} +unary_ops = { + "+": ast.UnaryAdd, + "-": ast.UnarySub, +} +precedence = ( + ("left", "EQ", "GT", "LT"), + ("left", "PLUS", "MINUS"), + ("left", "MULT", "DIV"), +) + + +def p_comparison(p): + """comparison : comparison PLUS comparison + | comparison MINUS comparison + | comparison MULT comparison + | comparison DIV comparison + | comparison LT comparison + | comparison EQ comparison + | 
comparison GT comparison + | PLUS comparison + | MINUS comparison + | power""" + if len(p) == 4: + p[0] = binary_ops[p[2]]((p[1], p[3])) + elif len(p) == 3: + p[0] = unary_ops[p[1]](p[2]) + else: + p[0] = p[1] + +# power: atom trailer* ['**' factor] +# trailers enables function calls. I only allow one level of calls +# so this is 'trailer' + + +def p_power(p): + """power : atom + | atom trailer""" + if len(p) == 2: + p[0] = p[1] + else: + if p[2][0] == "CALL": + p[0] = ast.CallFunc(p[1], p[2][1], None, None) + else: + raise AssertionError("not implemented") + + +def p_atom_name(p): + """atom : NAME""" + p[0] = ast.Name(p[1]) + + +def p_atom_number(p): + """atom : NUMBER + | STRING""" + p[0] = ast.Const(p[1]) + + +def p_atom_tuple(p): + """atom : LPAR testlist RPAR""" + p[0] = p[2] + +# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME + + +def p_trailer(p): + "trailer : LPAR arglist RPAR" + p[0] = ("CALL", p[2]) + +# testlist: test (',' test)* [','] +# Contains shift/reduce error + + +def p_testlist(p): + """testlist : testlist_multi COMMA + | testlist_multi """ + if len(p) == 2: + p[0] = p[1] + else: + # May need to promote singleton to tuple + if isinstance(p[1], list): + p[0] = p[1] + else: + p[0] = [p[1]] + # Convert into a tuple? 
+ if isinstance(p[0], list): + p[0] = ast.Tuple(p[0]) + + +def p_testlist_multi(p): + """testlist_multi : testlist_multi COMMA test + | test""" + if len(p) == 2: + # singleton + p[0] = p[1] + else: + if isinstance(p[1], list): + p[0] = p[1] + [p[3]] + else: + # singleton -> tuple + p[0] = [p[1], p[3]] + + +# test: or_test ['if' or_test 'else' test] | lambdef +# as I don't support 'and', 'or', and 'not' this works down to 'comparison' +def p_test(p): + "test : comparison" + p[0] = p[1] + + +# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) +# XXX INCOMPLETE: this doesn't allow the trailing comma +def p_arglist(p): + """arglist : arglist COMMA argument + | argument""" + if len(p) == 4: + p[0] = p[1] + [p[3]] + else: + p[0] = [p[1]] + +# argument: test [gen_for] | test '=' test # Really [keyword '='] test + + +def p_argument(p): + "argument : test" + p[0] = p[1] + + +def p_error(p): + # print "Error!", repr(p) + raise SyntaxError(p) + + +class GardenSnakeParser(object): + + def __init__(self, lexer=None): + if lexer is None: + lexer = IndentLexer() + self.lexer = lexer + self.parser = yacc.yacc(start="file_input_end") + + def parse(self, code): + self.lexer.input(code) + result = self.parser.parse(lexer=self.lexer) + return ast.Module(None, result) + + +###### Code generation ###### + +from compiler import misc, syntax, pycodegen + + +class GardenSnakeCompiler(object): + + def __init__(self): + self.parser = GardenSnakeParser() + + def compile(self, code, filename=""): + tree = self.parser.parse(code) + # print tree + misc.set_filename(filename, tree) + syntax.check(tree) + gen = pycodegen.ModuleCodeGenerator(tree) + code = gen.getCode() + return code + +####### Test code ####### + +compile = GardenSnakeCompiler().compile + +code = r""" + +print('LET\'S TRY THIS \\OUT') + +#Comment here +def x(a): + print('called with',a) + if a == 1: + return 2 + if a*2 > 10: return 999 / 4 + # Another comment here + + return a+2*3 + +ints = (1, 2, + 
3, 4, +5) +print('mutiline-expression', ints) + +t = 4+1/3*2+6*(9-5+1) +print('predence test; should be 34+2/3:', t, t==(34+2/3)) + +print('numbers', 1,2,3,4,5) +if 1: + 8 + a=9 + print(x(a)) + +print(x(1)) +print(x(2)) +print(x(8),'3') +print('this is decimal', 1/5) +print('BIG DECIMAL', 1.234567891234567e12345) + +""" + +# Set up the GardenSnake run-time environment + + +def print_(*args): + print "-->", " ".join(map(str, args)) + +globals()["print"] = print_ + +compiled_code = compile(code) + +exec compiled_code in globals() +print "Done" diff --git a/example/GardenSnake/README b/example/GardenSnake/README new file mode 100644 index 000000000..4d8be2db0 --- /dev/null +++ b/example/GardenSnake/README @@ -0,0 +1,5 @@ +This example is Andrew Dalke's GardenSnake language. It shows how to process an +indentation-like language like Python. Further details can be found here: + +http://dalkescientific.com/writings/diary/archive/2006/08/30/gardensnake_language.html + diff --git a/example/README b/example/README new file mode 100644 index 000000000..63519b557 --- /dev/null +++ b/example/README @@ -0,0 +1,10 @@ +Simple examples: + calc - Simple calculator + classcalc - Simple calculate defined as a class + +Complex examples + ansic - ANSI C grammar from K&R + BASIC - A small BASIC interpreter + GardenSnake - A simple python-like language + yply - Converts Unix yacc files to PLY programs. + diff --git a/example/ansic/README b/example/ansic/README new file mode 100644 index 000000000..e049d3b4e --- /dev/null +++ b/example/ansic/README @@ -0,0 +1,2 @@ +This example is incomplete. Was going to specify an ANSI C parser. +This is part of it. diff --git a/example/ansic/clex.py b/example/ansic/clex.py new file mode 100644 index 000000000..4bde1d730 --- /dev/null +++ b/example/ansic/clex.py @@ -0,0 +1,168 @@ +# ---------------------------------------------------------------------- +# clex.py +# +# A lexer for ANSI C. 
+# ---------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +import ply.lex as lex + +# Reserved words +reserved = ( + 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', + 'ELSE', 'ENUM', 'EXTERN', 'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER', + 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT', 'SWITCH', 'TYPEDEF', + 'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE', +) + +tokens = reserved + ( + # Literals (identifier, integer constant, float constant, string constant, + # char const) + 'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST', + + # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=) + 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD', + 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT', + 'LOR', 'LAND', 'LNOT', + 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE', + + # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=) + 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL', + 'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', + + # Increment/decrement (++,--) + 'PLUSPLUS', 'MINUSMINUS', + + # Structure dereference (->) + 'ARROW', + + # Conditional operator (?) + 'CONDOP', + + # Delimeters ( ) [ ] { } , . ; : + 'LPAREN', 'RPAREN', + 'LBRACKET', 'RBRACKET', + 'LBRACE', 'RBRACE', + 'COMMA', 'PERIOD', 'SEMI', 'COLON', + + # Ellipsis (...) + 'ELLIPSIS', +) + +# Completely ignored characters +t_ignore = ' \t\x0c' + +# Newlines + + +def t_NEWLINE(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +# Operators +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_MOD = r'%' +t_OR = r'\|' +t_AND = r'&' +t_NOT = r'~' +t_XOR = r'\^' +t_LSHIFT = r'<<' +t_RSHIFT = r'>>' +t_LOR = r'\|\|' +t_LAND = r'&&' +t_LNOT = r'!' 
+t_LT = r'<' +t_GT = r'>' +t_LE = r'<=' +t_GE = r'>=' +t_EQ = r'==' +t_NE = r'!=' + +# Assignment operators + +t_EQUALS = r'=' +t_TIMESEQUAL = r'\*=' +t_DIVEQUAL = r'/=' +t_MODEQUAL = r'%=' +t_PLUSEQUAL = r'\+=' +t_MINUSEQUAL = r'-=' +t_LSHIFTEQUAL = r'<<=' +t_RSHIFTEQUAL = r'>>=' +t_ANDEQUAL = r'&=' +t_OREQUAL = r'\|=' +t_XOREQUAL = r'\^=' + +# Increment/decrement +t_PLUSPLUS = r'\+\+' +t_MINUSMINUS = r'--' + +# -> +t_ARROW = r'->' + +# ? +t_CONDOP = r'\?' + +# Delimeters +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LBRACKET = r'\[' +t_RBRACKET = r'\]' +t_LBRACE = r'\{' +t_RBRACE = r'\}' +t_COMMA = r',' +t_PERIOD = r'\.' +t_SEMI = r';' +t_COLON = r':' +t_ELLIPSIS = r'\.\.\.' + +# Identifiers and reserved words + +reserved_map = {} +for r in reserved: + reserved_map[r.lower()] = r + + +def t_ID(t): + r'[A-Za-z_][\w_]*' + t.type = reserved_map.get(t.value, "ID") + return t + +# Integer literal +t_ICONST = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?' + +# Floating literal +t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?' + +# String literal +t_SCONST = r'\"([^\\\n]|(\\.))*?\"' + +# Character constant 'c' or L'c' +t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\'' + +# Comments + + +def t_comment(t): + r'/\*(.|\n)*?\*/' + t.lexer.lineno += t.value.count('\n') + +# Preprocessor directive (ignored) + + +def t_preprocessor(t): + r'\#(.)*?\n' + t.lexer.lineno += 1 + + +def t_error(t): + print("Illegal character %s" % repr(t.value[0])) + t.lexer.skip(1) + +lexer = lex.lex() +if __name__ == "__main__": + lex.runmain(lexer) diff --git a/example/ansic/cparse.py b/example/ansic/cparse.py new file mode 100644 index 000000000..5fe9bce04 --- /dev/null +++ b/example/ansic/cparse.py @@ -0,0 +1,1048 @@ +# ----------------------------------------------------------------------------- +# cparse.py +# +# Simple parser for ANSI C. Based on the grammar in K&R, 2nd Ed. 
+# ----------------------------------------------------------------------------- + +import sys +import clex +import ply.yacc as yacc + +# Get the token map +tokens = clex.tokens + +# translation-unit: + + +def p_translation_unit_1(t): + 'translation_unit : external_declaration' + pass + + +def p_translation_unit_2(t): + 'translation_unit : translation_unit external_declaration' + pass + +# external-declaration: + + +def p_external_declaration_1(t): + 'external_declaration : function_definition' + pass + + +def p_external_declaration_2(t): + 'external_declaration : declaration' + pass + +# function-definition: + + +def p_function_definition_1(t): + 'function_definition : declaration_specifiers declarator declaration_list compound_statement' + pass + + +def p_function_definition_2(t): + 'function_definition : declarator declaration_list compound_statement' + pass + + +def p_function_definition_3(t): + 'function_definition : declarator compound_statement' + pass + + +def p_function_definition_4(t): + 'function_definition : declaration_specifiers declarator compound_statement' + pass + +# declaration: + + +def p_declaration_1(t): + 'declaration : declaration_specifiers init_declarator_list SEMI' + pass + + +def p_declaration_2(t): + 'declaration : declaration_specifiers SEMI' + pass + +# declaration-list: + + +def p_declaration_list_1(t): + 'declaration_list : declaration' + pass + + +def p_declaration_list_2(t): + 'declaration_list : declaration_list declaration ' + pass + +# declaration-specifiers + + +def p_declaration_specifiers_1(t): + 'declaration_specifiers : storage_class_specifier declaration_specifiers' + pass + + +def p_declaration_specifiers_2(t): + 'declaration_specifiers : type_specifier declaration_specifiers' + pass + + +def p_declaration_specifiers_3(t): + 'declaration_specifiers : type_qualifier declaration_specifiers' + pass + + +def p_declaration_specifiers_4(t): + 'declaration_specifiers : storage_class_specifier' + pass + + +def 
p_declaration_specifiers_5(t): + 'declaration_specifiers : type_specifier' + pass + + +def p_declaration_specifiers_6(t): + 'declaration_specifiers : type_qualifier' + pass + +# storage-class-specifier + + +def p_storage_class_specifier(t): + '''storage_class_specifier : AUTO + | REGISTER + | STATIC + | EXTERN + | TYPEDEF + ''' + pass + +# type-specifier: + + +def p_type_specifier(t): + '''type_specifier : VOID + | CHAR + | SHORT + | INT + | LONG + | FLOAT + | DOUBLE + | SIGNED + | UNSIGNED + | struct_or_union_specifier + | enum_specifier + | TYPEID + ''' + pass + +# type-qualifier: + + +def p_type_qualifier(t): + '''type_qualifier : CONST + | VOLATILE''' + pass + +# struct-or-union-specifier + + +def p_struct_or_union_specifier_1(t): + 'struct_or_union_specifier : struct_or_union ID LBRACE struct_declaration_list RBRACE' + pass + + +def p_struct_or_union_specifier_2(t): + 'struct_or_union_specifier : struct_or_union LBRACE struct_declaration_list RBRACE' + pass + + +def p_struct_or_union_specifier_3(t): + 'struct_or_union_specifier : struct_or_union ID' + pass + +# struct-or-union: + + +def p_struct_or_union(t): + '''struct_or_union : STRUCT + | UNION + ''' + pass + +# struct-declaration-list: + + +def p_struct_declaration_list_1(t): + 'struct_declaration_list : struct_declaration' + pass + + +def p_struct_declaration_list_2(t): + 'struct_declaration_list : struct_declaration_list struct_declaration' + pass + +# init-declarator-list: + + +def p_init_declarator_list_1(t): + 'init_declarator_list : init_declarator' + pass + + +def p_init_declarator_list_2(t): + 'init_declarator_list : init_declarator_list COMMA init_declarator' + pass + +# init-declarator + + +def p_init_declarator_1(t): + 'init_declarator : declarator' + pass + + +def p_init_declarator_2(t): + 'init_declarator : declarator EQUALS initializer' + pass + +# struct-declaration: + + +def p_struct_declaration(t): + 'struct_declaration : specifier_qualifier_list struct_declarator_list SEMI' + pass + +# 
specifier-qualifier-list: + + +def p_specifier_qualifier_list_1(t): + 'specifier_qualifier_list : type_specifier specifier_qualifier_list' + pass + + +def p_specifier_qualifier_list_2(t): + 'specifier_qualifier_list : type_specifier' + pass + + +def p_specifier_qualifier_list_3(t): + 'specifier_qualifier_list : type_qualifier specifier_qualifier_list' + pass + + +def p_specifier_qualifier_list_4(t): + 'specifier_qualifier_list : type_qualifier' + pass + +# struct-declarator-list: + + +def p_struct_declarator_list_1(t): + 'struct_declarator_list : struct_declarator' + pass + + +def p_struct_declarator_list_2(t): + 'struct_declarator_list : struct_declarator_list COMMA struct_declarator' + pass + +# struct-declarator: + + +def p_struct_declarator_1(t): + 'struct_declarator : declarator' + pass + + +def p_struct_declarator_2(t): + 'struct_declarator : declarator COLON constant_expression' + pass + + +def p_struct_declarator_3(t): + 'struct_declarator : COLON constant_expression' + pass + +# enum-specifier: + + +def p_enum_specifier_1(t): + 'enum_specifier : ENUM ID LBRACE enumerator_list RBRACE' + pass + + +def p_enum_specifier_2(t): + 'enum_specifier : ENUM LBRACE enumerator_list RBRACE' + pass + + +def p_enum_specifier_3(t): + 'enum_specifier : ENUM ID' + pass + +# enumerator_list: + + +def p_enumerator_list_1(t): + 'enumerator_list : enumerator' + pass + + +def p_enumerator_list_2(t): + 'enumerator_list : enumerator_list COMMA enumerator' + pass + +# enumerator: + + +def p_enumerator_1(t): + 'enumerator : ID' + pass + + +def p_enumerator_2(t): + 'enumerator : ID EQUALS constant_expression' + pass + +# declarator: + + +def p_declarator_1(t): + 'declarator : pointer direct_declarator' + pass + + +def p_declarator_2(t): + 'declarator : direct_declarator' + pass + +# direct-declarator: + + +def p_direct_declarator_1(t): + 'direct_declarator : ID' + pass + + +def p_direct_declarator_2(t): + 'direct_declarator : LPAREN declarator RPAREN' + pass + + +def 
p_direct_declarator_3(t): + 'direct_declarator : direct_declarator LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_declarator_4(t): + 'direct_declarator : direct_declarator LPAREN parameter_type_list RPAREN ' + pass + + +def p_direct_declarator_5(t): + 'direct_declarator : direct_declarator LPAREN identifier_list RPAREN ' + pass + + +def p_direct_declarator_6(t): + 'direct_declarator : direct_declarator LPAREN RPAREN ' + pass + +# pointer: + + +def p_pointer_1(t): + 'pointer : TIMES type_qualifier_list' + pass + + +def p_pointer_2(t): + 'pointer : TIMES' + pass + + +def p_pointer_3(t): + 'pointer : TIMES type_qualifier_list pointer' + pass + + +def p_pointer_4(t): + 'pointer : TIMES pointer' + pass + +# type-qualifier-list: + + +def p_type_qualifier_list_1(t): + 'type_qualifier_list : type_qualifier' + pass + + +def p_type_qualifier_list_2(t): + 'type_qualifier_list : type_qualifier_list type_qualifier' + pass + +# parameter-type-list: + + +def p_parameter_type_list_1(t): + 'parameter_type_list : parameter_list' + pass + + +def p_parameter_type_list_2(t): + 'parameter_type_list : parameter_list COMMA ELLIPSIS' + pass + +# parameter-list: + + +def p_parameter_list_1(t): + 'parameter_list : parameter_declaration' + pass + + +def p_parameter_list_2(t): + 'parameter_list : parameter_list COMMA parameter_declaration' + pass + +# parameter-declaration: + + +def p_parameter_declaration_1(t): + 'parameter_declaration : declaration_specifiers declarator' + pass + + +def p_parameter_declaration_2(t): + 'parameter_declaration : declaration_specifiers abstract_declarator_opt' + pass + +# identifier-list: + + +def p_identifier_list_1(t): + 'identifier_list : ID' + pass + + +def p_identifier_list_2(t): + 'identifier_list : identifier_list COMMA ID' + pass + +# initializer: + + +def p_initializer_1(t): + 'initializer : assignment_expression' + pass + + +def p_initializer_2(t): + '''initializer : LBRACE initializer_list RBRACE + | LBRACE initializer_list COMMA 
RBRACE''' + pass + +# initializer-list: + + +def p_initializer_list_1(t): + 'initializer_list : initializer' + pass + + +def p_initializer_list_2(t): + 'initializer_list : initializer_list COMMA initializer' + pass + +# type-name: + + +def p_type_name(t): + 'type_name : specifier_qualifier_list abstract_declarator_opt' + pass + + +def p_abstract_declarator_opt_1(t): + 'abstract_declarator_opt : empty' + pass + + +def p_abstract_declarator_opt_2(t): + 'abstract_declarator_opt : abstract_declarator' + pass + +# abstract-declarator: + + +def p_abstract_declarator_1(t): + 'abstract_declarator : pointer ' + pass + + +def p_abstract_declarator_2(t): + 'abstract_declarator : pointer direct_abstract_declarator' + pass + + +def p_abstract_declarator_3(t): + 'abstract_declarator : direct_abstract_declarator' + pass + +# direct-abstract-declarator: + + +def p_direct_abstract_declarator_1(t): + 'direct_abstract_declarator : LPAREN abstract_declarator RPAREN' + pass + + +def p_direct_abstract_declarator_2(t): + 'direct_abstract_declarator : direct_abstract_declarator LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_abstract_declarator_3(t): + 'direct_abstract_declarator : LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_abstract_declarator_4(t): + 'direct_abstract_declarator : direct_abstract_declarator LPAREN parameter_type_list_opt RPAREN' + pass + + +def p_direct_abstract_declarator_5(t): + 'direct_abstract_declarator : LPAREN parameter_type_list_opt RPAREN' + pass + +# Optional fields in abstract declarators + + +def p_constant_expression_opt_1(t): + 'constant_expression_opt : empty' + pass + + +def p_constant_expression_opt_2(t): + 'constant_expression_opt : constant_expression' + pass + + +def p_parameter_type_list_opt_1(t): + 'parameter_type_list_opt : empty' + pass + + +def p_parameter_type_list_opt_2(t): + 'parameter_type_list_opt : parameter_type_list' + pass + +# statement: + + +def p_statement(t): + ''' + statement : 
labeled_statement + | expression_statement + | compound_statement + | selection_statement + | iteration_statement + | jump_statement + ''' + pass + +# labeled-statement: + + +def p_labeled_statement_1(t): + 'labeled_statement : ID COLON statement' + pass + + +def p_labeled_statement_2(t): + 'labeled_statement : CASE constant_expression COLON statement' + pass + + +def p_labeled_statement_3(t): + 'labeled_statement : DEFAULT COLON statement' + pass + +# expression-statement: + + +def p_expression_statement(t): + 'expression_statement : expression_opt SEMI' + pass + +# compound-statement: + + +def p_compound_statement_1(t): + 'compound_statement : LBRACE declaration_list statement_list RBRACE' + pass + + +def p_compound_statement_2(t): + 'compound_statement : LBRACE statement_list RBRACE' + pass + + +def p_compound_statement_3(t): + 'compound_statement : LBRACE declaration_list RBRACE' + pass + + +def p_compound_statement_4(t): + 'compound_statement : LBRACE RBRACE' + pass + +# statement-list: + + +def p_statement_list_1(t): + 'statement_list : statement' + pass + + +def p_statement_list_2(t): + 'statement_list : statement_list statement' + pass + +# selection-statement + + +def p_selection_statement_1(t): + 'selection_statement : IF LPAREN expression RPAREN statement' + pass + + +def p_selection_statement_2(t): + 'selection_statement : IF LPAREN expression RPAREN statement ELSE statement ' + pass + + +def p_selection_statement_3(t): + 'selection_statement : SWITCH LPAREN expression RPAREN statement ' + pass + +# iteration_statement: + + +def p_iteration_statement_1(t): + 'iteration_statement : WHILE LPAREN expression RPAREN statement' + pass + + +def p_iteration_statement_2(t): + 'iteration_statement : FOR LPAREN expression_opt SEMI expression_opt SEMI expression_opt RPAREN statement ' + pass + + +def p_iteration_statement_3(t): + 'iteration_statement : DO statement WHILE LPAREN expression RPAREN SEMI' + pass + +# jump_statement: + + +def p_jump_statement_1(t): + 
'jump_statement : GOTO ID SEMI' + pass + + +def p_jump_statement_2(t): + 'jump_statement : CONTINUE SEMI' + pass + + +def p_jump_statement_3(t): + 'jump_statement : BREAK SEMI' + pass + + +def p_jump_statement_4(t): + 'jump_statement : RETURN expression_opt SEMI' + pass + + +def p_expression_opt_1(t): + 'expression_opt : empty' + pass + + +def p_expression_opt_2(t): + 'expression_opt : expression' + pass + +# expression: + + +def p_expression_1(t): + 'expression : assignment_expression' + pass + + +def p_expression_2(t): + 'expression : expression COMMA assignment_expression' + pass + +# assigment_expression: + + +def p_assignment_expression_1(t): + 'assignment_expression : conditional_expression' + pass + + +def p_assignment_expression_2(t): + 'assignment_expression : unary_expression assignment_operator assignment_expression' + pass + +# assignment_operator: + + +def p_assignment_operator(t): + ''' + assignment_operator : EQUALS + | TIMESEQUAL + | DIVEQUAL + | MODEQUAL + | PLUSEQUAL + | MINUSEQUAL + | LSHIFTEQUAL + | RSHIFTEQUAL + | ANDEQUAL + | OREQUAL + | XOREQUAL + ''' + pass + +# conditional-expression + + +def p_conditional_expression_1(t): + 'conditional_expression : logical_or_expression' + pass + + +def p_conditional_expression_2(t): + 'conditional_expression : logical_or_expression CONDOP expression COLON conditional_expression ' + pass + +# constant-expression + + +def p_constant_expression(t): + 'constant_expression : conditional_expression' + pass + +# logical-or-expression + + +def p_logical_or_expression_1(t): + 'logical_or_expression : logical_and_expression' + pass + + +def p_logical_or_expression_2(t): + 'logical_or_expression : logical_or_expression LOR logical_and_expression' + pass + +# logical-and-expression + + +def p_logical_and_expression_1(t): + 'logical_and_expression : inclusive_or_expression' + pass + + +def p_logical_and_expression_2(t): + 'logical_and_expression : logical_and_expression LAND inclusive_or_expression' + pass + +# 
inclusive-or-expression: + + +def p_inclusive_or_expression_1(t): + 'inclusive_or_expression : exclusive_or_expression' + pass + + +def p_inclusive_or_expression_2(t): + 'inclusive_or_expression : inclusive_or_expression OR exclusive_or_expression' + pass + +# exclusive-or-expression: + + +def p_exclusive_or_expression_1(t): + 'exclusive_or_expression : and_expression' + pass + + +def p_exclusive_or_expression_2(t): + 'exclusive_or_expression : exclusive_or_expression XOR and_expression' + pass + +# AND-expression + + +def p_and_expression_1(t): + 'and_expression : equality_expression' + pass + + +def p_and_expression_2(t): + 'and_expression : and_expression AND equality_expression' + pass + + +# equality-expression: +def p_equality_expression_1(t): + 'equality_expression : relational_expression' + pass + + +def p_equality_expression_2(t): + 'equality_expression : equality_expression EQ relational_expression' + pass + + +def p_equality_expression_3(t): + 'equality_expression : equality_expression NE relational_expression' + pass + + +# relational-expression: +def p_relational_expression_1(t): + 'relational_expression : shift_expression' + pass + + +def p_relational_expression_2(t): + 'relational_expression : relational_expression LT shift_expression' + pass + + +def p_relational_expression_3(t): + 'relational_expression : relational_expression GT shift_expression' + pass + + +def p_relational_expression_4(t): + 'relational_expression : relational_expression LE shift_expression' + pass + + +def p_relational_expression_5(t): + 'relational_expression : relational_expression GE shift_expression' + pass + +# shift-expression + + +def p_shift_expression_1(t): + 'shift_expression : additive_expression' + pass + + +def p_shift_expression_2(t): + 'shift_expression : shift_expression LSHIFT additive_expression' + pass + + +def p_shift_expression_3(t): + 'shift_expression : shift_expression RSHIFT additive_expression' + pass + +# additive-expression + + +def 
p_additive_expression_1(t): + 'additive_expression : multiplicative_expression' + pass + + +def p_additive_expression_2(t): + 'additive_expression : additive_expression PLUS multiplicative_expression' + pass + + +def p_additive_expression_3(t): + 'additive_expression : additive_expression MINUS multiplicative_expression' + pass + +# multiplicative-expression + + +def p_multiplicative_expression_1(t): + 'multiplicative_expression : cast_expression' + pass + + +def p_multiplicative_expression_2(t): + 'multiplicative_expression : multiplicative_expression TIMES cast_expression' + pass + + +def p_multiplicative_expression_3(t): + 'multiplicative_expression : multiplicative_expression DIVIDE cast_expression' + pass + + +def p_multiplicative_expression_4(t): + 'multiplicative_expression : multiplicative_expression MOD cast_expression' + pass + +# cast-expression: + + +def p_cast_expression_1(t): + 'cast_expression : unary_expression' + pass + + +def p_cast_expression_2(t): + 'cast_expression : LPAREN type_name RPAREN cast_expression' + pass + +# unary-expression: + + +def p_unary_expression_1(t): + 'unary_expression : postfix_expression' + pass + + +def p_unary_expression_2(t): + 'unary_expression : PLUSPLUS unary_expression' + pass + + +def p_unary_expression_3(t): + 'unary_expression : MINUSMINUS unary_expression' + pass + + +def p_unary_expression_4(t): + 'unary_expression : unary_operator cast_expression' + pass + + +def p_unary_expression_5(t): + 'unary_expression : SIZEOF unary_expression' + pass + + +def p_unary_expression_6(t): + 'unary_expression : SIZEOF LPAREN type_name RPAREN' + pass + +# unary-operator + + +def p_unary_operator(t): + '''unary_operator : AND + | TIMES + | PLUS + | MINUS + | NOT + | LNOT ''' + pass + +# postfix-expression: + + +def p_postfix_expression_1(t): + 'postfix_expression : primary_expression' + pass + + +def p_postfix_expression_2(t): + 'postfix_expression : postfix_expression LBRACKET expression RBRACKET' + pass + + +def 
p_postfix_expression_3(t): + 'postfix_expression : postfix_expression LPAREN argument_expression_list RPAREN' + pass + + +def p_postfix_expression_4(t): + 'postfix_expression : postfix_expression LPAREN RPAREN' + pass + + +def p_postfix_expression_5(t): + 'postfix_expression : postfix_expression PERIOD ID' + pass + + +def p_postfix_expression_6(t): + 'postfix_expression : postfix_expression ARROW ID' + pass + + +def p_postfix_expression_7(t): + 'postfix_expression : postfix_expression PLUSPLUS' + pass + + +def p_postfix_expression_8(t): + 'postfix_expression : postfix_expression MINUSMINUS' + pass + +# primary-expression: + + +def p_primary_expression(t): + '''primary_expression : ID + | constant + | SCONST + | LPAREN expression RPAREN''' + pass + +# argument-expression-list: + + +def p_argument_expression_list(t): + '''argument_expression_list : assignment_expression + | argument_expression_list COMMA assignment_expression''' + pass + +# constant: + + +def p_constant(t): + '''constant : ICONST + | FCONST + | CCONST''' + pass + + +def p_empty(t): + 'empty : ' + pass + + +def p_error(t): + print("Whoa. We're hosed") + +import profile +# Build the grammar + +yacc.yacc() +#yacc.yacc(method='LALR',write_tables=False,debug=False) + +#profile.run("yacc.yacc(method='LALR')") diff --git a/example/calc/calc.py b/example/calc/calc.py new file mode 100644 index 000000000..824c3d7d0 --- /dev/null +++ b/example/calc/calc.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# calc.py +# +# A simple calculator with variables. This is from O'Reilly's +# "Lex and Yacc", p. 63. 
+# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +tokens = ( + 'NAME', 'NUMBER', +) + +literals = ['=', '+', '-', '*', '/', '(', ')'] + +# Tokens + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + +def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +t_ignore = " \t" + + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import ply.lex as lex +lex.lex() + +# Parsing rules + +precedence = ( + ('left', '+', '-'), + ('left', '*', '/'), + ('right', 'UMINUS'), +) + +# dictionary of names +names = {} + + +def p_statement_assign(p): + 'statement : NAME "=" expression' + names[p[1]] = p[3] + + +def p_statement_expr(p): + 'statement : expression' + print(p[1]) + + +def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + elif p[2] == '*': + p[0] = p[1] * p[3] + elif p[2] == '/': + p[0] = p[1] / p[3] + + +def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + + +def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + + +def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + + +def p_expression_name(p): + "expression : NAME" + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + +def p_error(p): + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") + +import ply.yacc as yacc +yacc.yacc() + +while 1: + try: + s = raw_input('calc > ') + except EOFError: + break + if not s: + continue + yacc.parse(s) diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py new file mode 100644 
index 000000000..06831e2ca --- /dev/null +++ b/example/calcdebug/calc.py @@ -0,0 +1,129 @@ +# ----------------------------------------------------------------------------- +# calc.py +# +# This example shows how to run the parser in a debugging mode +# with output routed to a logging object. +# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +tokens = ( + 'NAME', 'NUMBER', +) + +literals = ['=', '+', '-', '*', '/', '(', ')'] + +# Tokens + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + +def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +t_ignore = " \t" + + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import ply.lex as lex +lex.lex() + +# Parsing rules + +precedence = ( + ('left', '+', '-'), + ('left', '*', '/'), + ('right', 'UMINUS'), +) + +# dictionary of names +names = {} + + +def p_statement_assign(p): + 'statement : NAME "=" expression' + names[p[1]] = p[3] + + +def p_statement_expr(p): + 'statement : expression' + print(p[1]) + + +def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + elif p[2] == '*': + p[0] = p[1] * p[3] + elif p[2] == '/': + p[0] = p[1] / p[3] + + +def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + + +def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + + +def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + + +def p_expression_name(p): + "expression : NAME" + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + +def p_error(p): + if p: + print("Syntax error at '%s'" 
def t_eof(t):
    # End-of-input hook: ask the user for a continuation line and, if one is
    # given, feed it to the lexer and return the next token from it.
    # Returning None tells the lexer that the input really is finished.
    # (Documentation is kept in comments, as with the other t_* rules.)
    try:
        more = raw_input('... ')
    except EOFError:
        # End of file at the continuation prompt. The main loop only guards
        # its own 'calc > ' prompt, so without this the exception would
        # escape from yacc.parse() and abort the program with a traceback.
        return None
    if not more:
        return None
    t.lexer.input(more + '\n')
    return t.lexer.token()
class Parser:
    """
    Base class for a lexer/parser that has the rules defined as methods.

    Subclasses provide the ``tokens`` and ``precedence`` tables together
    with the ``t_*`` and ``p_*`` rule methods; this class builds the lexer
    and parser from them and offers a small read-and-parse loop.
    """
    tokens = ()
    precedence = ()

    def __init__(self, **kw):
        """Build the lexer and the parser from the rules on this object.

        Keyword arguments:
            debug: forwarded to ``lex.lex()`` and ``yacc.yacc()``;
                defaults to 0.
        """
        self.debug = kw.get('debug', 0)
        self.names = {}
        try:
            stem = os.path.splitext(os.path.basename(__file__))[0]
        except NameError:
            # __file__ is not defined in this execution context. Only that
            # case falls back to the generic name; a bare "except:" would
            # also have hidden KeyboardInterrupt and real errors.
            stem = "parser"
        modname = stem + "_" + self.__class__.__name__
        self.debugfile = modname + ".dbg"
        self.tabmodule = modname + "_" + "parsetab"

        # Build the lexer and parser
        lex.lex(module=self, debug=self.debug)
        yacc.yacc(module=self,
                  debug=self.debug,
                  debugfile=self.debugfile,
                  tabmodule=self.tabmodule)

    def run(self):
        """Prompt for input lines and parse each one until end of input."""
        while 1:
            try:
                s = raw_input('calc > ')
            except EOFError:
                break
            if not s:
                # Nothing typed: prompt again instead of parsing "".
                continue
            yacc.parse(s)
def p_expression_name(self, p):
    'expression : NAME'
    # The docstring above is the grammar rule, so notes go in comments.
    #
    # Resolve a variable reference: p[1] is the identifier text and the
    # value stored for it in self.names becomes the expression value.
    identifier = p[1]
    try:
        value = self.names[identifier]
    except LookupError:
        # Unknown variable: report it and evaluate the expression as 0.
        print("Undefined name '%s'" % identifier)
        value = 0
    p[0] = value
+# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +# Make a calculator function + + +def make_calculator(): + import ply.lex as lex + import ply.yacc as yacc + + # ------- Internal calculator state + + variables = {} # Dictionary of stored variables + + # ------- Calculator tokenizing rules + + tokens = ( + 'NAME', 'NUMBER', + ) + + literals = ['=', '+', '-', '*', '/', '(', ')'] + + t_ignore = " \t" + + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + lexer = lex.lex() + + # ------- Calculator parsing rules + + precedence = ( + ('left', '+', '-'), + ('left', '*', '/'), + ('right', 'UMINUS'), + ) + + def p_statement_assign(p): + 'statement : NAME "=" expression' + variables[p[1]] = p[3] + p[0] = None + + def p_statement_expr(p): + 'statement : expression' + p[0] = p[1] + + def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + elif p[2] == '*': + p[0] = p[1] * p[3] + elif p[2] == '/': + p[0] = p[1] / p[3] + + def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + + def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + + def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + + def p_expression_name(p): + "expression : NAME" + try: + p[0] = variables[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + def p_error(p): + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") + + # Build the parser 
def t_H_EDIT_DESCRIPTOR(t):
    r"\d+H.*"                     # This grabs all of the remaining text
    # Token rule for a Fortran H edit descriptor "nHc1...cn": the pattern
    # above over-matches, so the rule trims the token to the n characters
    # that follow 'H' and hands the rest of the text back to the lexer.
    i = t.value.index('H')
    # The prefix is matched by \d+, so it is always a plain decimal count.
    # int() parses it without executing input text (the original used
    # eval()) and also accepts leading zeros such as "03".
    n = int(t.value[:i])

    # Offset just past the descriptor inside the matched text. Clamp it to
    # the text that is actually there: an over-long count would otherwise
    # make the adjustment below negative and push lexpos forward past
    # input that has not been tokenized yet.
    end = min(i + 1 + n, len(t.value))

    # Adjust the tokenizing position
    t.lexer.lexpos -= len(t.value) - end

    t.value = t.value[i + 1:end]
    return t
class Parser(object):
    """
    Base class for a lexer/parser that has the rules defined as methods.

    Subclasses provide the ``tokens`` and ``precedence`` tables together
    with the ``t_*`` and ``p_*`` rule methods; this class builds the lexer
    and parser from them and offers a small read-and-parse loop.
    """
    tokens = ()
    precedence = ()

    def __init__(self, **kw):
        """Build the lexer and the parser from the rules on this object.

        Keyword arguments:
            debug: forwarded to ``lex.lex()`` and ``yacc.yacc()``;
                defaults to 0.
        """
        self.debug = kw.get('debug', 0)
        self.names = {}
        try:
            stem = os.path.splitext(os.path.basename(__file__))[0]
        except NameError:
            # Only a missing __file__ should select the generic name; the
            # previous bare "except:" swallowed every exception, including
            # KeyboardInterrupt.
            stem = "parser"
        modname = stem + "_" + self.__class__.__name__
        self.debugfile = modname + ".dbg"
        self.tabmodule = modname + "_" + "parsetab"

        # Build the lexer and parser
        lex.lex(module=self, debug=self.debug)
        yacc.yacc(module=self,
                  debug=self.debug,
                  debugfile=self.debugfile,
                  tabmodule=self.tabmodule)

    def run(self):
        """Prompt for input lines and parse each one until end of input."""
        while 1:
            try:
                s = raw_input('calc > ')
            except EOFError:
                break
            if not s:
                # Nothing typed: prompt again instead of parsing "".
                continue
            yacc.parse(s)
def p_expression_binop(self, p):
    """
    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | expression EXP expression
    """
    # The docstring above is the grammar rule, so notes go in comments.
    #
    # p[1] and p[3] are the operand values; p[2] is the operator text as it
    # was matched by the lexer. The computed value is stored in p[0].
    operations = {
        '+': lambda lhs, rhs: lhs + rhs,
        '-': lambda lhs, rhs: lhs - rhs,
        '*': lambda lhs, rhs: lhs * rhs,
        '/': lambda lhs, rhs: lhs / rhs,
        '**': lambda lhs, rhs: lhs ** rhs,
    }
    operation = operations.get(p[2])
    # Operator text outside the table leaves p[0] as it was.
    if operation is not None:
        p[0] = operation(p[1], p[3])
def p_expression_binop(t):
    '''expression : expression PLUS expression
                  | expression MINUS expression
                  | expression TIMES expression
                  | expression DIVIDE expression'''
    # The docstring above is the grammar rule, so notes go in comments.
    #
    # t[1] and t[3] are the operand values and t[2] is the matched operator
    # text; the result is stored in t[0].
    #
    # The original chain ended with a '<' branch. This file defines no
    # token for '<' and the rule above has no such alternative, so that
    # branch could never run and has been dropped.
    if t[2] == '+':
        t[0] = t[1] + t[3]
    elif t[2] == '-':
        t[0] = t[1] - t[3]
    elif t[2] == '*':
        t[0] = t[1] * t[3]
    elif t[2] == '/':
        t[0] = t[1] / t[3]
import ply.yacc as yacc
yacc.yacc(optimize=1)

# Read-eval loop: prompt until end of input and parse each line.
while 1:
    try:
        s = raw_input('calc > ')
    except EOFError:
        break
    if not s:
        # An empty line is not a statement. Skip it, as the other
        # calculator examples do, instead of handing "" to the parser,
        # where p_error() would print "Syntax error at EOF".
        continue
    yacc.parse(s)
u'-': + p[0] = p[1] - p[3] + elif p[2] == u'*': + p[0] = p[1] * p[3] + elif p[2] == u'/': + p[0] = p[1] / p[3] + + +def p_expression_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] + + +def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = p[2] + + +def p_expression_number(p): + 'expression : NUMBER' + p[0] = p[1] + + +def p_expression_name(p): + 'expression : NAME' + try: + p[0] = names[p[1]] + except LookupError: + print "Undefined name '%s'" % p[1] + p[0] = 0 + + +def p_error(p): + if p: + print "Syntax error at '%s'" % p.value + else: + print "Syntax error at EOF" + +import ply.yacc as yacc +yacc.yacc() + +while 1: + try: + s = raw_input('calc > ') + except EOFError: + break + if not s: + continue + yacc.parse(unicode(s)) diff --git a/example/yply/README b/example/yply/README new file mode 100644 index 000000000..bfadf3643 --- /dev/null +++ b/example/yply/README @@ -0,0 +1,41 @@ +yply.py + +This example implements a program yply.py that converts a UNIX-yacc +specification file into a PLY-compatible program. To use, simply +run it like this: + + % python yply.py [-nocode] inputfile.y >myparser.py + +The output of this program is Python code. In the output, +any C code in the original file is included, but is commented out. +If you use the -nocode option, then all of the C code in the +original file is just discarded. + +To use the resulting grammar with PLY, you'll need to edit the +myparser.py file. Within this file, some stub code is included that +can be used to test the construction of the parsing tables. However, +you'll need to do more editing to make a workable parser. + +Disclaimer: This is just an example I threw together in an afternoon. +It might have some bugs. However, it worked when I tried it on +a yacc-specified C++ parser containing 442 rules and 855 parsing +states. + +Comments: + +1. This example does not parse specification files meant for lex/flex. 
# Characters skipped between tokens.
t_ignore = ' \t'

# yacc directives.
t_TOKEN = r'%token'
t_LEFT = r'%left'
t_RIGHT = r'%right'
t_NONASSOC = r'%nonassoc'
t_PREC = r'%prec'
t_START = r'%start'
t_TYPE = r'%type'
t_UNION = r'%union'

t_ID = r'[a-zA-Z_][a-zA-Z_0-9]*'

# A quoted literal: an opening ' or ", the shortest run of characters, and
# then the same quote character again. The opening quote must be captured
# in a group named "quote" for the (?P=quote) backreference to work; the
# name had been lost, which left "(?P[" -- not a valid regular expression.
t_QLITERAL = r'''(?P<quote>['"]).*?(?P=quote)'''

t_NUMBER = r'\d+'
def t_code_rbrace(t):
    r'\}'
    # The docstring above is the token's regular expression, so notes go in
    # comments.
    #
    # A closing brace inside the 'code' state. Each one lowers the nesting
    # level; the brace that brings it back to zero ends the code block.
    lexer = t.lexer
    lexer.level -= 1
    if lexer.level != 0:
        # Still inside nested braces: no token is produced.
        return None

    # Outermost brace closed: emit everything from the recorded start of
    # the block through this brace as a single CODE token.
    t.type = 'CODE'
    t.value = lexer.lexdata[lexer.codestart:t.lexpos + 1]
    lexer.begin('INITIAL')
    lexer.lineno += t.value.count('\n')
    return t
def p_idlist(p):
    '''idlist : idlist optcomma tokenid
              | tokenid'''
    # The docstring above is the grammar rule, so notes go in comments.
    #
    # Collect token ids into a list. The recursive alternative extends the
    # list built so far in place; the single-item alternative starts it.
    if len(p) != 2:
        collected = p[1]
        collected.append(p[3])
        p[0] = collected
        return
    p[0] = [p[1]]
def p_error(p):
    # Syntax-error hook for the yacc-grammar parser. The original body was
    # "pass", so a malformed input file was converted without any
    # indication that something had gone wrong.
    #
    # p is the offending token, or None when the input ended too early.
    # The report goes to stderr because stdout carries the generated
    # Python code.
    import sys  # local import: yparse.py has no module-level "import sys"

    if p is None:
        message = "yply: syntax error at end of input\n"
    else:
        message = "yply: syntax error at line %s near %r\n" % (
            getattr(p, 'lineno', '?'), p.value)
    sys.stderr.write(message)
# Command line: yply.py [-nocode] inputfile
if len(sys.argv) == 1:
    print("usage : yply.py [-nocode] inputfile")
    raise SystemExit

if len(sys.argv) == 3:
    if sys.argv[1] == '-nocode':
        # Drop the C code from the output instead of emitting it as
        # comments.
        yparse.emit_code = 0
    else:
        print("Unknown option '%s'" % sys.argv[1])
        raise SystemExit
    filename = sys.argv[2]
else:
    filename = sys.argv[1]

# Read the whole grammar up front and close the file before parsing; the
# original left the handle open for the garbage collector to deal with.
with open(filename) as grammar_file:
    grammar_text = grammar_file.read()

yacc.parse(grammar_text)

# Trailer appended to the generated module. print() with a single string
# argument produces the same output on Python 2 and Python 3.
print("""
if __name__ == '__main__':
    from ply import *
    yacc.yacc()
""")
# Line comment
def t_CPP_COMMENT2(t):
    r'(//.*?(\n|$))'
    # The docstring above is the token's regular expression, so notes go in
    # comments.
    #
    # The match includes the newline that ends the comment (if there is
    # one), so count it the same way t_CPP_WS and t_CPP_COMMENT1 do.
    t.lexer.lineno += t.value.count("\n")
    # replace with '\n'
    t.type = 'CPP_WS'; t.value = '\n'
    # The token has to be returned: a rule that returns nothing discards
    # its match, which dropped the line break together with the comment.
    return t
class Macro(object):
    """Information about one preprocessor macro.

    Attributes:
        name: macro name (string).
        value: macro value (a list of tokens).
        arglist: list of argument names, or None.
        variadic: whether the macro is variadic.
        vararg: name of the variadic parameter (the last entry of
            ``arglist``), or None when the macro is not variadic.
        source: initialised to None.
    """

    def __init__(self,name,value,arglist=None,variadic=False):
        self.name = name
        self.value = value
        self.arglist = arglist
        self.variadic = variadic
        # Always create the attribute. It used to exist only on variadic
        # macros, so reading .vararg on any other macro raised
        # AttributeError.
        self.vararg = None
        if variadic:
            self.vararg = arglist[-1]
        self.source = None
# Contains macro definitions,
# include directories, and other information
# ------------------------------------------------------------------

class Preprocessor(object):
    def __init__(self,lexer=None):
        """Bind the preprocessor to *lexer* (the default lexer when None),
        probe it for token types and predefine __DATE__ and __TIME__."""
        self.lexer = lex.lexer if lexer is None else lexer
        self.macros = {}
        self.path = []
        self.temp_path = []

        # Discover how this lexer names the tokens the preprocessor relies on
        self.lexprobe()

        # Both predefined macros are stamped from the same instant
        now = time.localtime()
        self.define("__DATE__ \"%s\"" % time.strftime("%b %d %Y",now))
        self.define("__TIME__ \"%s\"" % time.strftime("%H:%M:%S",now))
        self.parser = None

    # -----------------------------------------------------------------------------
    # tokenize()
    #
    # Helper: run the lexer over a string and return all of its tokens as a list
    # -----------------------------------------------------------------------------

    def tokenize(self,text):
        """Return the list of tokens the lexer produces for *text*."""
        collected = []
        self.lexer.input(text)
        tok = self.lexer.token()
        # A false token marks the end of the input
        while tok:
            collected.append(tok)
            tok = self.lexer.token()
        return collected

    # ---------------------------------------------------------------------
    # error()
    #
    # Print a preprocessor diagnostic as "file:line message"
    # ----------------------------------------------------------------------

    def error(self,file,line,msg):
        """Print *msg* prefixed by its file name and line number."""
        report = "%s:%d %s" % (file,line,msg)
        print(report)

    # ----------------------------------------------------------------------
    # lexprobe()
    #
    # This method probes the preprocessor lexer object to discover
    # the token types of symbols that are important to the preprocessor.
    # If this works right, the preprocessor will simply "work"
    # with any suitable lexer regardless of how tokens have been named.
+ # ---------------------------------------------------------------------- + + def lexprobe(self): + + # Determine the token type for identifiers + self.lexer.input("identifier") + tok = self.lexer.token() + if not tok or tok.value != "identifier": + print("Couldn't determine identifier type") + else: + self.t_ID = tok.type + + # Determine the token type for integers + self.lexer.input("12345") + tok = self.lexer.token() + if not tok or int(tok.value) != 12345: + print("Couldn't determine integer type") + else: + self.t_INTEGER = tok.type + self.t_INTEGER_TYPE = type(tok.value) + + # Determine the token type for strings enclosed in double quotes + self.lexer.input("\"filename\"") + tok = self.lexer.token() + if not tok or tok.value != "\"filename\"": + print("Couldn't determine string type") + else: + self.t_STRING = tok.type + + # Determine the token type for whitespace--if any + self.lexer.input(" ") + tok = self.lexer.token() + if not tok or tok.value != " ": + self.t_SPACE = None + else: + self.t_SPACE = tok.type + + # Determine the token type for newlines + self.lexer.input("\n") + tok = self.lexer.token() + if not tok or tok.value != "\n": + self.t_NEWLINE = None + print("Couldn't determine token for newlines") + else: + self.t_NEWLINE = tok.type + + self.t_WS = (self.t_SPACE, self.t_NEWLINE) + + # Check for other characters used by the preprocessor + chars = [ '<','>','#','##','\\','(',')',',','.'] + for c in chars: + self.lexer.input(c) + tok = self.lexer.token() + if not tok or tok.value != c: + print("Unable to lex '%s' required for preprocessor" % c) + + # ---------------------------------------------------------------------- + # add_path() + # + # Adds a search path to the preprocessor. 
+ # ---------------------------------------------------------------------- + + def add_path(self,path): + self.path.append(path) + + # ---------------------------------------------------------------------- + # group_lines() + # + # Given an input string, this function splits it into lines. Trailing whitespace + # is removed. Any line ending with \ is grouped with the next line. This + # function forms the lowest level of the preprocessor---grouping into text into + # a line-by-line format. + # ---------------------------------------------------------------------- + + def group_lines(self,input): + lex = self.lexer.clone() + lines = [x.rstrip() for x in input.splitlines()] + for i in xrange(len(lines)): + j = i+1 + while lines[i].endswith('\\') and (j < len(lines)): + lines[i] = lines[i][:-1]+lines[j] + lines[j] = "" + j += 1 + + input = "\n".join(lines) + lex.input(input) + lex.lineno = 1 + + current_line = [] + while True: + tok = lex.token() + if not tok: + break + current_line.append(tok) + if tok.type in self.t_WS and '\n' in tok.value: + yield current_line + current_line = [] + + if current_line: + yield current_line + + # ---------------------------------------------------------------------- + # tokenstrip() + # + # Remove leading/trailing whitespace tokens from a token list + # ---------------------------------------------------------------------- + + def tokenstrip(self,tokens): + i = 0 + while i < len(tokens) and tokens[i].type in self.t_WS: + i += 1 + del tokens[:i] + i = len(tokens)-1 + while i >= 0 and tokens[i].type in self.t_WS: + i -= 1 + del tokens[i+1:] + return tokens + + + # ---------------------------------------------------------------------- + # collect_args() + # + # Collects comma separated arguments from a list of tokens. The arguments + # must be enclosed in parenthesis. 
Returns a tuple (tokencount,args,positions) + # where tokencount is the number of tokens consumed, args is a list of arguments, + # and positions is a list of integers containing the starting index of each + # argument. Each argument is represented by a list of tokens. + # + # When collecting arguments, leading and trailing whitespace is removed + # from each argument. + # + # This function properly handles nested parenthesis and commas---these do not + # define new arguments. + # ---------------------------------------------------------------------- + + def collect_args(self,tokenlist): + args = [] + positions = [] + current_arg = [] + nesting = 1 + tokenlen = len(tokenlist) + + # Search for the opening '('. + i = 0 + while (i < tokenlen) and (tokenlist[i].type in self.t_WS): + i += 1 + + if (i < tokenlen) and (tokenlist[i].value == '('): + positions.append(i+1) + else: + self.error(self.source,tokenlist[0].lineno,"Missing '(' in macro arguments") + return 0, [], [] + + i += 1 + + while i < tokenlen: + t = tokenlist[i] + if t.value == '(': + current_arg.append(t) + nesting += 1 + elif t.value == ')': + nesting -= 1 + if nesting == 0: + if current_arg: + args.append(self.tokenstrip(current_arg)) + positions.append(i) + return i+1,args,positions + current_arg.append(t) + elif t.value == ',' and nesting == 1: + args.append(self.tokenstrip(current_arg)) + positions.append(i+1) + current_arg = [] + else: + current_arg.append(t) + i += 1 + + # Missing end argument + self.error(self.source,tokenlist[-1].lineno,"Missing ')' in macro arguments") + return 0, [],[] + + # ---------------------------------------------------------------------- + # macro_prescan() + # + # Examine the macro value (token sequence) and identify patch points + # This is used to speed up macro expansion later on---we'll know + # right away where to apply patches to the value to form the expansion + # ---------------------------------------------------------------------- + + def 
macro_prescan(self,macro): + macro.patch = [] # Standard macro arguments + macro.str_patch = [] # String conversion expansion + macro.var_comma_patch = [] # Variadic macro comma patch + i = 0 + while i < len(macro.value): + if macro.value[i].type == self.t_ID and macro.value[i].value in macro.arglist: + argnum = macro.arglist.index(macro.value[i].value) + # Conversion of argument to a string + if i > 0 and macro.value[i-1].value == '#': + macro.value[i] = copy.copy(macro.value[i]) + macro.value[i].type = self.t_STRING + del macro.value[i-1] + macro.str_patch.append((argnum,i-1)) + continue + # Concatenation + elif (i > 0 and macro.value[i-1].value == '##'): + macro.patch.append(('c',argnum,i-1)) + del macro.value[i-1] + continue + elif ((i+1) < len(macro.value) and macro.value[i+1].value == '##'): + macro.patch.append(('c',argnum,i)) + i += 1 + continue + # Standard expansion + else: + macro.patch.append(('e',argnum,i)) + elif macro.value[i].value == '##': + if macro.variadic and (i > 0) and (macro.value[i-1].value == ',') and \ + ((i+1) < len(macro.value)) and (macro.value[i+1].type == self.t_ID) and \ + (macro.value[i+1].value == macro.vararg): + macro.var_comma_patch.append(i-1) + i += 1 + macro.patch.sort(key=lambda x: x[2],reverse=True) + + # ---------------------------------------------------------------------- + # macro_expand_args() + # + # Given a Macro and list of arguments (each a token list), this method + # returns an expanded version of a macro. The return value is a token sequence + # representing the replacement macro tokens + # ---------------------------------------------------------------------- + + def macro_expand_args(self,macro,args): + # Make a copy of the macro token sequence + rep = [copy.copy(_x) for _x in macro.value] + + # Make string expansion patches. 
These do not alter the length of the replacement sequence + + str_expansion = {} + for argnum, i in macro.str_patch: + if argnum not in str_expansion: + str_expansion[argnum] = ('"%s"' % "".join([x.value for x in args[argnum]])).replace("\\","\\\\") + rep[i] = copy.copy(rep[i]) + rep[i].value = str_expansion[argnum] + + # Make the variadic macro comma patch. If the variadic macro argument is empty, we get rid + comma_patch = False + if macro.variadic and not args[-1]: + for i in macro.var_comma_patch: + rep[i] = None + comma_patch = True + + # Make all other patches. The order of these matters. It is assumed that the patch list + # has been sorted in reverse order of patch location since replacements will cause the + # size of the replacement sequence to expand from the patch point. + + expanded = { } + for ptype, argnum, i in macro.patch: + # Concatenation. Argument is left unexpanded + if ptype == 'c': + rep[i:i+1] = args[argnum] + # Normal expansion. Argument is macro expanded first + elif ptype == 'e': + if argnum not in expanded: + expanded[argnum] = self.expand_macros(args[argnum]) + rep[i:i+1] = expanded[argnum] + + # Get rid of removed comma if necessary + if comma_patch: + rep = [_i for _i in rep if _i] + + return rep + + + # ---------------------------------------------------------------------- + # expand_macros() + # + # Given a list of tokens, this function performs macro expansion. + # The expanded argument is a dictionary that contains macros already + # expanded. This is used to prevent infinite recursion. 
+ # ---------------------------------------------------------------------- + + def expand_macros(self,tokens,expanded=None): + if expanded is None: + expanded = {} + i = 0 + while i < len(tokens): + t = tokens[i] + if t.type == self.t_ID: + if t.value in self.macros and t.value not in expanded: + # Yes, we found a macro match + expanded[t.value] = True + + m = self.macros[t.value] + if not m.arglist: + # A simple macro + ex = self.expand_macros([copy.copy(_x) for _x in m.value],expanded) + for e in ex: + e.lineno = t.lineno + tokens[i:i+1] = ex + i += len(ex) + else: + # A macro with arguments + j = i + 1 + while j < len(tokens) and tokens[j].type in self.t_WS: + j += 1 + if tokens[j].value == '(': + tokcount,args,positions = self.collect_args(tokens[j:]) + if not m.variadic and len(args) != len(m.arglist): + self.error(self.source,t.lineno,"Macro %s requires %d arguments" % (t.value,len(m.arglist))) + i = j + tokcount + elif m.variadic and len(args) < len(m.arglist)-1: + if len(m.arglist) > 2: + self.error(self.source,t.lineno,"Macro %s must have at least %d arguments" % (t.value, len(m.arglist)-1)) + else: + self.error(self.source,t.lineno,"Macro %s must have at least %d argument" % (t.value, len(m.arglist)-1)) + i = j + tokcount + else: + if m.variadic: + if len(args) == len(m.arglist)-1: + args.append([]) + else: + args[len(m.arglist)-1] = tokens[j+positions[len(m.arglist)-1]:j+tokcount-1] + del args[len(m.arglist):] + + # Get macro replacement text + rep = self.macro_expand_args(m,args) + rep = self.expand_macros(rep,expanded) + for r in rep: + r.lineno = t.lineno + tokens[i:j+tokcount] = rep + i += len(rep) + del expanded[t.value] + continue + elif t.value == '__LINE__': + t.type = self.t_INTEGER + t.value = self.t_INTEGER_TYPE(t.lineno) + + i += 1 + return tokens + + # ---------------------------------------------------------------------- + # evalexpr() + # + # Evaluate an expression token sequence for the purposes of evaluating + # integral expressions. 
+ # ---------------------------------------------------------------------- + + def evalexpr(self,tokens): + # tokens = tokenize(line) + # Search for defined macros + i = 0 + while i < len(tokens): + if tokens[i].type == self.t_ID and tokens[i].value == 'defined': + j = i + 1 + needparen = False + result = "0L" + while j < len(tokens): + if tokens[j].type in self.t_WS: + j += 1 + continue + elif tokens[j].type == self.t_ID: + if tokens[j].value in self.macros: + result = "1L" + else: + result = "0L" + if not needparen: break + elif tokens[j].value == '(': + needparen = True + elif tokens[j].value == ')': + break + else: + self.error(self.source,tokens[i].lineno,"Malformed defined()") + j += 1 + tokens[i].type = self.t_INTEGER + tokens[i].value = self.t_INTEGER_TYPE(result) + del tokens[i+1:j+1] + i += 1 + tokens = self.expand_macros(tokens) + for i,t in enumerate(tokens): + if t.type == self.t_ID: + tokens[i] = copy.copy(t) + tokens[i].type = self.t_INTEGER + tokens[i].value = self.t_INTEGER_TYPE("0L") + elif t.type == self.t_INTEGER: + tokens[i] = copy.copy(t) + # Strip off any trailing suffixes + tokens[i].value = str(tokens[i].value) + while tokens[i].value[-1] not in "0123456789abcdefABCDEF": + tokens[i].value = tokens[i].value[:-1] + + expr = "".join([str(x.value) for x in tokens]) + expr = expr.replace("&&"," and ") + expr = expr.replace("||"," or ") + expr = expr.replace("!"," not ") + try: + result = eval(expr) + except Exception: + self.error(self.source,tokens[0].lineno,"Couldn't evaluate expression") + result = 0 + return result + + # ---------------------------------------------------------------------- + # parsegen() + # + # Parse an input string/ + # ---------------------------------------------------------------------- + def parsegen(self,input,source=None): + + # Replace trigraph sequences + t = trigraph(input) + lines = self.group_lines(t) + + if not source: + source = "" + + self.define("__FILE__ \"%s\"" % source) + + self.source = source + chunk = 
[] + enable = True + iftrigger = False + ifstack = [] + + for x in lines: + for i,tok in enumerate(x): + if tok.type not in self.t_WS: break + if tok.value == '#': + # Preprocessor directive + + # insert necessary whitespace instead of eaten tokens + for tok in x: + if tok.type in self.t_WS and '\n' in tok.value: + chunk.append(tok) + + dirtokens = self.tokenstrip(x[i+1:]) + if dirtokens: + name = dirtokens[0].value + args = self.tokenstrip(dirtokens[1:]) + else: + name = "" + args = [] + + if name == 'define': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + self.define(args) + elif name == 'include': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + oldfile = self.macros['__FILE__'] + for tok in self.include(args): + yield tok + self.macros['__FILE__'] = oldfile + self.source = source + elif name == 'undef': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + self.undef(args) + elif name == 'ifdef': + ifstack.append((enable,iftrigger)) + if enable: + if not args[0].value in self.macros: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'ifndef': + ifstack.append((enable,iftrigger)) + if enable: + if args[0].value in self.macros: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'if': + ifstack.append((enable,iftrigger)) + if enable: + result = self.evalexpr(args) + if not result: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'elif': + if ifstack: + if ifstack[-1][0]: # We only pay attention if outer "if" allows this + if enable: # If already true, we flip enable False + enable = False + elif not iftrigger: # If False, but not triggered yet, we'll check expression + result = self.evalexpr(args) + if result: + enable = True + iftrigger = True + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #elif") + + elif name == 'else': + if ifstack: + if ifstack[-1][0]: + if enable: + 
enable = False + elif not iftrigger: + enable = True + iftrigger = True + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #else") + + elif name == 'endif': + if ifstack: + enable,iftrigger = ifstack.pop() + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #endif") + else: + # Unknown preprocessor directive + pass + + else: + # Normal text + if enable: + chunk.extend(x) + + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + + # ---------------------------------------------------------------------- + # include() + # + # Implementation of file-inclusion + # ---------------------------------------------------------------------- + + def include(self,tokens): + # Try to extract the filename and then process an include file + if not tokens: + return + if tokens: + if tokens[0].value != '<' and tokens[0].type != self.t_STRING: + tokens = self.expand_macros(tokens) + + if tokens[0].value == '<': + # Include <...> + i = 1 + while i < len(tokens): + if tokens[i].value == '>': + break + i += 1 + else: + print("Malformed #include <...>") + return + filename = "".join([x.value for x in tokens[1:i]]) + path = self.path + [""] + self.temp_path + elif tokens[0].type == self.t_STRING: + filename = tokens[0].value[1:-1] + path = self.temp_path + [""] + self.path + else: + print("Malformed #include statement") + return + for p in path: + iname = os.path.join(p,filename) + try: + data = open(iname,"r").read() + dname = os.path.dirname(iname) + if dname: + self.temp_path.insert(0,dname) + for tok in self.parsegen(data,filename): + yield tok + if dname: + del self.temp_path[0] + break + except IOError: + pass + else: + print("Couldn't find '%s'" % filename) + + # ---------------------------------------------------------------------- + # define() + # + # Define a new macro + # ---------------------------------------------------------------------- + + def define(self,tokens): + if isinstance(tokens,STRING_TYPES): + tokens = self.tokenize(tokens) + + 
linetok = tokens + try: + name = linetok[0] + if len(linetok) > 1: + mtype = linetok[1] + else: + mtype = None + if not mtype: + m = Macro(name.value,[]) + self.macros[name.value] = m + elif mtype.type in self.t_WS: + # A normal macro + m = Macro(name.value,self.tokenstrip(linetok[2:])) + self.macros[name.value] = m + elif mtype.value == '(': + # A macro with arguments + tokcount, args, positions = self.collect_args(linetok[1:]) + variadic = False + for a in args: + if variadic: + print("No more arguments may follow a variadic argument") + break + astr = "".join([str(_i.value) for _i in a]) + if astr == "...": + variadic = True + a[0].type = self.t_ID + a[0].value = '__VA_ARGS__' + variadic = True + del a[1:] + continue + elif astr[-3:] == "..." and a[0].type == self.t_ID: + variadic = True + del a[1:] + # If, for some reason, "." is part of the identifier, strip off the name for the purposes + # of macro expansion + if a[0].value[-3:] == '...': + a[0].value = a[0].value[:-3] + continue + if len(a) > 1 or a[0].type != self.t_ID: + print("Invalid macro argument") + break + else: + mvalue = self.tokenstrip(linetok[1+tokcount:]) + i = 0 + while i < len(mvalue): + if i+1 < len(mvalue): + if mvalue[i].type in self.t_WS and mvalue[i+1].value == '##': + del mvalue[i] + continue + elif mvalue[i].value == '##' and mvalue[i+1].type in self.t_WS: + del mvalue[i+1] + i += 1 + m = Macro(name.value,mvalue,[x[0].value for x in args],variadic) + self.macro_prescan(m) + self.macros[name.value] = m + else: + print("Bad macro definition") + except LookupError: + print("Bad macro definition") + + # ---------------------------------------------------------------------- + # undef() + # + # Undefine a macro + # ---------------------------------------------------------------------- + + def undef(self,tokens): + id = tokens[0].value + try: + del self.macros[id] + except LookupError: + pass + + # ---------------------------------------------------------------------- + # parse() + # + # 
Parse input text. + # ---------------------------------------------------------------------- + def parse(self,input,source=None,ignore={}): + self.ignore = ignore + self.parser = self.parsegen(input,source) + + # ---------------------------------------------------------------------- + # token() + # + # Method to return individual tokens + # ---------------------------------------------------------------------- + def token(self): + try: + while True: + tok = next(self.parser) + if tok.type not in self.ignore: return tok + except StopIteration: + self.parser = None + return None + +if __name__ == '__main__': + import ply.lex as lex + lexer = lex.lex() + + # Run a preprocessor + import sys + f = open(sys.argv[1]) + input = f.read() + + p = Preprocessor(lexer) + p.parse(input,sys.argv[1]) + while True: + tok = p.token() + if not tok: break + print(p.source, tok) + + + + + + + + + + + diff --git a/ply/ctokens.py b/ply/ctokens.py new file mode 100644 index 000000000..f6f6952d6 --- /dev/null +++ b/ply/ctokens.py @@ -0,0 +1,133 @@ +# ---------------------------------------------------------------------- +# ctokens.py +# +# Token specifications for symbols in ANSI C and C++. This file is +# meant to be used as a library in other tokenizers. 
+# ---------------------------------------------------------------------- + +# Reserved words + +tokens = [ + # Literals (identifier, integer constant, float constant, string constant, char const) + 'ID', 'TYPEID', 'INTEGER', 'FLOAT', 'STRING', 'CHARACTER', + + # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=) + 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MODULO', + 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT', + 'LOR', 'LAND', 'LNOT', + 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE', + + # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=) + 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL', + 'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', + + # Increment/decrement (++,--) + 'INCREMENT', 'DECREMENT', + + # Structure dereference (->) + 'ARROW', + + # Ternary operator (?) + 'TERNARY', + + # Delimeters ( ) [ ] { } , . ; : + 'LPAREN', 'RPAREN', + 'LBRACKET', 'RBRACKET', + 'LBRACE', 'RBRACE', + 'COMMA', 'PERIOD', 'SEMI', 'COLON', + + # Ellipsis (...) + 'ELLIPSIS', +] + +# Operators +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_MODULO = r'%' +t_OR = r'\|' +t_AND = r'&' +t_NOT = r'~' +t_XOR = r'\^' +t_LSHIFT = r'<<' +t_RSHIFT = r'>>' +t_LOR = r'\|\|' +t_LAND = r'&&' +t_LNOT = r'!' +t_LT = r'<' +t_GT = r'>' +t_LE = r'<=' +t_GE = r'>=' +t_EQ = r'==' +t_NE = r'!=' + +# Assignment operators + +t_EQUALS = r'=' +t_TIMESEQUAL = r'\*=' +t_DIVEQUAL = r'/=' +t_MODEQUAL = r'%=' +t_PLUSEQUAL = r'\+=' +t_MINUSEQUAL = r'-=' +t_LSHIFTEQUAL = r'<<=' +t_RSHIFTEQUAL = r'>>=' +t_ANDEQUAL = r'&=' +t_OREQUAL = r'\|=' +t_XOREQUAL = r'\^=' + +# Increment/decrement +t_INCREMENT = r'\+\+' +t_DECREMENT = r'--' + +# -> +t_ARROW = r'->' + +# ? +t_TERNARY = r'\?' + +# Delimeters +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LBRACKET = r'\[' +t_RBRACKET = r'\]' +t_LBRACE = r'\{' +t_RBRACE = r'\}' +t_COMMA = r',' +t_PERIOD = r'\.' +t_SEMI = r';' +t_COLON = r':' +t_ELLIPSIS = r'\.\.\.' 
+ +# Identifiers +t_ID = r'[A-Za-z_][A-Za-z0-9_]*' + +# Integer literal +t_INTEGER = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?' + +# Floating literal +t_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?' + +# String literal +t_STRING = r'\"([^\\\n]|(\\.))*?\"' + +# Character constant 'c' or L'c' +t_CHARACTER = r'(L)?\'([^\\\n]|(\\.))*?\'' + +# Comment (C-Style) +def t_COMMENT(t): + r'/\*(.|\n)*?\*/' + t.lexer.lineno += t.value.count('\n') + return t + +# Comment (C++-Style) +def t_CPPCOMMENT(t): + r'//.*\n' + t.lexer.lineno += 1 + return t + + + + + + diff --git a/ply/lex.py b/ply/lex.py new file mode 100644 index 000000000..f548622f9 --- /dev/null +++ b/ply/lex.py @@ -0,0 +1,1100 @@ +# ----------------------------------------------------------------------------- +# ply: lex.py +# +# Copyright (C) 2001-2016 +# David M. Beazley (Dabeaz LLC) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of the David Beazley or Dabeaz LLC may be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- + +__version__ = '3.10' +__tabversion__ = '3.10' + +import re +import sys +import types +import copy +import os +import inspect + +# This tuple contains known string types +try: + # Python 2.6 + StringTypes = (types.StringType, types.UnicodeType) +except AttributeError: + # Python 3.0 + StringTypes = (str, bytes) + +# This regular expression is used to match valid token names +_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') + +# Exception thrown when invalid token encountered and no default error +# handler is defined. +class LexError(Exception): + def __init__(self, message, s): + self.args = (message,) + self.text = s + + +# Token class. This class is used to represent the tokens produced. +class LexToken(object): + def __str__(self): + return 'LexToken(%s,%r,%d,%d)' % (self.type, self.value, self.lineno, self.lexpos) + + def __repr__(self): + return str(self) + + +# This object is a stand-in for a logging object created by the +# logging module. 
+ +class PlyLogger(object): + def __init__(self, f): + self.f = f + + def critical(self, msg, *args, **kwargs): + self.f.write((msg % args) + '\n') + + def warning(self, msg, *args, **kwargs): + self.f.write('WARNING: ' + (msg % args) + '\n') + + def error(self, msg, *args, **kwargs): + self.f.write('ERROR: ' + (msg % args) + '\n') + + info = critical + debug = critical + + +# Null logger is used when no output is generated. Does nothing. +class NullLogger(object): + def __getattribute__(self, name): + return self + + def __call__(self, *args, **kwargs): + return self + + +# ----------------------------------------------------------------------------- +# === Lexing Engine === +# +# The following Lexer class implements the lexer runtime. There are only +# a few public methods and attributes: +# +# input() - Store a new string in the lexer +# token() - Get the next token +# clone() - Clone the lexer +# +# lineno - Current line number +# lexpos - Current position in the input string +# ----------------------------------------------------------------------------- + +class Lexer: + def __init__(self): + self.lexre = None # Master regular expression. 
This is a list of + # tuples (re, findex) where re is a compiled + # regular expression and findex is a list + # mapping regex group numbers to rules + self.lexretext = None # Current regular expression strings + self.lexstatere = {} # Dictionary mapping lexer states to master regexs + self.lexstateretext = {} # Dictionary mapping lexer states to regex strings + self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names + self.lexstate = 'INITIAL' # Current lexer state + self.lexstatestack = [] # Stack of lexer states + self.lexstateinfo = None # State information + self.lexstateignore = {} # Dictionary of ignored characters for each state + self.lexstateerrorf = {} # Dictionary of error functions for each state + self.lexstateeoff = {} # Dictionary of eof functions for each state + self.lexreflags = 0 # Optional re compile flags + self.lexdata = None # Actual input data (as a string) + self.lexpos = 0 # Current position in input text + self.lexlen = 0 # Length of the input text + self.lexerrorf = None # Error rule (if any) + self.lexeoff = None # EOF rule (if any) + self.lextokens = None # List of valid tokens + self.lexignore = '' # Ignored characters + self.lexliterals = '' # Literal characters that can be passed through + self.lexmodule = None # Module + self.lineno = 1 # Current line number + self.lexoptimize = False # Optimized mode + + def clone(self, object=None): + c = copy.copy(self) + + # If the object parameter has been supplied, it means we are attaching the + # lexer to a new object. In this case, we have to rebind all methods in + # the lexstatere and lexstateerrorf tables. 
+ + if object: + newtab = {} + for key, ritem in self.lexstatere.items(): + newre = [] + for cre, findex in ritem: + newfindex = [] + for f in findex: + if not f or not f[0]: + newfindex.append(f) + continue + newfindex.append((getattr(object, f[0].__name__), f[1])) + newre.append((cre, newfindex)) + newtab[key] = newre + c.lexstatere = newtab + c.lexstateerrorf = {} + for key, ef in self.lexstateerrorf.items(): + c.lexstateerrorf[key] = getattr(object, ef.__name__) + c.lexmodule = object + return c + + # ------------------------------------------------------------ + # writetab() - Write lexer information to a table file + # ------------------------------------------------------------ + def writetab(self, lextab, outputdir=''): + if isinstance(lextab, types.ModuleType): + raise IOError("Won't overwrite existing lextab module") + basetabmodule = lextab.split('.')[-1] + filename = os.path.join(outputdir, basetabmodule) + '.py' + with open(filename, 'w') as tf: + tf.write('# %s.py. This file automatically created by PLY (version %s). 
Don\'t edit!\n' % (basetabmodule, __version__)) + tf.write('_tabversion = %s\n' % repr(__tabversion__)) + tf.write('_lextokens = set(%s)\n' % repr(tuple(self.lextokens))) + tf.write('_lexreflags = %s\n' % repr(self.lexreflags)) + tf.write('_lexliterals = %s\n' % repr(self.lexliterals)) + tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo)) + + # Rewrite the lexstatere table, replacing function objects with function names + tabre = {} + for statename, lre in self.lexstatere.items(): + titem = [] + for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]): + titem.append((retext, _funcs_to_names(func, renames))) + tabre[statename] = titem + + tf.write('_lexstatere = %s\n' % repr(tabre)) + tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore)) + + taberr = {} + for statename, ef in self.lexstateerrorf.items(): + taberr[statename] = ef.__name__ if ef else None + tf.write('_lexstateerrorf = %s\n' % repr(taberr)) + + tabeof = {} + for statename, ef in self.lexstateeoff.items(): + tabeof[statename] = ef.__name__ if ef else None + tf.write('_lexstateeoff = %s\n' % repr(tabeof)) + + # ------------------------------------------------------------ + # readtab() - Read lexer information from a tab file + # ------------------------------------------------------------ + def readtab(self, tabfile, fdict): + if isinstance(tabfile, types.ModuleType): + lextab = tabfile + else: + exec('import %s' % tabfile) + lextab = sys.modules[tabfile] + + if getattr(lextab, '_tabversion', '0.0') != __tabversion__: + raise ImportError('Inconsistent PLY version') + + self.lextokens = lextab._lextokens + self.lexreflags = lextab._lexreflags + self.lexliterals = lextab._lexliterals + self.lextokens_all = self.lextokens | set(self.lexliterals) + self.lexstateinfo = lextab._lexstateinfo + self.lexstateignore = lextab._lexstateignore + self.lexstatere = {} + self.lexstateretext = {} + for statename, lre in lextab._lexstatere.items(): + 
titem = [] + txtitem = [] + for pat, func_name in lre: + titem.append((re.compile(pat, lextab._lexreflags | re.VERBOSE), _names_to_funcs(func_name, fdict))) + + self.lexstatere[statename] = titem + self.lexstateretext[statename] = txtitem + + self.lexstateerrorf = {} + for statename, ef in lextab._lexstateerrorf.items(): + self.lexstateerrorf[statename] = fdict[ef] + + self.lexstateeoff = {} + for statename, ef in lextab._lexstateeoff.items(): + self.lexstateeoff[statename] = fdict[ef] + + self.begin('INITIAL') + + # ------------------------------------------------------------ + # input() - Push a new string into the lexer + # ------------------------------------------------------------ + def input(self, s): + # Pull off the first character to see if s looks like a string + c = s[:1] + if not isinstance(c, StringTypes): + raise ValueError('Expected a string') + self.lexdata = s + self.lexpos = 0 + self.lexlen = len(s) + + # ------------------------------------------------------------ + # begin() - Changes the lexing state + # ------------------------------------------------------------ + def begin(self, state): + if state not in self.lexstatere: + raise ValueError('Undefined state') + self.lexre = self.lexstatere[state] + self.lexretext = self.lexstateretext[state] + self.lexignore = self.lexstateignore.get(state, '') + self.lexerrorf = self.lexstateerrorf.get(state, None) + self.lexeoff = self.lexstateeoff.get(state, None) + self.lexstate = state + + # ------------------------------------------------------------ + # push_state() - Changes the lexing state and saves old on stack + # ------------------------------------------------------------ + def push_state(self, state): + self.lexstatestack.append(self.lexstate) + self.begin(state) + + # ------------------------------------------------------------ + # pop_state() - Restores the previous state + # ------------------------------------------------------------ + def pop_state(self): + 
self.begin(self.lexstatestack.pop()) + + # ------------------------------------------------------------ + # current_state() - Returns the current lexing state + # ------------------------------------------------------------ + def current_state(self): + return self.lexstate + + # ------------------------------------------------------------ + # skip() - Skip ahead n characters + # ------------------------------------------------------------ + def skip(self, n): + self.lexpos += n + + # ------------------------------------------------------------ + # opttoken() - Return the next token from the Lexer + # + # Note: This function has been carefully implemented to be as fast + # as possible. Don't make changes unless you really know what + # you are doing + # ------------------------------------------------------------ + def token(self): + # Make local copies of frequently referenced attributes + lexpos = self.lexpos + lexlen = self.lexlen + lexignore = self.lexignore + lexdata = self.lexdata + + while lexpos < lexlen: + # This code provides some short-circuit code for whitespace, tabs, and other ignored characters + if lexdata[lexpos] in lexignore: + lexpos += 1 + continue + + # Look for a regular expression match + for lexre, lexindexfunc in self.lexre: + m = lexre.match(lexdata, lexpos) + if not m: + continue + + # Create a token for return + tok = LexToken() + tok.value = m.group() + tok.lineno = self.lineno + tok.lexpos = lexpos + + i = m.lastindex + func, tok.type = lexindexfunc[i] + + if not func: + # If no token type was set, it's an ignored token + if tok.type: + self.lexpos = m.end() + return tok + else: + lexpos = m.end() + break + + lexpos = m.end() + + # If token is processed by a function, call it + + tok.lexer = self # Set additional attributes useful in token rules + self.lexmatch = m + self.lexpos = lexpos + + newtok = func(tok) + + # Every function must return a token, if nothing, we just move to next token + if not newtok: + lexpos = self.lexpos # This 
is here in case user has updated lexpos. + lexignore = self.lexignore # This is here in case there was a state change + break + + # Verify type of the token. If not in the token map, raise an error + if not self.lexoptimize: + if newtok.type not in self.lextokens_all: + raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( + func.__code__.co_filename, func.__code__.co_firstlineno, + func.__name__, newtok.type), lexdata[lexpos:]) + + return newtok + else: + # No match, see if in literals + if lexdata[lexpos] in self.lexliterals: + tok = LexToken() + tok.value = lexdata[lexpos] + tok.lineno = self.lineno + tok.type = tok.value + tok.lexpos = lexpos + self.lexpos = lexpos + 1 + return tok + + # No match. Call t_error() if defined. + if self.lexerrorf: + tok = LexToken() + tok.value = self.lexdata[lexpos:] + tok.lineno = self.lineno + tok.type = 'error' + tok.lexer = self + tok.lexpos = lexpos + self.lexpos = lexpos + newtok = self.lexerrorf(tok) + if lexpos == self.lexpos: + # Error method didn't change text position at all. This is an error. + raise LexError("Scanning error. 
Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + lexpos = self.lexpos + if not newtok: + continue + return newtok + + self.lexpos = lexpos + raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:]) + + if self.lexeoff: + tok = LexToken() + tok.type = 'eof' + tok.value = '' + tok.lineno = self.lineno + tok.lexpos = lexpos + tok.lexer = self + self.lexpos = lexpos + newtok = self.lexeoff(tok) + return newtok + + self.lexpos = lexpos + 1 + if self.lexdata is None: + raise RuntimeError('No input string given with input()') + return None + + # Iterator interface + def __iter__(self): + return self + + def next(self): + t = self.token() + if t is None: + raise StopIteration + return t + + __next__ = next + +# ----------------------------------------------------------------------------- +# ==== Lex Builder === +# +# The functions and classes below are used to collect lexing information +# and build a Lexer object from it. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# _get_regex(func) +# +# Returns the regular expression assigned to a function either as a doc string +# or as a .regex attribute attached by the @TOKEN decorator. +# ----------------------------------------------------------------------------- +def _get_regex(func): + return getattr(func, 'regex', func.__doc__) + +# ----------------------------------------------------------------------------- +# get_caller_module_dict() +# +# This function returns a dictionary containing all of the symbols defined within +# a caller further down the call stack. This is used to get the environment +# associated with the yacc() call if none was provided. 
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    """Return a merged copy of the globals and locals of a calling frame.

    ``levels`` is handed straight to ``sys._getframe()``, so it counts frames
    upward from this function (0 is this function, 1 is its direct caller).
    Local names override global names of the same spelling.
    """
    frame = sys._getframe(levels)
    ldict = frame.f_globals.copy()
    # Compare by identity, not equality: `!=` on two dicts compares every
    # value pairwise, which is wasteful and raises for values whose __eq__
    # does not return a plain bool.
    if frame.f_globals is not frame.f_locals:
        ldict.update(frame.f_locals)
    return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    """Replace the function in each ``(func, toktype)`` entry by its rule name.

    ``funclist`` and ``namelist`` are walked in lockstep. Entries that are
    falsy, or whose first element is falsy, are copied through unchanged.
    """
    return [(name, entry[1]) if entry and entry[0] else entry
            for entry, name in zip(funclist, namelist)]

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    """Inverse of ``_funcs_to_names()``: look each rule name up in ``fdict``.

    Raises ``KeyError`` if a named rule is missing from ``fdict``. Entries
    that are falsy, or whose first element is falsy, are copied through.
    """
    return [(fdict[entry[0]], entry[1]) if entry and entry[0] else entry
            for entry in namelist]

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    """Combine the rule regexes in ``relist`` into master regular expressions.

    Returns three parallel lists: ``[(compiled_re, index_to_func), ...]``,
    the regex source texts, and the per-group rule names. If the combined
    pattern cannot be compiled, ``relist`` is halved and each half is
    processed recursively.

    An empty ``relist`` yields three empty lists, so callers can always
    unpack the result. A single pattern that fails to compile re-raises the
    compile error; halving cannot make progress on one pattern.
    """
    if not relist:
        return [], [], []
    regex = '|'.join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)
    except Exception:
        # Only compilation is guarded: a failure here may just mean the
        # combined pattern exceeds a limit of the re module.
        if len(relist) == 1:
            raise
        mid = len(relist) // 2
        llist, lre, lnames = _form_master_re(relist[:mid], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[mid:], reflags, ldict, toknames)
        return (llist + rlist), (lre + rre), (lnames + rnames)

    # Build the index to function map for the matching engine
    lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
    lexindexnames = lexindexfunc[:]

    for f, i in lexre.groupindex.items():
        handle = ldict.get(f, None)
        if isinstance(handle, (types.FunctionType, types.MethodType)):
            lexindexfunc[i] = (handle, toknames[f])
            lexindexnames[i] = f
        elif handle is not None:
            lexindexnames[i] = f
            # Rules named t_ignore_* match text but produce no token.
            if f.find('ignore_') > 0:
                lexindexfunc[i] = (None, None)
            else:
                lexindexfunc[i] = (None, toknames[f])

    return [(lexre, lexindexfunc)], [regex], [lexindexnames]

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.
# For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    """Split a rule name ``t_<state>_..._<TOKEN>`` into ``(states, tokenname)``.

    Leading name components that are keys of ``names`` (or the literal
    ``ANY``) are taken as states; the remainder is the token name. With no
    state prefix the rule belongs to ``INITIAL``; ``ANY`` expands to every
    state in ``names``.
    """
    parts = s.split('_')
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != 'ANY':
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = '_'.join(parts[i:])
    return (states, tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        """Read ``tokens`` from the definitions, logging an error if unusable."""
        tokens = self.ldict.get('tokens', None)
        # Test for None specifically so that the type check and the
        # "empty" check below stay reachable and report the precise problem.
        if tokens is None:
            self.log.error('No token list is defined')
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error('tokens must be a list or tuple')
            self.error = True
            return

        if not tokens:
            self.log.error('tokens is empty')
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get('literals', '')
        if not self.literals:
            self.literals = ''

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error('Invalid literal %s. Must be a single character', repr(c))
                    self.error = True

        except TypeError:
            self.log.error('Invalid literals specification. literals must be a sequence of characters')
            self.error = True

    def get_states(self):
        """Read ``states`` and record each valid entry in ``self.stateinfo``."""
        self.states = self.ldict.get('states', None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error('states must be defined as a tuple or list')
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error('State name %s must be a string', repr(name))
                        self.error = True
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings
        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym = {}         # Symbols defined as functions
        self.strsym = {}          # Symbols defined as strings
        self.ignore = {}          # Ignore strings by state
        self.errorf = {}          # Error functions by state
        self.eoff = {}            # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error('No rules of the form t_rulename are defined')
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, '__call__'):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'eof':
                    for s in states:
                        self.eoff[s] = t
                elif tokname == 'ignore':
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if '\\' in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error('%s not defined as a function or string', f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                # Bound methods carry an implicit self argument.
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True
                    continue

                if not _get_regex(f):
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), re.VERBOSE | self.reflags)
                    if c.match(''):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = True
                except re.error as e:
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in _get_regex(f):
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find('ignore_') < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (name, r), re.VERBOSE | self.reflags)
                    if (c.match('')):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------

    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except (IOError, TypeError):
            # IOError: the source text is not available.
            # TypeError: `module` is not something inspect can read source
            # for (e.g. None, which inspect.getmodule() returns when it
            # cannot determine a rule's module).
            return

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
                    self.error = True
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab',
        reflags=0, nowarn=False, outputdir=None, debuglog=None, errorlog=None):
    """Build a Lexer from the t_* rules of ``module``/``object`` or the caller.

    When neither ``module`` nor ``object`` is given, the rules are taken from
    the namespace of the code calling ``lex()``. In ``optimize`` mode a
    previously written ``lextab`` module is loaded if it can be imported;
    otherwise the tables are built and then written out. Besides returning
    the lexer, this rebinds the module-level ``lexer``, ``token`` and
    ``input`` names.

    Raises ``SyntaxError`` if validation (skipped in ``optimize`` mode)
    reports an error.
    """
    global lexer, token, input

    if lextab is None:
        lextab = 'lextab'

    ldict = None
    lexobj = Lexer()
    lexobj.lexoptimize = optimize

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    # Get the module dictionary used for the parser
    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if '__file__' not in ldict:
            ldict['__file__'] = sys.modules[ldict['__module__']].__file__
    else:
        # Must be called directly from here: 2 frames up from
        # get_caller_module_dict() is whoever called lex().
        ldict = get_caller_module_dict(2)

    # Determine if the module is part of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get('__package__')
    if pkg and isinstance(lextab, str):
        if '.' not in lextab:
            lextab = pkg + '.' + lextab

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info('lex: tokens = %r', linfo.tokens)
        debuglog.info('lex: literals = %r', linfo.literals)
        debuglog.info('lex: states = %r', linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = set(linfo.tokens)

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append('(?P<%s>%s)' % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != 'INITIAL' and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere['INITIAL']
    lexobj.lexretext = lexobj.lexstateretext['INITIAL']
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
    if not lexobj.lexerrorf:
        errorlog.warning('No t_error rule is defined')

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get('INITIAL', None)

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get('INITIAL', '')

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if '.' not in lextab:
                    srcfile = ldict['__file__']
                else:
                    parts = lextab.split('.')
                    pkgname = '.'.join(parts[:-1])
                    # Import by name instead of exec()-ing an import
                    # statement built from the caller-supplied string.
                    __import__(pkgname)
                    srcfile = getattr(sys.modules[pkgname], '__file__', '')
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
        except IOError as e:
            # Pass the values as logger arguments: the logger applies
            # `msg % args` itself, so pre-formatting would break on any '%'
            # contained in the path or error text.
            errorlog.warning("Couldn't write lextab module %r. %s", lextab, e)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None, data=None):
    """Tokenize ``data`` (or the file named in argv[1], or stdin) and print tokens.

    Uses ``lexer`` when given, otherwise the module-level ``input`` and
    ``token`` functions.
    """
    if not data:
        try:
            filename = sys.argv[1]
        except IndexError:
            sys.stdout.write('Reading from standard input (type EOF to end):\n')
            data = sys.stdin.read()
        else:
            # Context manager closes the file even if read() fails.
            with open(filename) as f:
                data = f.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an
alternative way +# ----------------------------------------------------------------------------- + +def TOKEN(r): + def set_regex(f): + if hasattr(r, '__call__'): + f.regex = _get_regex(r) + else: + f.regex = r + return f + return set_regex + +# Alternative spelling of the TOKEN decorator +Token = TOKEN + diff --git a/ply/yacc.py b/ply/yacc.py new file mode 100644 index 000000000..62ebc8855 --- /dev/null +++ b/ply/yacc.py @@ -0,0 +1,3502 @@ +# ----------------------------------------------------------------------------- +# ply: yacc.py +# +# Copyright (C) 2001-2016 +# David M. Beazley (Dabeaz LLC) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of the David Beazley or Dabeaz LLC may be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# +# This implements an LR parser that is constructed from grammar rules defined +# as Python functions. The grammar is specified by supplying the BNF inside +# Python documentation strings. The inspiration for this technique was borrowed +# from John Aycock's Spark parsing system. PLY might be viewed as a cross between +# Spark and the GNU bison utility. +# +# The current implementation is only somewhat object-oriented. The +# LR parser itself is defined in terms of an object (which allows multiple +# parsers to co-exist). However, most of the variables used during table +# construction are defined in terms of global variables. Users shouldn't +# notice unless they are trying to define multiple parsers at the same +# time using threads (in which case they should have their head examined). +# +# This implementation supports both SLR and LALR(1) parsing. LALR(1) +# support was originally implemented by Elias Ioup (ezioup@alumni.uchicago.edu), +# using the algorithm found in Aho, Sethi, and Ullman "Compilers: Principles, +# Techniques, and Tools" (The Dragon Book). LALR(1) has since been replaced +# by the more efficient DeRemer and Pennello algorithm. +# +# :::::::: WARNING ::::::: +# +# Construction of LR parsing tables is fairly complicated and expensive.
+# To make this module run fast, a *LOT* of work has been put into +# optimization---often at the expensive of readability and what might +# consider to be good Python "coding style." Modify the code at your +# own risk! +# ---------------------------------------------------------------------------- + +import re +import types +import sys +import os.path +import inspect +import base64 +import warnings + +__version__ = '3.10' +__tabversion__ = '3.10' + +#----------------------------------------------------------------------------- +# === User configurable parameters === +# +# Change these to modify the default behavior of yacc (if you wish) +#----------------------------------------------------------------------------- + +yaccdebug = True # Debugging mode. If set, yacc generates a + # a 'parser.out' file in the current directory + +debug_file = 'parser.out' # Default name of the debugging file +tab_module = 'parsetab' # Default name of the table module +default_lr = 'LALR' # Default LR table generation method + +error_count = 3 # Number of symbols that must be shifted to leave recovery mode + +yaccdevel = False # Set to True if developing yacc. This turns off optimized + # implementations of certain functions. + +resultlimit = 40 # Size limit of results when running in debug mode. + +pickle_protocol = 0 # Protocol to use when writing pickle files + +# String type-checking compatibility +if sys.version_info[0] < 3: + string_types = basestring +else: + string_types = str + +MAXINT = sys.maxsize + +# This object is a stand-in for a logging object created by the +# logging module. PLY will use this by default to create things +# such as the parser.out file. If a user wants more detailed +# information, they can create their own logging object and pass +# it into PLY. 

class PlyLogger(object):
    """Minimal logger that writes %-formatted messages to a file-like object."""

    def __init__(self, f):
        self.f = f

    def debug(self, msg, *args, **kwargs):
        self.f.write((msg % args) + '\n')

    info = debug

    def warning(self, msg, *args, **kwargs):
        self.f.write('WARNING: ' + (msg % args) + '\n')

    def error(self, msg, *args, **kwargs):
        self.f.write('ERROR: ' + (msg % args) + '\n')

    critical = debug

# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    # Every attribute lookup and every call returns the logger itself, so
    # any chain such as log.info(...) is accepted and discarded.
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self

# Exception raised for yacc-related errors
class YaccError(Exception):
    pass

# Format the result message that the parser produces when running in debug mode.
def format_result(r):
    repr_str = repr(r)
    if '\n' in repr_str:
        # Re-quote so that embedded newlines show up as escapes on one line.
        repr_str = repr(repr_str)
    if len(repr_str) > resultlimit:
        repr_str = repr_str[:resultlimit] + ' ...'
    result = '<%s @ 0x%x> (%s)' % (type(r).__name__, id(r), repr_str)
    return result

# Format stack entries when the parser is running in debug mode
def format_stack_entry(r):
    repr_str = repr(r)
    if '\n' in repr_str:
        repr_str = repr(repr_str)
    if len(repr_str) < 16:
        return repr_str
    else:
        return '<%s @ 0x%x>' % (type(r).__name__, id(r))

# Panic mode error recovery support.   This feature is being reworked--much of the
# code here is to offer a deprecation/backwards compatible transition

_errok = None
_token = None
_restart = None
_warnmsg = '''PLY: Don't use global functions errok(), token(), and restart() in p_error().
Instead, invoke the methods on the associated parser instance:

    def p_error(p):
        ...
        # Use parser.errok(), parser.token(), parser.restart()
        ...

    parser = yacc.yacc()
'''

def errok():
    warnings.warn(_warnmsg)
    return _errok()

def restart():
    warnings.warn(_warnmsg)
    return _restart()

def token():
    warnings.warn(_warnmsg)
    return _token()

# Utility function to call the p_error() function with some deprecation hacks
def call_errorfunc(errorfunc, token, parser):
    """Call ``errorfunc(token)`` with the deprecated global hooks bound to ``parser``.

    While ``errorfunc`` runs, the module-level ``_errok``, ``_token`` and
    ``_restart`` refer to the matching methods of ``parser``. They are
    removed again afterwards, also when ``errorfunc`` raises, so no reference
    to the parser is left behind. Returns whatever ``errorfunc`` returns.
    """
    global _errok, _token, _restart
    _errok = parser.errok
    _token = parser.token
    _restart = parser.restart
    try:
        return errorfunc(token)
    finally:
        try:
            del _errok, _token, _restart
        except NameError:
            # Already removed, e.g. by a nested call made from errorfunc.
            pass

#-----------------------------------------------------------------------------
#                        ===  LR Parsing Engine ===
#
# The following classes are used for the LR parser itself.  These are not
# used during table construction and are independent of the actual LR
# table generation algorithm
#-----------------------------------------------------------------------------

# This class is used to hold non-terminal grammar symbols during parsing.
# It normally has the following attributes set:
#        .type       = Grammar symbol type
#        .value      = Symbol value
#        .lineno     = Starting line number
#        .endlineno  = Ending line number (optional, set automatically)
#        .lexpos     = Starting lex position
#        .endlexpos  = Ending lex position (optional, set automatically)

class YaccSymbol:
    def __str__(self):
        return self.type

    def __repr__(self):
        return str(self)

# This class is a wrapper around the objects actually passed to each
# grammar rule.   Index lookup and assignment actually assign the
# .value attribute of the underlying YaccSymbol object.
# The lineno() method returns the line number of a given
# item (or 0 if not defined).   The linespan() method returns
# a tuple of (startline,endline) representing the range of lines
# for a symbol.  The lexspan() method returns a tuple (lexpos,endlexpos)
# representing the range of positional information for a symbol.
class YaccProduction:
    """View over the symbols of one production, handed to grammar rules.

    Indexing reads or writes the ``.value`` of the underlying symbols.
    Non-negative indices address the production's own symbols, negative
    indices reach into ``stack``, and slices return a list of values.
    """

    def __init__(self, s, stack=None):
        self.slice, self.stack = s, stack
        self.lexer = self.parser = None

    def __getitem__(self, n):
        if isinstance(n, slice):
            return [entry.value for entry in self.slice[n]]
        # Pick the container first, then do a single indexed lookup.
        source = self.slice if n >= 0 else self.stack
        return source[n].value

    def __setitem__(self, n, v):
        entry = self.slice[n]
        entry.value = v

    def __getslice__(self, i, j):
        # Python 2 slice hook; same result as slicing through __getitem__.
        values = []
        for entry in self.slice[i:j]:
            values.append(entry.value)
        return values

    def __len__(self):
        return len(self.slice)

    def lineno(self, n):
        # Line number of item n, or 0 when the symbol carries none.
        entry = self.slice[n]
        return getattr(entry, 'lineno', 0)

    def set_lineno(self, n, lineno):
        entry = self.slice[n]
        entry.lineno = lineno

    def linespan(self, n):
        # (first line, last line); the last line defaults to the first.
        entry = self.slice[n]
        first = getattr(entry, 'lineno', 0)
        return first, getattr(entry, 'endlineno', first)

    def lexpos(self, n):
        entry = self.slice[n]
        return getattr(entry, 'lexpos', 0)

    def lexspan(self, n):
        # (start position, end position); the end defaults to the start.
        entry = self.slice[n]
        begin = getattr(entry, 'lexpos', 0)
        return begin, getattr(entry, 'endlexpos', begin)

    def error(self):
        # The parse loops catch SyntaxError raised from a grammar rule and
        # switch to error recovery.
        raise SyntaxError()

# -----------------------------------------------------------------------------
#                               == LRParser ==
#
# The LR Parsing engine.
# -----------------------------------------------------------------------------

class LRParser:
    def __init__(self, lrtab, errorf):
        # Copy the table references first: set_defaulted_states() reads
        # self.action.
        self.productions = lrtab.lr_productions
        self.action = lrtab.lr_action
        self.goto = lrtab.lr_goto
        self.errorfunc = errorf
        self.set_defaulted_states()
        self.errorok = True

    def errok(self):
        # Flag that error recovery has been handled.
        self.errorok = True

    def restart(self):
        # Reset both stacks in place to the initial (state 0, '$end')
        # configuration; the parse loop holds references to these lists.
        end_marker = YaccSymbol()
        end_marker.type = '$end'
        self.statestack[:] = [0]
        self.symstack[:] = [end_marker]

    # Defaulted state support.
    # This method identifies parser states where there is only one possible reduction action.
    # For such states, the parser can choose to make a rule reduction without consuming
    # the next look-ahead token.
This delayed invocation of the tokenizer can be useful in + # certain kinds of advanced parsing situations where the lexer and parser interact with + # each other or change states (i.e., manipulation of scope, lexer states, etc.). + # + # See: http://www.gnu.org/software/bison/manual/html_node/Default-Reductions.html#Default-Reductions + def set_defaulted_states(self): + self.defaulted_states = {} + for state, actions in self.action.items(): + rules = list(actions.values()) + if len(rules) == 1 and rules[0] < 0: + self.defaulted_states[state] = rules[0] + + def disable_defaulted_states(self): + self.defaulted_states = {} + + def parse(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): + if debug or yaccdevel: + if isinstance(debug, int): + debug = PlyLogger(sys.stderr) + return self.parsedebug(input, lexer, debug, tracking, tokenfunc) + elif tracking: + return self.parseopt(input, lexer, debug, tracking, tokenfunc) + else: + return self.parseopt_notrack(input, lexer, debug, tracking, tokenfunc) + + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parsedebug(). + # + # This is the debugging enabled version of parse(). All changes made to the + # parsing engine should be made here. Optimized versions of this function + # are automatically created by the ply/ygen.py script. This script cuts out + # sections enclosed in markers such as this: + # + # #--! DEBUG + # statements + # #--! DEBUG + # + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + def parsedebug(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): + #--! parsedebug-start + lookahead = None # Current lookahead symbol + lookaheadstack = [] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) 
+ prod = self.productions # Local reference to production list (to avoid lookup on self.) + defaulted_states = self.defaulted_states # Local reference to defaulted states + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + #--! DEBUG + debug.info('PLY: PARSE DEBUG START') + #--! DEBUG + + # If no lexer was given, we will try to use the lex module + if not lexer: + from . import lex + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set the parser() token method (sometimes used in error recovery) + self.token = get_token + + # Set up the state and symbol stacks + + statestack = [] # Stack of parsing states + self.statestack = statestack + symstack = [] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while True: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + #--! DEBUG + debug.debug('') + debug.debug('State : %s', state) + #--! DEBUG + + if state not in defaulted_states: + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + else: + t = defaulted_states[state] + #--! 
DEBUG + debug.debug('Defaulted state %s: Reduce using %d', state, -t) + #--! DEBUG + + #--! DEBUG + debug.debug('Stack : %s', + ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) + #--! DEBUG + + if t is not None: + if t > 0: + # shift a symbol on the stack + statestack.append(t) + state = t + + #--! DEBUG + debug.debug('Action : Shift and goto state %s', t) + #--! DEBUG + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: + errorcount -= 1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + #--! DEBUG + if plen: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, + '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', + goto[statestack[-1-plen]][pname]) + else: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], + goto[statestack[-1]][pname]) + + #--! DEBUG + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + #--! TRACKING + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1, 'endlineno', t1.lineno) + sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) + #--! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + self.state = state + p.callable(pslice) + del statestack[-plen:] + #--! DEBUG + debug.info('Result : %s', format_result(pslice[0])) + #--! 
DEBUG + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + symstack.extend(targ[1:-1]) # Put the production slice back on the stack + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + #--! TRACKING + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos + #--! TRACKING + + targ = [sym] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + self.state = state + p.callable(pslice) + #--! DEBUG + debug.info('Result : %s', format_result(pslice[0])) + #--! DEBUG + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + result = getattr(n, 'value', None) + #--! DEBUG + debug.info('Done : Returning %s', format_result(result)) + debug.info('PLY: PARSE DEBUG END') + #--! DEBUG + return result + + if t is None: + + #--! DEBUG + debug.error('Error : %s', + ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) + #--! 
DEBUG + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = False + errtoken = lookahead + if errtoken.type == '$end': + errtoken = None # End of file! + if self.errorfunc: + if errtoken and not hasattr(errtoken, 'lexer'): + errtoken.lexer = lexer + self.state = state + tok = call_errorfunc(self.errorfunc, errtoken, self) + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken, 'lineno'): + lineno = lookahead.lineno + else: + lineno = 0 + if lineno: + sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) + else: + sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) + else: + sys.stderr.write('yacc: Parse error in input. EOF\n') + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != '$end': + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == '$end': + # Whoa. We're really hosed here. 
Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + #--! TRACKING + if tracking: + sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) + sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) + #--! TRACKING + lookahead = None + continue + + # Create the error symbol for the first time and make it the new lookahead symbol + t = YaccSymbol() + t.type = 'error' + + if hasattr(lookahead, 'lineno'): + t.lineno = t.endlineno = lookahead.lineno + if hasattr(lookahead, 'lexpos'): + t.lexpos = t.endlexpos = lookahead.lexpos + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + sym = symstack.pop() + #--! TRACKING + if tracking: + lookahead.lineno = sym.lineno + lookahead.lexpos = sym.lexpos + #--! TRACKING + statestack.pop() + state = statestack[-1] + + continue + + # Call an error function here + raise RuntimeError('yacc: internal parser error!!!\n') + + #--! parsedebug-end + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt(). + # + # Optimized version of parse() method. DO NOT EDIT THIS CODE DIRECTLY! + # This code is automatically generated by the ply/ygen.py script. Make + # changes to the parsedebug() method instead. + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + def parseopt(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): + #--! parseopt-start + lookahead = None # Current lookahead symbol + lookaheadstack = [] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) 
+ defaulted_states = self.defaulted_states # Local reference to defaulted states + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + + # If no lexer was given, we will try to use the lex module + if not lexer: + from . import lex + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set the parser() token method (sometimes used in error recovery) + self.token = get_token + + # Set up the state and symbol stacks + + statestack = [] # Stack of parsing states + self.statestack = statestack + symstack = [] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while True: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. 
Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + + if state not in defaulted_states: + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + else: + t = defaulted_states[state] + + + if t is not None: + if t > 0: + # shift a symbol on the stack + statestack.append(t) + state = t + + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: + errorcount -= 1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + #--! TRACKING + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1, 'endlineno', t1.lineno) + sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) + #--! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + self.state = state + p.callable(pslice) + del statestack[-plen:] + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. 
Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + symstack.extend(targ[1:-1]) # Put the production slice back on the stack + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + #--! TRACKING + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos + #--! TRACKING + + targ = [sym] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + self.state = state + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + result = getattr(n, 'value', None) + return result + + if t is None: + + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. 
+ if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = False + errtoken = lookahead + if errtoken.type == '$end': + errtoken = None # End of file! + if self.errorfunc: + if errtoken and not hasattr(errtoken, 'lexer'): + errtoken.lexer = lexer + self.state = state + tok = call_errorfunc(self.errorfunc, errtoken, self) + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken, 'lineno'): + lineno = lookahead.lineno + else: + lineno = 0 + if lineno: + sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) + else: + sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) + else: + sys.stderr.write('yacc: Parse error in input. EOF\n') + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != '$end': + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == '$end': + # Whoa. We're really hosed here. Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + #--! TRACKING + if tracking: + sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) + sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) + #--! 
TRACKING + lookahead = None + continue + + # Create the error symbol for the first time and make it the new lookahead symbol + t = YaccSymbol() + t.type = 'error' + + if hasattr(lookahead, 'lineno'): + t.lineno = t.endlineno = lookahead.lineno + if hasattr(lookahead, 'lexpos'): + t.lexpos = t.endlexpos = lookahead.lexpos + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + sym = symstack.pop() + #--! TRACKING + if tracking: + lookahead.lineno = sym.lineno + lookahead.lexpos = sym.lexpos + #--! TRACKING + statestack.pop() + state = statestack[-1] + + continue + + # Call an error function here + raise RuntimeError('yacc: internal parser error!!!\n') + + #--! parseopt-end + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt_notrack(). + # + # Optimized version of parseopt() with line number tracking removed. + # DO NOT EDIT THIS CODE DIRECTLY. This code is automatically generated + # by the ply/ygen.py script. Make changes to the parsedebug() method instead. + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): + #--! parseopt-notrack-start + lookahead = None # Current lookahead symbol + lookaheadstack = [] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) + defaulted_states = self.defaulted_states # Local reference to defaulted states + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + + # If no lexer was given, we will try to use the lex module + if not lexer: + from . 
import lex + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set the parser() token method (sometimes used in error recovery) + self.token = get_token + + # Set up the state and symbol stacks + + statestack = [] # Stack of parsing states + self.statestack = statestack + symstack = [] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while True: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + + if state not in defaulted_states: + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + else: + t = defaulted_states[state] + + + if t is not None: + if t > 0: + # shift a symbol on the stack + statestack.append(t) + state = t + + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: + errorcount -= 1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + self.state = state + p.callable(pslice) + del statestack[-plen:] + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + symstack.extend(targ[1:-1]) # Put the production slice back on the stack + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + + targ = [sym] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + self.state = state + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) # Save the current lookahead token + statestack.pop() # Pop back one state (before the reduce) + state = statestack[-1] + sym.type = 'error' + sym.value = 'error' + lookahead = sym + errorcount = error_count + self.errorok = False + + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + result = getattr(n, 'value', None) + return result + + if t is None: + + + # We have some kind of parsing error here. 
To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = False + errtoken = lookahead + if errtoken.type == '$end': + errtoken = None # End of file! + if self.errorfunc: + if errtoken and not hasattr(errtoken, 'lexer'): + errtoken.lexer = lexer + self.state = state + tok = call_errorfunc(self.errorfunc, errtoken, self) + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken, 'lineno'): + lineno = lookahead.lineno + else: + lineno = 0 + if lineno: + sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) + else: + sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) + else: + sys.stderr.write('yacc: Parse error in input. EOF\n') + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != '$end': + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == '$end': + # Whoa. We're really hosed here. 
Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + lookahead = None + continue + + # Create the error symbol for the first time and make it the new lookahead symbol + t = YaccSymbol() + t.type = 'error' + + if hasattr(lookahead, 'lineno'): + t.lineno = t.endlineno = lookahead.lineno + if hasattr(lookahead, 'lexpos'): + t.lexpos = t.endlexpos = lookahead.lexpos + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + sym = symstack.pop() + statestack.pop() + state = statestack[-1] + + continue + + # Call an error function here + raise RuntimeError('yacc: internal parser error!!!\n') + + #--! parseopt-notrack-end + +# ----------------------------------------------------------------------------- +# === Grammar Representation === +# +# The following functions, classes, and variables are used to represent and +# manipulate the rules that make up a grammar. +# ----------------------------------------------------------------------------- + +# regex matching identifiers +_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') + +# ----------------------------------------------------------------------------- +# class Production: +# +# This class stores the raw information about a single production or grammar rule. +# A grammar rule refers to a specification such as this: +# +# expr : expr PLUS term +# +# Here are the basic attributes defined on all productions +# +# name - Name of the production. For example 'expr' +# prod - A list of symbols on the right side ['expr','PLUS','term'] +# prec - Production precedence level +# number - Production number. +# func - Function that executes on reduce +# file - File where production function is defined +# lineno - Line number where production function is defined +# +# The following attributes are defined or optional. 
#
#       len       - Length of the production (number of symbols on right hand side)
#       usyms     - Set of unique symbols found in the production
# -----------------------------------------------------------------------------

class Production(object):
    """Raw information about one grammar rule, e.g. ``expr : expr PLUS term``.

    Attributes set by ``__init__``:
        name     - left-hand-side name of the rule
        prod     - tuple of right-hand-side symbols
        number   - production number
        func     - name of the function executed on reduce (or None)
        callable - the bound function; None until bind() is called
        file     - file in which the rule function was defined
        line     - line number at which the rule function was defined
        prec     - precedence tuple, ('right', 0) by default
        len      - number of right-hand-side symbols
        usyms    - right-hand-side symbols with duplicates removed, in
                   first-seen order
        lr_items - list of LR items (filled in elsewhere)
        lr_next  - first LR item of the production (filled in elsewhere)
        str      - printable form 'name -> sym sym ...'
    """
    reduced = 0

    def __init__(self, number, name, prod, precedence=('right', 0), func=None, file='', line=0):
        self.name = name
        self.prod = tuple(prod)
        self.number = number
        self.func = func
        self.callable = None
        self.file = file
        self.line = line
        self.prec = precedence

        # Internal settings used during table construction

        self.len = len(self.prod)   # Length of the production

        # Create a list of unique production symbols used in the production
        self.usyms = []
        for s in self.prod:
            if s not in self.usyms:
                self.usyms.append(s)

        # List of all LR items for the production
        self.lr_items = []
        self.lr_next = None

        # Create a string representation
        if self.prod:
            self.str = '%s -> %s' % (self.name, ' '.join(self.prod))
        else:
            self.str = '%s -> ' % self.name

    def __str__(self):
        return self.str

    def __repr__(self):
        return 'Production(' + str(self) + ')'

    def __len__(self):
        return len(self.prod)

    def __nonzero__(self):
        # Python 2 truth hook: a production is always true.
        return 1

    def __bool__(self):
        # Python 3 truth hook.  Without it truthiness falls back to __len__
        # and a production with an empty right-hand side would be falsy,
        # unlike under Python 2.
        return True

    def __getitem__(self, index):
        return self.prod[index]

    # Return the nth lr_item from the production (or None if at the end)
    def lr_item(self, n):
        if n > len(self.prod):
            return None
        p = LRItem(self, n)
        # Precompute the list of productions immediately following.
        # NOTE(review): 'Prodnames' is read as a global here, but no such
        # module-level name is visible in this part of the file; if it is
        # undefined this raises NameError, which the handler below does not
        # catch.  Grammar.build_lritems() performs the same computation with
        # self.Prodnames -- confirm whether this method is still used.
        try:
            p.lr_after = Prodnames[p.prod[n+1]]
        except (IndexError, KeyError):
            p.lr_after = []
        try:
            p.lr_before = p.prod[n-1]
        except IndexError:
            p.lr_before = None
        return p

    # Bind the production function name to a callable
    def bind(self, pdict):
        """Look up self.func in *pdict* and store the result in self.callable.

        Does nothing when the production has no function name.  Raises
        KeyError if the name is missing from *pdict*.
        """
        if self.func:
            self.callable = pdict[self.func]

# This class serves as a minimal standin for Production objects when
# reading table data from files.
# It only contains information
# actually used by the LR parsing engine, plus some additional
# debugging information.
class MiniProduction(object):
    """Lightweight record describing a production loaded from a table file.

    Stores the printable form (str), the rule name, its length, the name of
    the reduce function together with the file and line where it was
    defined, and the bound callable (None until bind() is called).
    """

    def __init__(self, str, name, len, func, file, line):
        self.str = str
        self.name, self.len = name, len
        self.func, self.callable = func, None
        self.file, self.line = file, line

    def __repr__(self):
        return 'MiniProduction(%s)' % self.str

    def __str__(self):
        return self.str

    # Bind the production function name to a callable
    def bind(self, pdict):
        """Resolve self.func through *pdict*; a production without a function is left alone."""
        if not self.func:
            return
        self.callable = pdict[self.func]


# -----------------------------------------------------------------------------
# class LRItem
#
# This class represents a specific stage of parsing a production rule.  For
# example:
#
#       expr : expr . PLUS term
#
# In the above, the "." represents the current location of the parse.  Here
# basic attributes:
#
#       name       - Name of the production.  For example 'expr'
#       prod       - A list of symbols on the right side ['expr','.', 'PLUS','term']
#       number     - Production number.
#
#       lr_next      Next LR item. Example, if we are ' expr -> expr . PLUS term'
#                    then lr_next refers to 'expr -> expr PLUS . term'
#       lr_index   - LR item index (location of the ".") in the prod list.
#       lookaheads - LALR lookahead symbols for this item
#       len        - Length of the production (number of symbols on right hand side)
#       lr_after    - List of all productions that immediately follow
#       lr_before   - Grammar symbol immediately before
# -----------------------------------------------------------------------------

class LRItem(object):
    """A production *p* with the "." marker inserted at position *n*.

    Note that ``prod`` (and therefore ``len``) includes the "." marker, so
    ``len`` is one larger than the length of the underlying production.
    ``usyms`` is shared with the production (same list object).
    """

    def __init__(self, p, n):
        self.name = p.name
        self.prod = list(p.prod)
        self.number = p.number
        self.lr_index = n
        self.lookaheads = {}
        self.prod.insert(n, '.')
        self.prod = tuple(self.prod)
        self.len = len(self.prod)
        self.usyms = p.usyms

    def __str__(self):
        if self.prod:
            s = '%s -> %s' % (self.name, ' '.join(self.prod))
        else:
            s = '%s -> ' % self.name
        return s

    def __repr__(self):
        return 'LRItem(' + str(self) + ')'

# -----------------------------------------------------------------------------
# rightmost_terminal()
#
# Return the rightmost terminal from a list of symbols.  Used in add_production()
# -----------------------------------------------------------------------------
def rightmost_terminal(symbols, terminals):
    """Return the last element of *symbols* found in *terminals*, or None."""
    for sym in reversed(symbols):
        if sym in terminals:
            return sym
    return None

# -----------------------------------------------------------------------------
#                           === GRAMMAR CLASS ===
#
# The following class represents the contents of the specified grammar along
# with various computed properties such as first sets, follow sets, LR items, etc.
# This data is used for critical parts of the table generation process later.
# -----------------------------------------------------------------------------

class GrammarError(YaccError):
    pass

class Grammar(object):
    """Container for the productions of a grammar and properties derived from them."""

    def __init__(self, terminals):
        """Create an empty grammar whose terminal symbols are *terminals* plus 'error'."""
        self.Productions = [None]   # A list of all of the productions.  The first
                                    # entry is always reserved for the purpose of
                                    # building an augmented grammar

        self.Prodnames = {}         # A dictionary mapping the names of nonterminals to a list of all
                                    # productions of that nonterminal.

        self.Prodmap = {}           # A dictionary that is only used to detect duplicate
                                    # productions.

        self.Terminals = {}         # A dictionary mapping the names of terminal symbols to a
                                    # list of the rules where they are used.

        for term in terminals:
            self.Terminals[term] = []

        self.Terminals['error'] = []

        self.Nonterminals = {}      # A dictionary mapping names of nonterminals to a list
                                    # of rule numbers where they are used.

        self.First = {}             # A dictionary of precomputed FIRST(x) symbols

        self.Follow = {}            # A dictionary of precomputed FOLLOW(x) symbols

        self.Precedence = {}        # Precedence rules for each terminal. Contains tuples of the
                                    # form ('right',level) or ('nonassoc', level) or ('left',level)

        self.UsedPrecedence = set() # Precedence rules that were actually used by the grammar.
                                    # This is only used to provide error checking and to generate
                                    # a warning about unused precedence rules.

        self.Start = None           # Starting symbol for the grammar


    def __len__(self):
        return len(self.Productions)

    def __getitem__(self, index):
        return self.Productions[index]

    # -----------------------------------------------------------------------------
    # set_precedence()
    #
    # Sets the precedence for a given terminal. assoc is the associativity such as
    # 'left','right', or 'nonassoc'.  level is a numeric level.
    #
    # Raises GrammarError if the terminal already has a precedence or if assoc
    # is not one of the three accepted values.
    # -----------------------------------------------------------------------------

    def set_precedence(self, term, assoc, level):
        assert self.Productions == [None], 'Must call set_precedence() before add_production()'
        if term in self.Precedence:
            raise GrammarError('Precedence already specified for terminal %r' % term)
        if assoc not in ['left', 'right', 'nonassoc']:
            raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'")
        self.Precedence[term] = (assoc, level)

    # -----------------------------------------------------------------------------
    # add_production()
    #
    # Given an action function, this function assembles a production rule and
    # computes its precedence level.
    #
    # The production rule is supplied as a list of symbols.   For example,
    # a rule such as 'expr : expr PLUS term' has a production name of 'expr' and
    # symbols ['expr','PLUS','term'].
    #
    # Precedence is determined by the precedence of the right-most terminal
    # or the precedence of a terminal specified by %prec.
    #
    # A variety of error checks are performed to make sure production symbols
    # are valid and that %prec is used correctly.  Every failed check raises
    # GrammarError.  Note that syms is modified in place: quoted literals are
    # replaced by their value and a trailing '%prec NAME' pair is removed.
    # -----------------------------------------------------------------------------

    def add_production(self, prodname, syms, func=None, file='', line=0):

        if prodname in self.Terminals:
            raise GrammarError('%s:%d: Illegal rule name %r. Already defined as a token' % (file, line, prodname))
        if prodname == 'error':
            raise GrammarError('%s:%d: Illegal rule name %r. error is a reserved word' % (file, line, prodname))
        if not _is_identifier.match(prodname):
            raise GrammarError('%s:%d: Illegal rule name %r' % (file, line, prodname))

        # Look for literal tokens
        for n, s in enumerate(syms):
            if s[0] in "'\"":
                try:
                    # NOTE(review): eval() is applied to text taken from the
                    # grammar specification.  That is only acceptable while
                    # grammar rules come from trusted source code.
                    c = eval(s)
                    if (len(c) > 1):
                        raise GrammarError('%s:%d: Literal token %s in rule %r may only be a single character' %
                                           (file, line, s, prodname))
                    if c not in self.Terminals:
                        self.Terminals[c] = []
                    syms[n] = c
                    continue
                except SyntaxError:
                    # Not a valid literal: fall through to the identifier
                    # check below, which rejects it.
                    pass
            if not _is_identifier.match(s) and s != '%prec':
                raise GrammarError('%s:%d: Illegal name %r in rule %r' % (file, line, s, prodname))

        # Determine the precedence level
        if '%prec' in syms:
            if syms[-1] == '%prec':
                raise GrammarError('%s:%d: Syntax error. Nothing follows %%prec' % (file, line))
            if syms[-2] != '%prec':
                raise GrammarError('%s:%d: Syntax error. %%prec can only appear at the end of a grammar rule' %
                                   (file, line))
            precname = syms[-1]
            prodprec = self.Precedence.get(precname)
            if not prodprec:
                raise GrammarError('%s:%d: Nothing known about the precedence of %r' % (file, line, precname))
            else:
                self.UsedPrecedence.add(precname)
            del syms[-2:]     # Drop %prec from the rule
        else:
            # If no %prec, precedence is determined by the rightmost terminal symbol
            precname = rightmost_terminal(syms, self.Terminals)
            prodprec = self.Precedence.get(precname, ('right', 0))

        # See if the rule is already in the rulemap
        rulekey = '%s -> %s' % (prodname, syms)
        if rulekey in self.Prodmap:
            m = self.Prodmap[rulekey]
            raise GrammarError('%s:%d: Duplicate rule %s. ' % (file, line, m) +
                               'Previous definition at %s:%d' % (m.file, m.line))

        # From this point on, everything is valid.  Create a new Production instance
        pnumber = len(self.Productions)
        if prodname not in self.Nonterminals:
            self.Nonterminals[prodname] = []

        # Add the production number to Terminals and Nonterminals
        for t in syms:
            if t in self.Terminals:
                self.Terminals[t].append(pnumber)
            else:
                if t not in self.Nonterminals:
                    self.Nonterminals[t] = []
                self.Nonterminals[t].append(pnumber)

        # Create a production and add it to the list of productions
        p = Production(pnumber, prodname, syms, prodprec, func, file, line)
        self.Productions.append(p)
        self.Prodmap[rulekey] = p

        # Add to the global productions list
        try:
            self.Prodnames[prodname].append(p)
        except KeyError:
            self.Prodnames[prodname] = [p]

    # -----------------------------------------------------------------------------
    # set_start()
    #
    # Sets the starting symbol and creates the augmented grammar.  Production
    # rule 0 is S' -> start where start is the start symbol.  When no start
    # symbol is given, the name of production 1 is used.  Raises GrammarError
    # if the start symbol is not a known nonterminal.
    # -----------------------------------------------------------------------------

    def set_start(self, start=None):
        if not start:
            start = self.Productions[1].name
        if start not in self.Nonterminals:
            raise GrammarError('start symbol %s undefined' % start)
        self.Productions[0] = Production(0, "S'", [start])
        self.Nonterminals[start].append(0)
        self.Start = start

    # -----------------------------------------------------------------------------
    # find_unreachable()
    #
    # Find all of the nonterminal symbols that can't be reached from the starting
    # symbol.  Returns a list of nonterminals that can't be reached.
    # -----------------------------------------------------------------------------

    def find_unreachable(self):
        # Walk the grammar from the start symbol of production 0 using an
        # explicit work list rather than recursion, so that a long chain of
        # rules cannot exhaust the interpreter's recursion limit.
        reachable = set()
        pending = [self.Productions[0].prod[0]]
        while pending:
            s = pending.pop()
            if s in reachable:
                continue
            reachable.add(s)
            for p in self.Prodnames.get(s, []):
                pending.extend(p.prod)

        return [s for s in self.Nonterminals if s not in reachable]

    # -----------------------------------------------------------------------------
    # infinite_cycles()
    #
    # This function looks at the various parsing rules and tries to detect
    # infinite recursion cycles (grammar rules where there is no possible way
    # to derive a string of only terminals).  Returns the list of such symbols.
    # -----------------------------------------------------------------------------

    def infinite_cycles(self):
        terminates = {}

        # Terminals:
        for t in self.Terminals:
            terminates[t] = True

        terminates['$end'] = True

        # Nonterminals:

        # Initialize to false:
        for n in self.Nonterminals:
            terminates[n] = False

        # Then propagate termination until no change:
        while True:
            some_change = False
            for (n, pl) in self.Prodnames.items():
                # Nonterminal n terminates iff any of its productions terminates.
                for p in pl:
                    # Production p terminates iff all of its rhs symbols terminate.
                    for s in p.prod:
                        if not terminates[s]:
                            # The symbol s does not terminate,
                            # so production p does not terminate.
                            p_terminates = False
                            break
                    else:
                        # didn't break from the loop,
                        # so every symbol s terminates
                        # so production p terminates.
                        p_terminates = True

                    if p_terminates:
                        # symbol n terminates!
                        if not terminates[n]:
                            terminates[n] = True
                            some_change = True
                        # Don't need to consider any more productions for this n.
                        break

            if not some_change:
                break

        infinite = []
        for (s, term) in terminates.items():
            if not term:
                if s not in self.Prodnames and s not in self.Terminals and s != 'error':
                    # s is used-but-not-defined, and we've already warned of that,
                    # so it would be overkill to say that it's also non-terminating.
                    pass
                else:
                    infinite.append(s)

        return infinite

    # -----------------------------------------------------------------------------
    # undefined_symbols()
    #
    # Find all symbols that were used in the grammar, but not defined as tokens or
    # grammar rules.  Returns a list of tuples (sym, prod) where sym is the symbol
    # and prod is the production where the symbol was used.
    # -----------------------------------------------------------------------------
    def undefined_symbols(self):
        result = []
        for p in self.Productions:
            if not p:
                continue

            for s in p.prod:
                if s not in self.Prodnames and s not in self.Terminals and s != 'error':
                    result.append((s, p))
        return result

    # -----------------------------------------------------------------------------
    # unused_terminals()
    #
    # Find all terminals that were defined, but not used by the grammar.  Returns
    # a list of all symbols.
    # -----------------------------------------------------------------------------
    def unused_terminals(self):
        unused_tok = []
        for s, v in self.Terminals.items():
            if s != 'error' and not v:
                unused_tok.append(s)

        return unused_tok

    # ------------------------------------------------------------------------------
    # unused_rules()
    #
    # Find all grammar rules that were defined,  but not used (maybe not reachable)
    # Returns a list of productions (the first production of each unused name).
    # ------------------------------------------------------------------------------

    def unused_rules(self):
        unused_prod = []
        for s, v in self.Nonterminals.items():
            if not v:
                p = self.Prodnames[s][0]
                unused_prod.append(p)
        return unused_prod

    # -----------------------------------------------------------------------------
    # unused_precedence()
    #
    # Returns a list of tuples (term,precedence) corresponding to precedence
    # rules that were never used by the grammar.  term is the name of the terminal
    # on which precedence was applied and precedence is a string such as 'left' or
    # 'right' corresponding to the type of precedence.
    # -----------------------------------------------------------------------------

    def unused_precedence(self):
        unused = []
        for termname in self.Precedence:
            if not (termname in self.Terminals or termname in self.UsedPrecedence):
                unused.append((termname, self.Precedence[termname][0]))

        return unused

    # -------------------------------------------------------------------------
    # _first()
    #
    # Compute the value of FIRST1(beta) where beta is a tuple of symbols.
    # The empty string '' is the marker meaning "beta can derive nothing".
    #
    # During execution of compute_first1, the result may be incomplete.
    # Afterward (e.g., when called from compute_follow()), it will be complete.
    # -------------------------------------------------------------------------
    def _first(self, beta):

        # We are computing First(x1,x2,x3,...,xn)
        result = []
        for x in beta:
            x_produces_empty = False

            # Add all the symbols of First[x] other than the empty marker to the result.
            for f in self.First[x]:
                if f == '':
                    x_produces_empty = True
                else:
                    if f not in result:
                        result.append(f)

            if x_produces_empty:
                # We have to consider the next x in beta,
                # i.e. stay in the loop.
                pass
            else:
                # We don't have to consider any further symbols in beta.
                break
        else:
            # There was no 'break' from the loop,
            # so x_produces_empty was true for all x in beta,
            # so beta produces empty as well.
            result.append('')

        return result

    # -------------------------------------------------------------------------
    # compute_first()
    #
    # Compute the value of FIRST1(X) for all symbols.  The result is cached in
    # self.First and returned.
    # -------------------------------------------------------------------------
    def compute_first(self):
        if self.First:
            return self.First

        # Terminals:
        for t in self.Terminals:
            self.First[t] = [t]

        self.First['$end'] = ['$end']

        # Nonterminals:

        # Initialize to the empty set:
        for n in self.Nonterminals:
            self.First[n] = []

        # Then propagate symbols until no change:
        while True:
            some_change = False
            for n in self.Nonterminals:
                for p in self.Prodnames[n]:
                    for f in self._first(p.prod):
                        if f not in self.First[n]:
                            self.First[n].append(f)
                            some_change = True
            if not some_change:
                break

        return self.First

    # ---------------------------------------------------------------------
    # compute_follow()
    #
    # Computes all of the follow sets for every non-terminal symbol.  The
    # follow set is the set of all symbols that might follow a given
    # non-terminal.  See the Dragon book, 2nd Ed. p. 189.
    # ---------------------------------------------------------------------
    def compute_follow(self, start=None):
        # If already computed, return the result
        if self.Follow:
            return self.Follow

        # If first sets not computed yet, do that first.
        if not self.First:
            self.compute_first()

        # Add '$end' to the follow list of the start symbol
        for k in self.Nonterminals:
            self.Follow[k] = []

        if not start:
            start = self.Productions[1].name

        self.Follow[start] = ['$end']

        while True:
            didadd = False
            for p in self.Productions[1:]:
                # Here is the production set
                for i, B in enumerate(p.prod):
                    if B in self.Nonterminals:
                        # Okay. We got a non-terminal in a production
                        fst = self._first(p.prod[i+1:])
                        hasempty = False
                        for f in fst:
                            if f != '' and f not in self.Follow[B]:
                                self.Follow[B].append(f)
                                didadd = True
                            if f == '':
                                hasempty = True
                        if hasempty or i == (len(p.prod)-1):
                            # Add elements of follow(a) to follow(b)
                            for f in self.Follow[p.name]:
                                if f not in self.Follow[B]:
                                    self.Follow[B].append(f)
                                    didadd = True
            if not didadd:
                break
        return self.Follow


    # -----------------------------------------------------------------------------
    # build_lritems()
    #
    # This function walks the list of productions and builds a complete set of the
    # LR items.  The LR items are stored in two ways:  First, they are uniquely
    # numbered and placed in the list _lritems.  Second, a linked list of LR items
    # is built for each production.  For example:
    #
    #   E -> E PLUS E
    #
    # Creates the list
    #
    #  [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
    # -----------------------------------------------------------------------------

    def build_lritems(self):
        for p in self.Productions:
            lastlri = p
            i = 0
            lr_items = []
            while True:
                if i > len(p):
                    lri = None
                else:
                    lri = LRItem(p, i)
                    # Precompute the list of productions immediately following
                    try:
                        lri.lr_after = self.Prodnames[lri.prod[i+1]]
                    except (IndexError, KeyError):
                        lri.lr_after = []
                    try:
                        lri.lr_before = lri.prod[i-1]
                    except IndexError:
                        lri.lr_before = None

                # Chain the items: the production points at its first item and
                # every item points at its successor (None after the last one).
                lastlri.lr_next = lri
                if not lri:
                    break
                lr_items.append(lri)
                lastlri = lri
                i += 1
            p.lr_items = lr_items

# -----------------------------------------------------------------------------
#                            == Class LRTable ==
#
# This basic class represents a basic table of LR parsing information.
# Methods for generating the tables are not defined here.  They are defined
# in the derived class LRGeneratedTable.
# -----------------------------------------------------------------------------

class VersionError(YaccError):
    pass

class LRTable(object):
    """Holder for LR parsing tables loaded from a module or a pickle file."""

    def __init__(self):
        self.lr_action = None
        self.lr_goto = None
        self.lr_productions = None
        self.lr_method = None

    def read_table(self, module):
        """Load the tables from *module* and return the stored signature.

        *module* is either a module object or the dotted name of a module to
        import.  Raises VersionError if the table version recorded in the
        module differs from __tabversion__.
        """
        if isinstance(module, types.ModuleType):
            parsetab = module
        else:
            # Import by name without building source text for exec().
            import importlib
            parsetab = importlib.import_module(module)

        if parsetab._tabversion != __tabversion__:
            raise VersionError('yacc table file version is out of date')

        self.lr_action = parsetab._lr_action
        self.lr_goto = parsetab._lr_goto

        self.lr_productions = []
        for p in parsetab._lr_productions:
            self.lr_productions.append(MiniProduction(*p))

        self.lr_method = parsetab._lr_method
        return parsetab._lr_signature

    def read_pickle(self, filename):
        """Load the tables from the pickle file *filename* and return the signature.

        Raises ImportError if the file does not exist and VersionError if the
        stored table version differs from __tabversion__.
        """
        try:
            import cPickle as pickle
        except ImportError:
            import pickle

        if not os.path.exists(filename):
            raise ImportError

        # NOTE(review): unpickling runs code chosen by whoever wrote the file;
        # only load table files from a trusted location.
        # The 'with' block closes the file even when a version mismatch or an
        # unpickling error interrupts the sequence of loads.
        with open(filename, 'rb') as in_f:
            tabversion = pickle.load(in_f)
            if tabversion != __tabversion__:
                raise VersionError('yacc table file version is out of date')
            self.lr_method = pickle.load(in_f)
            signature = pickle.load(in_f)
            self.lr_action = pickle.load(in_f)
            self.lr_goto = pickle.load(in_f)
            productions = pickle.load(in_f)

        self.lr_productions = []
        for p in productions:
            self.lr_productions.append(MiniProduction(*p))

        return signature

    # Bind all production function names to callable objects in pdict
    def bind_callables(self, pdict):
        for p in self.lr_productions:
            p.bind(pdict)


# -----------------------------------------------------------------------------
#                           === LR Generator ===
#
# The following classes and functions are used to generate LR parsing tables on
# a grammar.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# digraph()
# traverse()
#
# The following two functions are used to compute set valued functions
# of the form:
#
#     F(x) = F'(x) U U{F(y) | x R y}
#
# This is used to compute the values of Read() sets as well as FOLLOW sets
# in LALR(1) generation.
#
# Inputs:  X    - An input set
#          R    - A relation
#          FP   - Set-valued function
# ------------------------------------------------------------------------------

def digraph(X, R, FP):
    """Return a dict F mapping each x in X to its computed result list.

    N records a mark per element (0 means "not visited yet"); every element
    still marked 0 is handed to traverse(), which fills in F.
    """
    N = {}
    for x in X:
        N[x] = 0
    stack = []
    F = {}
    for x in X:
        if N[x] == 0:
            traverse(x, N, stack, F, X, R, FP)
    return F

def traverse(x, N, stack, F, X, R, FP):
    """Visit x, merging the results of every y related to x into F[x].

    N, stack and F are shared with digraph() and updated in place.  FP(x)
    must return a list, since it is extended with append().
    """
    stack.append(x)
    d = len(stack)             # depth at which x was pushed
    N[x] = d
    F[x] = FP(x)             # F(X) <- F'(x)

    rel = R(x)               # Get y's related to x
    for y in rel:
        if N[y] == 0:
            traverse(y, N, stack, F, X, R, FP)
        N[x] = min(N[x], N[y])
        for a in F.get(y, []):
            if a not in F[x]:
                F[x].append(a)
    if N[x] == d:
        # x's mark was not lowered by anything it reaches: pop the stack down
        # to and including x.  Every popped element is marked MAXINT and is
        # given the very same list object as F[x] (presumably these are the
        # members of x's strongly connected component -- TODO confirm).
        # MAXINT is a module-level constant defined outside this chunk.
        N[stack[-1]] = MAXINT
        F[stack[-1]] = F[x]
        element = stack.pop()
        while element != x:
            N[stack[-1]] = MAXINT
            F[stack[-1]] = F[x]
            element = stack.pop()

class LALRError(YaccError):
    pass

# -----------------------------------------------------------------------------
#                             == LRGeneratedTable ==
#
# This class implements the LR table generation algorithm.
# There are no
# public methods except for write()
# -----------------------------------------------------------------------------

class LRGeneratedTable(LRTable):
    def __init__(self, grammar, method='LALR', log=None):
        """Build the parsing tables for *grammar*.

        method must be 'SLR' or 'LALR', otherwise LALRError is raised.  log is
        the logger used during generation; a NullLogger is substituted when it
        is omitted.  Table construction runs immediately, as the last step of
        the constructor.
        """
        if method not in ['SLR', 'LALR']:
            raise LALRError('Unsupported method %s' % method)

        self.grammar = grammar
        self.lr_method = method

        # Set up the logger
        if not log:
            log = NullLogger()
        self.log = log

        # Internal attributes
        self.lr_action = {}        # Action table
        self.lr_goto = {}        # Goto table
        self.lr_productions = grammar.Productions    # Copy of grammar Production array
        self.lr_goto_cache = {}        # Cache of computed gotos
        self.lr0_cidhash = {}        # Cache of closures

        self._add_count = 0         # Internal counter used to detect cycles

        # Diagnostic information filled in by the table generator
        self.sr_conflict = 0
        self.rr_conflict = 0
        self.conflicts = []        # List of conflicts

        self.sr_conflicts = []
        self.rr_conflicts = []

        # Build the tables
        self.grammar.build_lritems()
        self.grammar.compute_first()
        self.grammar.compute_follow()
        self.lr_parse_table()

    # Compute the LR(0) closure operation on I, where I is a set of LR(0) items.

    def lr0_closure(self, I):
        # Each call gets a fresh stamp; a production whose lr0_added attribute
        # equals the stamp has already been added during this call.
        self._add_count += 1

        # Add everything in I to J
        J = I[:]
        didadd = True
        while didadd:
            didadd = False
            # J grows while it is being iterated, so items appended below are
            # visited later in this same pass.
            for j in J:
                for x in j.lr_after:
                    if getattr(x, 'lr0_added', 0) == self._add_count:
                        continue
                    # Add B --> .G to J
                    J.append(x.lr_next)
                    x.lr0_added = self._add_count
                    didadd = True

        return J

    # Compute the LR(0) goto function goto(I,X) where I is a set
    # of LR(0) items and X is a grammar symbol. This function is written
    # in a way that guarantees uniqueness of the generated goto sets
    # (i.e. the same goto set will never be returned as two different Python
    # objects).  With uniqueness, we can later do fast set comparisons using
    # id(obj) instead of element-wise comparison.
+ + def lr0_goto(self, I, x): + # First we look for a previously cached entry + g = self.lr_goto_cache.get((id(I), x)) + if g: + return g + + # Now we generate the goto set in a way that guarantees uniqueness + # of the result + + s = self.lr_goto_cache.get(x) + if not s: + s = {} + self.lr_goto_cache[x] = s + + gs = [] + for p in I: + n = p.lr_next + if n and n.lr_before == x: + s1 = s.get(id(n)) + if not s1: + s1 = {} + s[id(n)] = s1 + gs.append(n) + s = s1 + g = s.get('$end') + if not g: + if gs: + g = self.lr0_closure(gs) + s['$end'] = g + else: + s['$end'] = gs + self.lr_goto_cache[(id(I), x)] = g + return g + + # Compute the LR(0) sets of item function + def lr0_items(self): + C = [self.lr0_closure([self.grammar.Productions[0].lr_next])] + i = 0 + for I in C: + self.lr0_cidhash[id(I)] = i + i += 1 + + # Loop over the items in C and each grammar symbols + i = 0 + while i < len(C): + I = C[i] + i += 1 + + # Collect all of the symbols that could possibly be in the goto(I,X) sets + asyms = {} + for ii in I: + for s in ii.usyms: + asyms[s] = None + + for x in asyms: + g = self.lr0_goto(I, x) + if not g or id(g) in self.lr0_cidhash: + continue + self.lr0_cidhash[id(g)] = len(C) + C.append(g) + + return C + + # ----------------------------------------------------------------------------- + # ==== LALR(1) Parsing ==== + # + # LALR(1) parsing is almost exactly the same as SLR except that instead of + # relying upon Follow() sets when performing reductions, a more selective + # lookahead set that incorporates the state of the LR(0) machine is utilized. + # Thus, we mainly just have to focus on calculating the lookahead sets. + # + # The method used here is due to DeRemer and Pennelo (1982). + # + # DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) + # Lookahead Sets", ACM Transactions on Programming Languages and Systems, + # Vol. 4, No. 4, Oct. 1982, pp. 615-649 + # + # Further details can also be found in: + # + # J. Tremblay and P. 
Sorenson, "The Theory and Practice of Compiler Writing", + # McGraw-Hill Book Company, (1985). + # + # ----------------------------------------------------------------------------- + + # ----------------------------------------------------------------------------- + # compute_nullable_nonterminals() + # + # Creates a dictionary containing all of the non-terminals that might produce + # an empty production. + # ----------------------------------------------------------------------------- + + def compute_nullable_nonterminals(self): + nullable = set() + num_nullable = 0 + while True: + for p in self.grammar.Productions[1:]: + if p.len == 0: + nullable.add(p.name) + continue + for t in p.prod: + if t not in nullable: + break + else: + nullable.add(p.name) + if len(nullable) == num_nullable: + break + num_nullable = len(nullable) + return nullable + + # ----------------------------------------------------------------------------- + # find_nonterminal_trans(C) + # + # Given a set of LR(0) items, this functions finds all of the non-terminal + # transitions. These are transitions in which a dot appears immediately before + # a non-terminal. Returns a list of tuples of the form (state,N) where state + # is the state number and N is the nonterminal symbol. + # + # The input C is the set of LR(0) items. + # ----------------------------------------------------------------------------- + + def find_nonterminal_transitions(self, C): + trans = [] + for stateno, state in enumerate(C): + for p in state: + if p.lr_index < p.len - 1: + t = (stateno, p.prod[p.lr_index+1]) + if t[1] in self.grammar.Nonterminals: + if t not in trans: + trans.append(t) + return trans + + # ----------------------------------------------------------------------------- + # dr_relation() + # + # Computes the DR(p,A) relationships for non-terminal transitions. The input + # is a tuple (state,N) where state is a number and N is a nonterminal symbol. + # + # Returns a list of terminals. 
+ # ----------------------------------------------------------------------------- + + def dr_relation(self, C, trans, nullable): + dr_set = {} + state, N = trans + terms = [] + + g = self.lr0_goto(C[state], N) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index+1] + if a in self.grammar.Terminals: + if a not in terms: + terms.append(a) + + # This extra bit is to handle the start state + if state == 0 and N == self.grammar.Productions[0].prod[0]: + terms.append('$end') + + return terms + + # ----------------------------------------------------------------------------- + # reads_relation() + # + # Computes the READS() relation (p,A) READS (t,C). + # ----------------------------------------------------------------------------- + + def reads_relation(self, C, trans, empty): + # Look for empty transitions + rel = [] + state, N = trans + + g = self.lr0_goto(C[state], N) + j = self.lr0_cidhash.get(id(g), -1) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index + 1] + if a in empty: + rel.append((j, a)) + + return rel + + # ----------------------------------------------------------------------------- + # compute_lookback_includes() + # + # Determines the lookback and includes relations + # + # LOOKBACK: + # + # This relation is determined by running the LR(0) state machine forward. + # For example, starting with a production "N : . A B C", we run it forward + # to obtain "N : A B C ." We then build a relationship between this final + # state and the starting state. These relationships are stored in a dictionary + # lookdict. + # + # INCLUDES: + # + # Computes the INCLUDE() relation (p,A) INCLUDES (p',B). + # + # This relation is used to determine non-terminal transitions that occur + # inside of other non-terminal transition states. 
(p,A) INCLUDES (p', B) + # if the following holds: + # + # B -> LAT, where T -> epsilon and p' -L-> p + # + # L is essentially a prefix (which may be empty), T is a suffix that must be + # able to derive an empty string. State p' must lead to state p with the string L. + # + # ----------------------------------------------------------------------------- + + def compute_lookback_includes(self, C, trans, nullable): + lookdict = {} # Dictionary of lookback relations + includedict = {} # Dictionary of include relations + + # Make a dictionary of non-terminal transitions + dtrans = {} + for t in trans: + dtrans[t] = 1 + + # Loop over all transitions and compute lookbacks and includes + for state, N in trans: + lookb = [] + includes = [] + for p in C[state]: + if p.name != N: + continue + + # Okay, we have a name match. We now follow the production all the way + # through the state machine until we get the . on the right hand side + + lr_index = p.lr_index + j = state + while lr_index < p.len - 1: + lr_index = lr_index + 1 + t = p.prod[lr_index] + + # Check to see if this symbol and state are a non-terminal transition + if (j, t) in dtrans: + # Yes. Okay, there is some chance that this is an includes relation + # the only way to know for certain is whether the rest of the + # production derives empty + + li = lr_index + 1 + while li < p.len: + if p.prod[li] in self.grammar.Terminals: + break # No forget it + if p.prod[li] not in nullable: + break + li = li + 1 + else: + # Appears to be a relation between (j,t) and (state,N) + includes.append((j, t)) + + g = self.lr0_goto(C[j], t) # Go to next set + j = self.lr0_cidhash.get(id(g), -1) # Go to next state + + # When we get here, j is the final state, now we have to locate the production + for r in C[j]: + if r.name != p.name: + continue + if r.len != p.len: + continue + i = 0 + # This look is comparing a production ". A B C" with "A B C ." 
+ while i < r.lr_index: + if r.prod[i] != p.prod[i+1]: + break + i = i + 1 + else: + lookb.append((j, r)) + for i in includes: + if i not in includedict: + includedict[i] = [] + includedict[i].append((state, N)) + lookdict[(state, N)] = lookb + + return lookdict, includedict + + # ----------------------------------------------------------------------------- + # compute_read_sets() + # + # Given a set of LR(0) items, this function computes the read sets. + # + # Inputs: C = Set of LR(0) items + # ntrans = Set of nonterminal transitions + # nullable = Set of empty transitions + # + # Returns a set containing the read sets + # ----------------------------------------------------------------------------- + + def compute_read_sets(self, C, ntrans, nullable): + FP = lambda x: self.dr_relation(C, x, nullable) + R = lambda x: self.reads_relation(C, x, nullable) + F = digraph(ntrans, R, FP) + return F + + # ----------------------------------------------------------------------------- + # compute_follow_sets() + # + # Given a set of LR(0) items, a set of non-terminal transitions, a readset, + # and an include set, this function computes the follow sets + # + # Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} + # + # Inputs: + # ntrans = Set of nonterminal transitions + # readsets = Readset (previously computed) + # inclsets = Include sets (previously computed) + # + # Returns a set containing the follow sets + # ----------------------------------------------------------------------------- + + def compute_follow_sets(self, ntrans, readsets, inclsets): + FP = lambda x: readsets[x] + R = lambda x: inclsets.get(x, []) + F = digraph(ntrans, R, FP) + return F + + # ----------------------------------------------------------------------------- + # add_lookaheads() + # + # Attaches the lookahead symbols to grammar rules. 
+ # + # Inputs: lookbacks - Set of lookback relations + # followset - Computed follow set + # + # This function directly attaches the lookaheads to productions contained + # in the lookbacks set + # ----------------------------------------------------------------------------- + + def add_lookaheads(self, lookbacks, followset): + for trans, lb in lookbacks.items(): + # Loop over productions in lookback + for state, p in lb: + if state not in p.lookaheads: + p.lookaheads[state] = [] + f = followset.get(trans, []) + for a in f: + if a not in p.lookaheads[state]: + p.lookaheads[state].append(a) + + # ----------------------------------------------------------------------------- + # add_lalr_lookaheads() + # + # This function does all of the work of adding lookahead information for use + # with LALR parsing + # ----------------------------------------------------------------------------- + + def add_lalr_lookaheads(self, C): + # Determine all of the nullable nonterminals + nullable = self.compute_nullable_nonterminals() + + # Find all non-terminal transitions + trans = self.find_nonterminal_transitions(C) + + # Compute read sets + readsets = self.compute_read_sets(C, trans, nullable) + + # Compute lookback/includes relations + lookd, included = self.compute_lookback_includes(C, trans, nullable) + + # Compute LALR FOLLOW sets + followsets = self.compute_follow_sets(trans, readsets, included) + + # Add all of the lookaheads + self.add_lookaheads(lookd, followsets) + + # ----------------------------------------------------------------------------- + # lr_parse_table() + # + # This function constructs the parse tables for SLR or LALR + # ----------------------------------------------------------------------------- + def lr_parse_table(self): + Productions = self.grammar.Productions + Precedence = self.grammar.Precedence + goto = self.lr_goto # Goto array + action = self.lr_action # Action array + log = self.log # Logger for output + + actionp = {} # Action production array 
(temporary) + + log.info('Parsing method: %s', self.lr_method) + + # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items + # This determines the number of states + + C = self.lr0_items() + + if self.lr_method == 'LALR': + self.add_lalr_lookaheads(C) + + # Build the parser table, state by state + st = 0 + for I in C: + # Loop over each production in I + actlist = [] # List of actions + st_action = {} + st_actionp = {} + st_goto = {} + log.info('') + log.info('state %d', st) + log.info('') + for p in I: + log.info(' (%d) %s', p.number, p) + log.info('') + + for p in I: + if p.len == p.lr_index + 1: + if p.name == "S'": + # Start symbol. Accept! + st_action['$end'] = 0 + st_actionp['$end'] = p + else: + # We are at the end of a production. Reduce! + if self.lr_method == 'LALR': + laheads = p.lookaheads[st] + else: + laheads = self.grammar.Follow[p.name] + for a in laheads: + actlist.append((a, p, 'reduce using rule %d (%s)' % (p.number, p))) + r = st_action.get(a) + if r is not None: + # Whoa. Have a shift/reduce or reduce/reduce conflict + if r > 0: + # Need to decide on shift or reduce here + # By default we favor shifting. Need to add + # some precedence rules here. + + # Shift precedence comes from the token + sprec, slevel = Precedence.get(a, ('right', 0)) + + # Reduce precedence comes from rule being reduced (p) + rprec, rlevel = Productions[p.number].prec + + if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): + # We really need to reduce here. + st_action[a] = -p.number + st_actionp[a] = p + if not slevel and not rlevel: + log.info(' ! shift/reduce conflict for %s resolved as reduce', a) + self.sr_conflicts.append((st, a, 'reduce')) + Productions[p.number].reduced += 1 + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the shift + if not rlevel: + log.info(' ! 
shift/reduce conflict for %s resolved as shift', a) + self.sr_conflicts.append((st, a, 'shift')) + elif r < 0: + # Reduce/reduce conflict. In this case, we favor the rule + # that was defined first in the grammar file + oldp = Productions[-r] + pp = Productions[p.number] + if oldp.line > pp.line: + st_action[a] = -p.number + st_actionp[a] = p + chosenp, rejectp = pp, oldp + Productions[p.number].reduced += 1 + Productions[oldp.number].reduced -= 1 + else: + chosenp, rejectp = oldp, pp + self.rr_conflicts.append((st, chosenp, rejectp)) + log.info(' ! reduce/reduce conflict for %s resolved using rule %d (%s)', + a, st_actionp[a].number, st_actionp[a]) + else: + raise LALRError('Unknown conflict in state %d' % st) + else: + st_action[a] = -p.number + st_actionp[a] = p + Productions[p.number].reduced += 1 + else: + i = p.lr_index + a = p.prod[i+1] # Get symbol right after the "." + if a in self.grammar.Terminals: + g = self.lr0_goto(I, a) + j = self.lr0_cidhash.get(id(g), -1) + if j >= 0: + # We are in a shift state + actlist.append((a, p, 'shift and go to state %d' % j)) + r = st_action.get(a) + if r is not None: + # Whoa have a shift/reduce or shift/shift conflict + if r > 0: + if r != j: + raise LALRError('Shift/shift conflict in state %d' % st) + elif r < 0: + # Do a precedence check. + # - if precedence of reduce rule is higher, we reduce. + # - if precedence of reduce is same and left assoc, we reduce. + # - otherwise we shift + + # Shift precedence comes from the token + sprec, slevel = Precedence.get(a, ('right', 0)) + + # Reduce precedence comes from the rule that could have been reduced + rprec, rlevel = Productions[st_actionp[a].number].prec + + if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): + # We decide to shift here... highest precedence to shift + Productions[st_actionp[a].number].reduced -= 1 + st_action[a] = j + st_actionp[a] = p + if not rlevel: + log.info(' ! 
shift/reduce conflict for %s resolved as shift', a) + self.sr_conflicts.append((st, a, 'shift')) + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the reduce + if not slevel and not rlevel: + log.info(' ! shift/reduce conflict for %s resolved as reduce', a) + self.sr_conflicts.append((st, a, 'reduce')) + + else: + raise LALRError('Unknown conflict in state %d' % st) + else: + st_action[a] = j + st_actionp[a] = p + + # Print the actions associated with each terminal + _actprint = {} + for a, p, m in actlist: + if a in st_action: + if p is st_actionp[a]: + log.info(' %-15s %s', a, m) + _actprint[(a, m)] = 1 + log.info('') + # Print the actions that were not used. (debugging) + not_used = 0 + for a, p, m in actlist: + if a in st_action: + if p is not st_actionp[a]: + if not (a, m) in _actprint: + log.debug(' ! %-15s [ %s ]', a, m) + not_used = 1 + _actprint[(a, m)] = 1 + if not_used: + log.debug('') + + # Construct the goto table for this state + + nkeys = {} + for ii in I: + for s in ii.usyms: + if s in self.grammar.Nonterminals: + nkeys[s] = None + for n in nkeys: + g = self.lr0_goto(I, n) + j = self.lr0_cidhash.get(id(g), -1) + if j >= 0: + st_goto[n] = j + log.info(' %-30s shift and go to state %d', n, j) + + action[st] = st_action + actionp[st] = st_actionp + goto[st] = st_goto + st += 1 + + # ----------------------------------------------------------------------------- + # write() + # + # This function writes the LR parsing tables to a file + # ----------------------------------------------------------------------------- + + def write_table(self, tabmodule, outputdir='', signature=''): + if isinstance(tabmodule, types.ModuleType): + raise IOError("Won't overwrite existing tabmodule") + + basemodulename = tabmodule.split('.')[-1] + filename = os.path.join(outputdir, basemodulename) + '.py' + try: + f = open(filename, 'w') + + f.write(''' +# %s +# This file is automatically generated. Do not edit. 
+_tabversion = %r + +_lr_method = %r + +_lr_signature = %r + ''' % (os.path.basename(filename), __tabversion__, self.lr_method, signature)) + + # Change smaller to 0 to go back to original tables + smaller = 1 + + # Factor out names to try and make smaller + if smaller: + items = {} + + for s, nd in self.lr_action.items(): + for name, v in nd.items(): + i = items.get(name) + if not i: + i = ([], []) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write('\n_lr_action_items = {') + for k, v in items.items(): + f.write('%r:([' % k) + for i in v[0]: + f.write('%r,' % i) + f.write('],[') + for i in v[1]: + f.write('%r,' % i) + + f.write(']),') + f.write('}\n') + + f.write(''' +_lr_action = {} +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + if not _x in _lr_action: _lr_action[_x] = {} + _lr_action[_x][_k] = _y +del _lr_action_items +''') + + else: + f.write('\n_lr_action = { ') + for k, v in self.lr_action.items(): + f.write('(%r,%r):%r,' % (k[0], k[1], v)) + f.write('}\n') + + if smaller: + # Factor out names to try and make smaller + items = {} + + for s, nd in self.lr_goto.items(): + for name, v in nd.items(): + i = items.get(name) + if not i: + i = ([], []) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write('\n_lr_goto_items = {') + for k, v in items.items(): + f.write('%r:([' % k) + for i in v[0]: + f.write('%r,' % i) + f.write('],[') + for i in v[1]: + f.write('%r,' % i) + + f.write(']),') + f.write('}\n') + + f.write(''' +_lr_goto = {} +for _k, _v in _lr_goto_items.items(): + for _x, _y in zip(_v[0], _v[1]): + if not _x in _lr_goto: _lr_goto[_x] = {} + _lr_goto[_x][_k] = _y +del _lr_goto_items +''') + else: + f.write('\n_lr_goto = { ') + for k, v in self.lr_goto.items(): + f.write('(%r,%r):%r,' % (k[0], k[1], v)) + f.write('}\n') + + # Write production table + f.write('_lr_productions = [\n') + for p in self.lr_productions: + if p.func: + f.write(' (%r,%r,%d,%r,%r,%d),\n' % (p.str, p.name, p.len, + p.func, 
os.path.basename(p.file), p.line)) + else: + f.write(' (%r,%r,%d,None,None,None),\n' % (str(p), p.name, p.len)) + f.write(']\n') + f.close() + + except IOError as e: + raise + + + # ----------------------------------------------------------------------------- + # pickle_table() + # + # This function pickles the LR parsing tables to a supplied file object + # ----------------------------------------------------------------------------- + + def pickle_table(self, filename, signature=''): + try: + import cPickle as pickle + except ImportError: + import pickle + with open(filename, 'wb') as outf: + pickle.dump(__tabversion__, outf, pickle_protocol) + pickle.dump(self.lr_method, outf, pickle_protocol) + pickle.dump(signature, outf, pickle_protocol) + pickle.dump(self.lr_action, outf, pickle_protocol) + pickle.dump(self.lr_goto, outf, pickle_protocol) + + outp = [] + for p in self.lr_productions: + if p.func: + outp.append((p.str, p.name, p.len, p.func, os.path.basename(p.file), p.line)) + else: + outp.append((str(p), p.name, p.len, None, None, None)) + pickle.dump(outp, outf, pickle_protocol) + +# ----------------------------------------------------------------------------- +# === INTROSPECTION === +# +# The following functions and classes are used to implement the PLY +# introspection features followed by the yacc() function itself. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# get_caller_module_dict() +# +# This function returns a dictionary containing all of the symbols defined within +# a caller further down the call stack. This is used to get the environment +# associated with the yacc() call if none was provided. 
+# ----------------------------------------------------------------------------- + +def get_caller_module_dict(levels): + f = sys._getframe(levels) + ldict = f.f_globals.copy() + if f.f_globals != f.f_locals: + ldict.update(f.f_locals) + return ldict + +# ----------------------------------------------------------------------------- +# parse_grammar() +# +# This takes a raw grammar rule string and parses it into production data +# ----------------------------------------------------------------------------- +def parse_grammar(doc, file, line): + grammar = [] + # Split the doc string into lines + pstrings = doc.splitlines() + lastp = None + dline = line + for ps in pstrings: + dline += 1 + p = ps.split() + if not p: + continue + try: + if p[0] == '|': + # This is a continuation of a previous rule + if not lastp: + raise SyntaxError("%s:%d: Misplaced '|'" % (file, dline)) + prodname = lastp + syms = p[1:] + else: + prodname = p[0] + lastp = prodname + syms = p[2:] + assign = p[1] + if assign != ':' and assign != '::=': + raise SyntaxError("%s:%d: Syntax error. Expected ':'" % (file, dline)) + + grammar.append((file, dline, prodname, syms)) + except SyntaxError: + raise + except Exception: + raise SyntaxError('%s:%d: Syntax error in rule %r' % (file, dline, ps.strip())) + + return grammar + +# ----------------------------------------------------------------------------- +# ParserReflect() +# +# This class represents information extracted for building a parser including +# start symbol, error function, tokens, precedence list, action functions, +# etc. 
+# ----------------------------------------------------------------------------- +class ParserReflect(object): + def __init__(self, pdict, log=None): + self.pdict = pdict + self.start = None + self.error_func = None + self.tokens = None + self.modules = set() + self.grammar = [] + self.error = False + + if log is None: + self.log = PlyLogger(sys.stderr) + else: + self.log = log + + # Get all of the basic information + def get_all(self): + self.get_start() + self.get_error_func() + self.get_tokens() + self.get_precedence() + self.get_pfunctions() + + # Validate all of the information + def validate_all(self): + self.validate_start() + self.validate_error_func() + self.validate_tokens() + self.validate_precedence() + self.validate_pfunctions() + self.validate_modules() + return self.error + + # Compute a signature over the grammar + def signature(self): + try: + from hashlib import md5 + except ImportError: + from md5 import md5 + try: + sig = md5() + if self.start: + sig.update(self.start.encode('latin-1')) + if self.prec: + sig.update(''.join([''.join(p) for p in self.prec]).encode('latin-1')) + if self.tokens: + sig.update(' '.join(self.tokens).encode('latin-1')) + for f in self.pfuncs: + if f[3]: + sig.update(f[3].encode('latin-1')) + except (TypeError, ValueError): + pass + + digest = base64.b16encode(sig.digest()) + if sys.version_info[0] >= 3: + digest = digest.decode('latin-1') + return digest + + # ----------------------------------------------------------------------------- + # validate_modules() + # + # This method checks to see if there are duplicated p_rulename() functions + # in the parser module file. Without this function, it is really easy for + # users to make mistakes by cutting and pasting code fragments (and it's a real + # bugger to try and figure out why the resulting parser doesn't work). Therefore, + # we just do a little regular expression pattern matching of def statements + # to try and detect duplicates. 
+ # ----------------------------------------------------------------------------- + + def validate_modules(self): + # Match def p_funcname( + fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') + + for module in self.modules: + try: + lines, linen = inspect.getsourcelines(module) + except IOError: + continue + + counthash = {} + for linen, line in enumerate(lines): + linen += 1 + m = fre.match(line) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + filename = inspect.getsourcefile(module) + self.log.warning('%s:%d: Function %s redefined. Previously defined on line %d', + filename, linen, name, prev) + + # Get the start symbol + def get_start(self): + self.start = self.pdict.get('start') + + # Validate the start symbol + def validate_start(self): + if self.start is not None: + if not isinstance(self.start, string_types): + self.log.error("'start' must be a string") + + # Look for error handler + def get_error_func(self): + self.error_func = self.pdict.get('p_error') + + # Validate the error function + def validate_error_func(self): + if self.error_func: + if isinstance(self.error_func, types.FunctionType): + ismethod = 0 + elif isinstance(self.error_func, types.MethodType): + ismethod = 1 + else: + self.log.error("'p_error' defined, but is not a function or method") + self.error = True + return + + eline = self.error_func.__code__.co_firstlineno + efile = self.error_func.__code__.co_filename + module = inspect.getmodule(self.error_func) + self.modules.add(module) + + argcount = self.error_func.__code__.co_argcount - ismethod + if argcount != 1: + self.log.error('%s:%d: p_error() requires 1 argument', efile, eline) + self.error = True + + # Get the tokens map + def get_tokens(self): + tokens = self.pdict.get('tokens') + if not tokens: + self.log.error('No token list is defined') + self.error = True + return + + if not isinstance(tokens, (list, tuple)): + self.log.error('tokens must be a list or tuple') + 
self.error = True + return + + if not tokens: + self.log.error('tokens is empty') + self.error = True + return + + self.tokens = tokens + + # Validate the tokens + def validate_tokens(self): + # Validate the tokens. + if 'error' in self.tokens: + self.log.error("Illegal token name 'error'. Is a reserved word") + self.error = True + return + + terminals = set() + for n in self.tokens: + if n in terminals: + self.log.warning('Token %r multiply defined', n) + terminals.add(n) + + # Get the precedence map (if any) + def get_precedence(self): + self.prec = self.pdict.get('precedence') + + # Validate and parse the precedence map + def validate_precedence(self): + preclist = [] + if self.prec: + if not isinstance(self.prec, (list, tuple)): + self.log.error('precedence must be a list or tuple') + self.error = True + return + for level, p in enumerate(self.prec): + if not isinstance(p, (list, tuple)): + self.log.error('Bad precedence table') + self.error = True + return + + if len(p) < 2: + self.log.error('Malformed precedence entry %s. 
Must be (assoc, term, ..., term)', p) + self.error = True + return + assoc = p[0] + if not isinstance(assoc, string_types): + self.log.error('precedence associativity must be a string') + self.error = True + return + for term in p[1:]: + if not isinstance(term, string_types): + self.log.error('precedence items must be strings') + self.error = True + return + preclist.append((term, assoc, level+1)) + self.preclist = preclist + + # Get all p_functions from the grammar + def get_pfunctions(self): + p_functions = [] + for name, item in self.pdict.items(): + if not name.startswith('p_') or name == 'p_error': + continue + if isinstance(item, (types.FunctionType, types.MethodType)): + line = getattr(item, 'co_firstlineno', item.__code__.co_firstlineno) + module = inspect.getmodule(item) + p_functions.append((line, module, name, item.__doc__)) + + # Sort all of the actions by line number; make sure to stringify + # modules to make them sortable, since `line` may not uniquely sort all + # p functions + p_functions.sort(key=lambda p_function: ( + p_function[0], + str(p_function[1]), + p_function[2], + p_function[3])) + self.pfuncs = p_functions + + # Validate all of the p_functions + def validate_pfunctions(self): + grammar = [] + # Check for non-empty symbols + if len(self.pfuncs) == 0: + self.log.error('no rules of the form p_rulename are defined') + self.error = True + return + + for line, module, name, doc in self.pfuncs: + file = inspect.getsourcefile(module) + func = self.pdict[name] + if isinstance(func, types.MethodType): + reqargs = 2 + else: + reqargs = 1 + if func.__code__.co_argcount > reqargs: + self.log.error('%s:%d: Rule %r has too many arguments', file, line, func.__name__) + self.error = True + elif func.__code__.co_argcount < reqargs: + self.log.error('%s:%d: Rule %r requires an argument', file, line, func.__name__) + self.error = True + elif not func.__doc__: + self.log.warning('%s:%d: No documentation string specified in function %r (ignored)', + file, 
line, func.__name__) + else: + try: + parsed_g = parse_grammar(doc, file, line) + for g in parsed_g: + grammar.append((name, g)) + except SyntaxError as e: + self.log.error(str(e)) + self.error = True + + # Looks like a valid grammar rule + # Mark the file in which defined. + self.modules.add(module) + + # Secondary validation step that looks for p_ definitions that are not functions + # or functions that look like they might be grammar rules. + + for n, v in self.pdict.items(): + if n.startswith('p_') and isinstance(v, (types.FunctionType, types.MethodType)): + continue + if n.startswith('t_'): + continue + if n.startswith('p_') and n != 'p_error': + self.log.warning('%r not defined as a function', n) + if ((isinstance(v, types.FunctionType) and v.__code__.co_argcount == 1) or + (isinstance(v, types.MethodType) and v.__func__.__code__.co_argcount == 2)): + if v.__doc__: + try: + doc = v.__doc__.split(' ') + if doc[1] == ':': + self.log.warning('%s:%d: Possible grammar rule %r defined without p_ prefix', + v.__code__.co_filename, v.__code__.co_firstlineno, n) + except IndexError: + pass + + self.grammar = grammar + +# ----------------------------------------------------------------------------- +# yacc(module) +# +# Build a parser +# ----------------------------------------------------------------------------- + +def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None, + check_recursion=True, optimize=False, write_tables=True, debugfile=debug_file, + outputdir=None, debuglog=None, errorlog=None, picklefile=None): + + if tabmodule is None: + tabmodule = tab_module + + # Reference to the parsing method of the last built parser + global parse + + # If pickling is enabled, table files are not created + if picklefile: + write_tables = 0 + + if errorlog is None: + errorlog = PlyLogger(sys.stderr) + + # Get the module dictionary used for the parser + if module: + _items = [(k, getattr(module, k)) for k in dir(module)] + pdict = dict(_items) 
+ # If no __file__ attribute is available, try to obtain it from the __module__ instead + if '__file__' not in pdict: + pdict['__file__'] = sys.modules[pdict['__module__']].__file__ + else: + pdict = get_caller_module_dict(2) + + if outputdir is None: + # If no output directory is set, the location of the output files + # is determined according to the following rules: + # - If tabmodule specifies a package, files go into that package directory + # - Otherwise, files go in the same directory as the specifying module + if isinstance(tabmodule, types.ModuleType): + srcfile = tabmodule.__file__ + else: + if '.' not in tabmodule: + srcfile = pdict['__file__'] + else: + parts = tabmodule.split('.') + pkgname = '.'.join(parts[:-1]) + exec('import %s' % pkgname) + srcfile = getattr(sys.modules[pkgname], '__file__', '') + outputdir = os.path.dirname(srcfile) + + # Determine if the module is package of a package or not. + # If so, fix the tabmodule setting so that tables load correctly + pkg = pdict.get('__package__') + if pkg and isinstance(tabmodule, str): + if '.' not in tabmodule: + tabmodule = pkg + '.' 
+ tabmodule + + + + # Set start symbol if it's specified directly using an argument + if start is not None: + pdict['start'] = start + + # Collect parser information from the dictionary + pinfo = ParserReflect(pdict, log=errorlog) + pinfo.get_all() + + if pinfo.error: + raise YaccError('Unable to build parser') + + # Check signature against table files (if any) + signature = pinfo.signature() + + # Read the tables + try: + lr = LRTable() + if picklefile: + read_signature = lr.read_pickle(picklefile) + else: + read_signature = lr.read_table(tabmodule) + if optimize or (read_signature == signature): + try: + lr.bind_callables(pinfo.pdict) + parser = LRParser(lr, pinfo.error_func) + parse = parser.parse + return parser + except Exception as e: + errorlog.warning('There was a problem loading the table file: %r', e) + except VersionError as e: + errorlog.warning(str(e)) + except ImportError: + pass + + if debuglog is None: + if debug: + try: + debuglog = PlyLogger(open(os.path.join(outputdir, debugfile), 'w')) + except IOError as e: + errorlog.warning("Couldn't open %r. 
%s" % (debugfile, e)) + debuglog = NullLogger() + else: + debuglog = NullLogger() + + debuglog.info('Created by PLY version %s (http://www.dabeaz.com/ply)', __version__) + + errors = False + + # Validate the parser information + if pinfo.validate_all(): + raise YaccError('Unable to build parser') + + if not pinfo.error_func: + errorlog.warning('no p_error() function is defined') + + # Create a grammar object + grammar = Grammar(pinfo.tokens) + + # Set precedence level for terminals + for term, assoc, level in pinfo.preclist: + try: + grammar.set_precedence(term, assoc, level) + except GrammarError as e: + errorlog.warning('%s', e) + + # Add productions to the grammar + for funcname, gram in pinfo.grammar: + file, line, prodname, syms = gram + try: + grammar.add_production(prodname, syms, funcname, file, line) + except GrammarError as e: + errorlog.error('%s', e) + errors = True + + # Set the grammar start symbols + try: + if start is None: + grammar.set_start(pinfo.start) + else: + grammar.set_start(start) + except GrammarError as e: + errorlog.error(str(e)) + errors = True + + if errors: + raise YaccError('Unable to build parser') + + # Verify the grammar structure + undefined_symbols = grammar.undefined_symbols() + for sym, prod in undefined_symbols: + errorlog.error('%s:%d: Symbol %r used, but not defined as a token or a rule', prod.file, prod.line, sym) + errors = True + + unused_terminals = grammar.unused_terminals() + if unused_terminals: + debuglog.info('') + debuglog.info('Unused terminals:') + debuglog.info('') + for term in unused_terminals: + errorlog.warning('Token %r defined, but not used', term) + debuglog.info(' %s', term) + + # Print out all productions to the debug log + if debug: + debuglog.info('') + debuglog.info('Grammar') + debuglog.info('') + for n, p in enumerate(grammar.Productions): + debuglog.info('Rule %-5d %s', n, p) + + # Find unused non-terminals + unused_rules = grammar.unused_rules() + for prod in unused_rules: + 
errorlog.warning('%s:%d: Rule %r defined, but not used', prod.file, prod.line, prod.name) + + if len(unused_terminals) == 1: + errorlog.warning('There is 1 unused token') + if len(unused_terminals) > 1: + errorlog.warning('There are %d unused tokens', len(unused_terminals)) + + if len(unused_rules) == 1: + errorlog.warning('There is 1 unused rule') + if len(unused_rules) > 1: + errorlog.warning('There are %d unused rules', len(unused_rules)) + + if debug: + debuglog.info('') + debuglog.info('Terminals, with rules where they appear') + debuglog.info('') + terms = list(grammar.Terminals) + terms.sort() + for term in terms: + debuglog.info('%-20s : %s', term, ' '.join([str(s) for s in grammar.Terminals[term]])) + + debuglog.info('') + debuglog.info('Nonterminals, with rules where they appear') + debuglog.info('') + nonterms = list(grammar.Nonterminals) + nonterms.sort() + for nonterm in nonterms: + debuglog.info('%-20s : %s', nonterm, ' '.join([str(s) for s in grammar.Nonterminals[nonterm]])) + debuglog.info('') + + if check_recursion: + unreachable = grammar.find_unreachable() + for u in unreachable: + errorlog.warning('Symbol %r is unreachable', u) + + infinite = grammar.infinite_cycles() + for inf in infinite: + errorlog.error('Infinite recursion detected for symbol %r', inf) + errors = True + + unused_prec = grammar.unused_precedence() + for term, assoc in unused_prec: + errorlog.error('Precedence rule %r defined for unknown symbol %r', assoc, term) + errors = True + + if errors: + raise YaccError('Unable to build parser') + + # Run the LRGeneratedTable on the grammar + if debug: + errorlog.debug('Generating %s tables', method) + + lr = LRGeneratedTable(grammar, method, debuglog) + + if debug: + num_sr = len(lr.sr_conflicts) + + # Report shift/reduce and reduce/reduce conflicts + if num_sr == 1: + errorlog.warning('1 shift/reduce conflict') + elif num_sr > 1: + errorlog.warning('%d shift/reduce conflicts', num_sr) + + num_rr = len(lr.rr_conflicts) + if num_rr == 
1: + errorlog.warning('1 reduce/reduce conflict') + elif num_rr > 1: + errorlog.warning('%d reduce/reduce conflicts', num_rr) + + # Write out conflicts to the output file + if debug and (lr.sr_conflicts or lr.rr_conflicts): + debuglog.warning('') + debuglog.warning('Conflicts:') + debuglog.warning('') + + for state, tok, resolution in lr.sr_conflicts: + debuglog.warning('shift/reduce conflict for %s in state %d resolved as %s', tok, state, resolution) + + already_reported = set() + for state, rule, rejected in lr.rr_conflicts: + if (state, id(rule), id(rejected)) in already_reported: + continue + debuglog.warning('reduce/reduce conflict in state %d resolved using rule (%s)', state, rule) + debuglog.warning('rejected rule (%s) in state %d', rejected, state) + errorlog.warning('reduce/reduce conflict in state %d resolved using rule (%s)', state, rule) + errorlog.warning('rejected rule (%s) in state %d', rejected, state) + already_reported.add((state, id(rule), id(rejected))) + + warned_never = [] + for state, rule, rejected in lr.rr_conflicts: + if not rejected.reduced and (rejected not in warned_never): + debuglog.warning('Rule (%s) is never reduced', rejected) + errorlog.warning('Rule (%s) is never reduced', rejected) + warned_never.append(rejected) + + # Write the table file if requested + if write_tables: + try: + lr.write_table(tabmodule, outputdir, signature) + except IOError as e: + errorlog.warning("Couldn't create %r. %s" % (tabmodule, e)) + + # Write a pickled version of the tables + if picklefile: + try: + lr.pickle_table(picklefile, signature) + except IOError as e: + errorlog.warning("Couldn't create %r. 
# ply: ygen.py
#
# This is a support program that auto-generates different versions of the YACC parsing
# function with different features removed for the purposes of performance.
#
# Users should edit the method LParser.parsedebug() in yacc.py.   The source code
# for that method is then used to create the other methods.   See the comments in
# yacc.py for further details.

import os.path
import shutil


def get_source_range(lines, tag):
    """Locate the body of a tagged section inside a list of source lines.

    A section is delimited by a line that (after stripping) starts with
    ``#--! <tag>-start`` and a later line that (after stripping) ends with
    ``#--! <tag>-end``.

    Args:
        lines: Sequence of source lines to scan.
        tag: Section name, e.g. ``'parsedebug'``.

    Returns:
        A ``(start, end)`` tuple suitable for slicing: ``lines[start:end]`` is
        everything strictly between the two marker lines.

    Raises:
        ValueError: If the start marker is missing, or no end marker follows
            it.  Without this check a missing marker either failed with an
            unrelated UnboundLocalError or silently produced a range running
            to the last line, which main() would then overwrite.
    """
    start_tag = '#--! %s-start' % tag
    end_tag = '#--! %s-end' % tag

    # A single shared iterator: the search for the end marker resumes on the
    # line right after the start marker.
    srclines = enumerate(lines)

    for start_index, line in srclines:
        if line.strip().startswith(start_tag):
            break
    else:
        raise ValueError('Start marker %r not found' % start_tag)

    for end_index, line in srclines:
        if line.strip().endswith(end_tag):
            break
    else:
        raise ValueError('End marker %r not found after %r' % (end_tag, start_tag))

    return (start_index + 1, end_index)


def filter_section(lines, tag):
    """Return a copy of *lines* with every ``#--! <tag>`` delimited region removed.

    Each line that (after stripping) starts with ``#--! <tag>`` toggles between
    keeping and dropping lines.  The marker lines themselves are never kept.

    Args:
        lines: Sequence of source lines.
        tag: Marker name, e.g. ``'DEBUG'`` or ``'TRACKING'``.

    Returns:
        A new list holding only the lines outside the marked regions.
    """
    filtered_lines = []
    include = True
    tag_text = '#--! %s' % tag
    for line in lines:
        if line.strip().startswith(tag_text):
            include = not include
        elif include:
            filtered_lines.append(line)
    return filtered_lines


def main():
    """Regenerate the optimized parse methods of the yacc.py next to this script.

    Copies yacc.py to yacc.py.bak, takes the 'parsedebug' section as the
    master source, strips the DEBUG (and then TRACKING) regions from it, and
    writes the results into the 'parseopt' and 'parseopt-notrack' sections.
    Trailing whitespace is stripped from every line of the rewritten file.

    Raises:
        ValueError: If any of the three section markers cannot be found.
            yacc.py has not been modified at that point.
    """
    dirname = os.path.dirname(__file__)
    yacc_path = os.path.join(dirname, 'yacc.py')

    shutil.copy2(yacc_path, os.path.join(dirname, 'yacc.py.bak'))
    with open(yacc_path, 'r') as f:
        lines = f.readlines()

    parse_start, parse_end = get_source_range(lines, 'parsedebug')
    parseopt_start, parseopt_end = get_source_range(lines, 'parseopt')
    parseopt_notrack_start, parseopt_notrack_end = get_source_range(lines, 'parseopt-notrack')

    # Get the original source
    orig_lines = lines[parse_start:parse_end]

    # Filter the DEBUG sections out
    parseopt_lines = filter_section(orig_lines, 'DEBUG')

    # Filter the TRACKING sections out
    parseopt_notrack_lines = filter_section(parseopt_lines, 'TRACKING')

    # Replace the parser source sections with updated versions.
    # NOTE(review): the notrack section is replaced first, so the parseopt
    # indices computed above stay valid only if parseopt-notrack sits after
    # parseopt in yacc.py -- confirm against the file layout.
    lines[parseopt_notrack_start:parseopt_notrack_end] = parseopt_notrack_lines
    lines[parseopt_start:parseopt_end] = parseopt_lines

    lines = [line.rstrip()+'\n' for line in lines]
    with open(yacc_path, 'w') as f:
        f.writelines(lines)

    print('Updated yacc.py')


if __name__ == '__main__':
    main()
Some notable +features include the fact that its implemented entirely in Python and it +uses LALR(1) parsing which is efficient and well suited for larger grammars. + +PLY provides most of the standard lex/yacc features including support for empty +productions, precedence rules, error recovery, and support for ambiguous grammars. + +PLY is extremely easy to use and provides very extensive error checking. +It is compatible with both Python 2 and Python 3. +""", + license="""BSD""", + version = "3.10", + author = "David Beazley", + author_email = "dave@dabeaz.com", + maintainer = "David Beazley", + maintainer_email = "dave@dabeaz.com", + url = "http://www.dabeaz.com/ply/", + packages = ['ply'], + classifiers = [ + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 2', + ] + ) diff --git a/test/README b/test/README new file mode 100644 index 000000000..52f032a99 --- /dev/null +++ b/test/README @@ -0,0 +1,7 @@ +This directory mostly contains tests for various types of error +conditions. To run: + + $ python testlex.py + $ python testyacc.py + +The script 'cleanup.sh' cleans up this directory to its original state. diff --git a/test/calclex.py b/test/calclex.py new file mode 100644 index 000000000..030a9863d --- /dev/null +++ b/test/calclex.py @@ -0,0 +1,49 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lexer = lex.lex() + + + diff --git a/test/cleanup.sh b/test/cleanup.sh new file mode 100755 index 000000000..9374f2c60 --- /dev/null +++ b/test/cleanup.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +rm -rf *~ *.pyc *.pyo *.dif *.out __pycache__ + diff --git a/test/lex_closure.py b/test/lex_closure.py new file mode 100644 index 000000000..30ee67912 --- /dev/null +++ b/test/lex_closure.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# lex_closure.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
def make_calc():
    """Build and return a calculator lexer whose rules are locals of this function.

    The ``t_*`` names below are never referenced here; presumably lex.lex()
    collects them from the calling frame, so they must keep these exact names.
    Rule docstrings are the token regular expressions, so explanatory text for
    a rule goes in ``#`` comments rather than in its docstring.
    """

    # Tokens

    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_TIMES = r'\*'
    t_DIVIDE = r'/'
    t_EQUALS = r'='
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    def t_NUMBER(t):
        r'\d+'
        # Convert the matched digits to an int, falling back to 0 on failure.
        try:
            t.value = int(t.value)
        except ValueError:
            print("Integer value too large %s" % t.value)
            t.value = 0
        return t

    t_ignore = " \t"

    def t_newline(t):
        r'\n+'
        # Count newlines on the lexer, not on the token: this rule returns
        # nothing, so an update to t.lineno would be lost with the token.
        # This matches calclex.py, which also updates t.lexer.lineno.
        t.lexer.lineno += t.value.count("\n")

    def t_error(t):
        # Report the offending character and resume one character later.
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)

    # Build the lexer
    return lex.lex()
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +def t_NUMBER(t): + r'\d+' + pass + +def t_NUMBER(t): + r'\d+' + pass + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_dup3.py b/test/lex_dup3.py new file mode 100644 index 000000000..94b5592eb --- /dev/null +++ b/test/lex_dup3.py @@ -0,0 +1,31 @@ +# lex_dup3.py +# +# Duplicated rule specifiers + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_NUMBER(t): + r'\d+' + pass + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_empty.py b/test/lex_empty.py new file mode 100644 index 000000000..e0368bfad --- /dev/null +++ b/test/lex_empty.py @@ -0,0 +1,20 @@ +# lex_empty.py +# +# No rules defined + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + + + +lex.lex() + + diff --git a/test/lex_error1.py b/test/lex_error1.py new file mode 100644 index 000000000..4508a8084 --- /dev/null +++ b/test/lex_error1.py @@ -0,0 +1,24 @@ +# lex_error1.py +# +# Missing t_error() rule + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + + + +lex.lex() + + diff --git a/test/lex_error2.py b/test/lex_error2.py new file mode 100644 index 000000000..8040d3902 --- /dev/null +++ b/test/lex_error2.py @@ -0,0 +1,26 @@ +# lex_error2.py +# +# t_error defined, but not function + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +t_error = "foo" + + + +lex.lex() + + diff --git a/test/lex_error3.py b/test/lex_error3.py new file mode 100644 index 000000000..1feefb649 --- /dev/null +++ b/test/lex_error3.py @@ -0,0 +1,27 @@ +# lex_error3.py +# +# t_error defined as function, but with wrong # args + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(): + pass + + + +lex.lex() + + diff --git a/test/lex_error4.py b/test/lex_error4.py new file mode 100644 index 000000000..f4f48db13 --- /dev/null +++ b/test/lex_error4.py @@ -0,0 +1,27 @@ +# lex_error4.py +# +# t_error defined as function, but too many args + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(t,s): + pass + + + +lex.lex() + + diff --git a/test/lex_hedit.py b/test/lex_hedit.py new file mode 100644 index 000000000..34f15a173 --- /dev/null +++ b/test/lex_hedit.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# hedit.py +# +# Paring of Fortran H Edit descriptions (Contributed by Pearu Peterson) +# +# These tokens can't be easily tokenized because they are of the following +# form: +# +# nHc1...cn +# +# where n is a positive integer and c1 ... cn are characters. +# +# This example shows how to modify the state of the lexer to parse +# such tokens +# ----------------------------------------------------------------------------- +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = ( + 'H_EDIT_DESCRIPTOR', + ) + +# Tokens +t_ignore = " \t\n" + +def t_H_EDIT_DESCRIPTOR(t): + r"\d+H.*" # This grabs all of the remaining text + i = t.value.index('H') + n = eval(t.value[:i]) + + # Adjust the tokenizing position + t.lexer.lexpos -= len(t.value) - (i+1+n) + t.value = t.value[i+1:i+1+n] + return t + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lex.lex() +lex.runmain(data="3Habc 10Habcdefghij 2Hxy") + + + diff --git a/test/lex_ignore.py b/test/lex_ignore.py new file mode 100644 index 000000000..6c43b4cff --- /dev/null +++ b/test/lex_ignore.py @@ -0,0 +1,31 @@ +# lex_ignore.py +# +# Improperly specific ignore declaration + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_ignore(t): + ' \t' + pass + +def t_error(t): + pass + +import sys + +lex.lex() + + diff --git a/test/lex_ignore2.py b/test/lex_ignore2.py new file mode 100644 index 000000000..f60987a6b --- /dev/null +++ b/test/lex_ignore2.py @@ -0,0 +1,29 @@ +# lex_ignore2.py +# +# ignore declaration as a raw string + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +t_ignore = r' \t' + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_literal1.py b/test/lex_literal1.py new file mode 100644 index 000000000..db389c37c --- /dev/null +++ b/test/lex_literal1.py @@ -0,0 +1,25 @@ +# lex_literal1.py +# +# Bad literal specification + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "NUMBER", + ] + +literals = ["+","-","**"] + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_literal2.py b/test/lex_literal2.py new file mode 100644 index 000000000..b50b92cd6 --- /dev/null +++ b/test/lex_literal2.py @@ -0,0 +1,25 @@ +# lex_literal2.py +# +# Bad literal specification + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "NUMBER", + ] + +literals = 23 + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_literal3.py b/test/lex_literal3.py new file mode 100644 index 000000000..91ab980c8 --- /dev/null +++ b/test/lex_literal3.py @@ -0,0 +1,26 @@ +# lex_literal3.py +# +# An empty literal specification given as a list +# Issue 8 : Literals empty list causes IndexError + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "NUMBER", + ] + +literals = [] + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_many_tokens.py b/test/lex_many_tokens.py new file mode 100644 index 000000000..77ae12baf --- /dev/null +++ b/test/lex_many_tokens.py @@ -0,0 +1,27 @@ +# lex_many_tokens.py +# +# Test lex's ability to handle a large number of tokens (beyond the +# 100-group limit of the re module) + +import sys +if ".." 
# -----------------------------------------------------------------------------
# lex_module_import.py
#
# A lexer defined in a module, but built in lex_module.py
#
# Rule docstrings are the token regular expressions, so explanatory text for a
# rule is written as '#' comments rather than added to its docstring.
# -----------------------------------------------------------------------------

tokens = (
    'NAME','NUMBER',
    'PLUS','MINUS','TIMES','DIVIDE','EQUALS',
    'LPAREN','RPAREN',
    )

# Tokens

t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_EQUALS = r'='
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

def t_NUMBER(t):
    r'\d+'
    # Convert the matched digits to an int, falling back to 0 on failure.
    try:
        t.value = int(t.value)
    except ValueError:
        print("Integer value too large %s" % t.value)
        t.value = 0
    return t

t_ignore = " \t"

def t_newline(t):
    r'\n+'
    # Count newlines on the lexer, not on the token: this rule returns
    # nothing, so an update to t.lineno would be lost with the token.
    t.lexer.lineno += t.value.count("\n")

def t_error(t):
    # Report the offending character and resume one character later.
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
class CalcLexer:
    # Calculator lexer specification packaged as a class; an instance is
    # handed to lex.lex(object=...) by the surrounding script.
    #
    # Rule docstrings are the token regular expressions, so explanatory text
    # for a rule is written as '#' comments rather than added to its docstring.

    tokens = (
        'NAME','NUMBER',
        'PLUS','MINUS','TIMES','DIVIDE','EQUALS',
        'LPAREN','RPAREN',
        )

    # Tokens

    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_TIMES = r'\*'
    t_DIVIDE = r'/'
    t_EQUALS = r'='
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

    def t_NUMBER(self,t):
        r'\d+'
        # Convert the matched digits to an int, falling back to 0 on failure.
        try:
            t.value = int(t.value)
        except ValueError:
            print("Integer value too large %s" % t.value)
            t.value = 0
        return t

    t_ignore = " \t"

    def t_newline(self,t):
        r'\n+'
        # Count newlines on the lexer, not on the token: this rule returns
        # nothing, so an update to t.lineno would be lost with the token.
        t.lexer.lineno += t.value.count("\n")

    def t_error(self,t):
        # Report the offending character and resume one character later.
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)
not in sys.path: sys.path.insert(0,"..") + +tokens = ( + 'NAME','NUMBER', + ) + +states = (('instdef','inclusive'),('spam','exclusive')) + +literals = ['=','+','-','*','/', '(',')'] + +# Tokens + +def t_instdef_spam_BITS(t): + r'[01-]+' + return t + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ANY_NUMBER = NUMBER + +t_ignore = " \t" +t_spam_ignore = t_ignore + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +t_spam_error = t_error + +# Build the lexer +import ply.lex as lex +lex.lex(optimize=1,lextab="aliastab") +lex.runmain(data="3+4") diff --git a/test/lex_optimize.py b/test/lex_optimize.py new file mode 100644 index 000000000..0e447e668 --- /dev/null +++ b/test/lex_optimize.py @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# lex_optimize.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
def t_newline(t):
    r'\n+'
    # Count newlines on the lexer, not on the token: this rule returns
    # nothing, so an update to t.lineno would be lost with the token.
    # The docstring above is the rule's regular expression and must stay as is.
    t.lexer.lineno += t.value.count("\n")
def t_newline(t):
    r'\n+'
    # Count newlines on the lexer, not on the token: this rule returns
    # nothing, so an update to t.lineno would be lost with the token.
    # The docstring above is the rule's regular expression and must stay as is.
    t.lexer.lineno += t.value.count("\n")
def t_newline(t):
    r'\n+'
    # Count newlines on the lexer, not on the token: this rule returns
    # nothing, so an update to t.lineno would be lost with the token.
    # The docstring above is the rule's regular expression and must stay as is.
    t.lexer.lineno += t.value.count("\n")
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + "POUND", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'(\d+)' +t_POUND = r'#' + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_rule1.py b/test/lex_rule1.py new file mode 100644 index 000000000..0406c6f30 --- /dev/null +++ b/test/lex_rule1.py @@ -0,0 +1,27 @@ +# lex_rule1.py +# +# Rule function with incorrect number of arguments + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = 1 + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_rule2.py b/test/lex_rule2.py new file mode 100644 index 000000000..1c29d8737 --- /dev/null +++ b/test/lex_rule2.py @@ -0,0 +1,29 @@ +# lex_rule2.py +# +# Rule function with incorrect number of arguments + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +def t_NUMBER(): + r'\d+' + return t + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_rule3.py b/test/lex_rule3.py new file mode 100644 index 000000000..9ea94da2f --- /dev/null +++ b/test/lex_rule3.py @@ -0,0 +1,27 @@ +# lex_rule3.py +# +# Rule function with incorrect number of arguments + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +def t_NUMBER(t,s): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_state1.py b/test/lex_state1.py new file mode 100644 index 000000000..7528c9154 --- /dev/null +++ b/test/lex_state1.py @@ -0,0 +1,40 @@ +# lex_state1.py +# +# Bad state declaration + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = 'comment' + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_state2.py b/test/lex_state2.py new file mode 100644 index 000000000..3aef69ea2 --- /dev/null +++ b/test/lex_state2.py @@ -0,0 +1,40 @@ +# lex_state2.py +# +# Bad state declaration + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = ('comment','example') + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_state3.py b/test/lex_state3.py new file mode 100644 index 000000000..616e48474 --- /dev/null +++ b/test/lex_state3.py @@ -0,0 +1,42 @@ +# lex_state3.py +# +# Bad state declaration + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +comment = 1 +states = ((comment, 'inclusive'), + ('example', 'exclusive')) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_state4.py b/test/lex_state4.py new file mode 100644 index 000000000..182501614 --- /dev/null +++ b/test/lex_state4.py @@ -0,0 +1,41 @@ +# lex_state4.py +# +# Bad state declaration + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + + +states = (('comment', 'exclsive'),) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/test/lex_state5.py b/test/lex_state5.py new file mode 100644 index 000000000..4ce828e4f --- /dev/null +++ b/test/lex_state5.py @@ -0,0 +1,40 @@ +# lex_state5.py +# +# Bad state declaration + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = (('comment', 'exclusive'), + ('comment', 'exclusive')) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + +lex.lex() + + diff --git a/test/lex_state_noerror.py b/test/lex_state_noerror.py new file mode 100644 index 000000000..90bbea878 --- /dev/null +++ b/test/lex_state_noerror.py @@ -0,0 +1,39 @@ +# lex_state_noerror.py +# +# Declaration of a state for which no rules are defined + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = (('comment', 'exclusive'),) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + +lex.lex() + + diff --git a/test/lex_state_norule.py b/test/lex_state_norule.py new file mode 100644 index 000000000..64ec6d3ec --- /dev/null +++ b/test/lex_state_norule.py @@ -0,0 +1,40 @@ +# lex_state_norule.py +# +# Declaration of a state for which no rules are defined + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = (('comment', 'exclusive'), + ('example', 'exclusive')) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + + +lex.lex() + + diff --git a/test/lex_state_try.py b/test/lex_state_try.py new file mode 100644 index 000000000..fd5ba2221 --- /dev/null +++ b/test/lex_state_try.py @@ -0,0 +1,45 @@ +# lex_state_try.py +# +# Declaration of a state for which no rules are defined + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +states = (('comment', 'exclusive'),) + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +t_ignore = " \t" + +# Comments +def t_comment(t): + r'/\*' + t.lexer.begin('comment') + print("Entering comment state") + +def t_comment_body_part(t): + r'(.|\n)*\*/' + print("comment body %s" % t) + t.lexer.begin('INITIAL') + +def t_error(t): + pass + +t_comment_error = t_error +t_comment_ignore = t_ignore + +lex.lex() + +data = "3 + 4 /* This is a comment */ + 10" + +lex.runmain(data=data) diff --git a/test/lex_token1.py b/test/lex_token1.py new file mode 100644 index 000000000..6fca300b1 --- /dev/null +++ b/test/lex_token1.py @@ -0,0 +1,19 @@ +# lex_token1.py +# +# Tests for absence of tokens variable + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_token2.py b/test/lex_token2.py new file mode 100644 index 000000000..6e65ab0f9 --- /dev/null +++ b/test/lex_token2.py @@ -0,0 +1,22 @@ +# lex_token2.py +# +# Tests for tokens of wrong type + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = "PLUS MINUS NUMBER" + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(t): + pass + + +lex.lex() + + diff --git a/test/lex_token3.py b/test/lex_token3.py new file mode 100644 index 000000000..636452ea4 --- /dev/null +++ b/test/lex_token3.py @@ -0,0 +1,24 @@ +# lex_token3.py +# +# tokens is right type, but is missing a token for one rule + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_token4.py b/test/lex_token4.py new file mode 100644 index 000000000..52947e9cc --- /dev/null +++ b/test/lex_token4.py @@ -0,0 +1,26 @@ +# lex_token4.py +# +# Bad token name + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "-", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +t_NUMBER = r'\d+' + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/lex_token5.py b/test/lex_token5.py new file mode 100644 index 000000000..ef7a3c502 --- /dev/null +++ b/test/lex_token5.py @@ -0,0 +1,31 @@ +# lex_token5.py +# +# Return a bad token name + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' + +def t_NUMBER(t): + r'\d+' + t.type = "NUM" + return t + +def t_error(t): + pass + +lex.lex() +lex.input("1234") +t = lex.token() + + diff --git a/test/lex_token_dup.py b/test/lex_token_dup.py new file mode 100644 index 000000000..384f4e9db --- /dev/null +++ b/test/lex_token_dup.py @@ -0,0 +1,29 @@ +# lex_token_dup.py +# +# Duplicate token name in tokens + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + "MINUS" + ] + +t_PLUS = r'\+' +t_MINUS = r'-' + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/test/pkg_test1/__init__.py b/test/pkg_test1/__init__.py new file mode 100644 index 000000000..0e195589e --- /dev/null +++ b/test/pkg_test1/__init__.py @@ -0,0 +1,9 @@ +# Tests proper handling of lextab and parsetab files in package structures + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +from .parsing.calcparse import parser + diff --git a/test/pkg_test1/parsing/__init__.py b/test/pkg_test1/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test1/parsing/calclex.py b/test/pkg_test1/parsing/calclex.py new file mode 100644 index 000000000..b3c1a4d6b --- /dev/null +++ b/test/pkg_test1/parsing/calclex.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lexer = lex.lex(optimize=True) + + + diff --git a/test/pkg_test1/parsing/calcparse.py b/test/pkg_test1/parsing/calcparse.py new file mode 100644 index 000000000..c058e9f77 --- /dev/null +++ b/test/pkg_test1/parsing/calcparse.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# ----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + 
names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +parser = yacc.yacc() + + + + + diff --git a/test/pkg_test2/__init__.py b/test/pkg_test2/__init__.py new file mode 100644 index 000000000..0e195589e --- /dev/null +++ b/test/pkg_test2/__init__.py @@ -0,0 +1,9 @@ +# Tests proper handling of lextab and parsetab files in package structures + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +from .parsing.calcparse import parser + diff --git a/test/pkg_test2/parsing/__init__.py b/test/pkg_test2/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test2/parsing/calclex.py b/test/pkg_test2/parsing/calclex.py new file mode 100644 index 000000000..789e13f86 --- /dev/null +++ b/test/pkg_test2/parsing/calclex.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lexer = lex.lex(optimize=True, lextab='calclextab') + + + diff --git a/test/pkg_test2/parsing/calcparse.py b/test/pkg_test2/parsing/calcparse.py new file mode 100644 index 000000000..f5193389b --- /dev/null +++ b/test/pkg_test2/parsing/calcparse.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# ----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS 
expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +parser = yacc.yacc(tabmodule='calcparsetab') + + + + + diff --git a/test/pkg_test3/__init__.py b/test/pkg_test3/__init__.py new file mode 100644 index 000000000..0e195589e --- /dev/null +++ b/test/pkg_test3/__init__.py @@ -0,0 +1,9 @@ +# Tests proper handling of lextab and parsetab files in package structures + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +from .parsing.calcparse import parser + diff --git a/test/pkg_test3/generated/__init__.py b/test/pkg_test3/generated/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test3/parsing/__init__.py b/test/pkg_test3/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test3/parsing/calclex.py b/test/pkg_test3/parsing/calclex.py new file mode 100644 index 000000000..6ca2c4f3c --- /dev/null +++ b/test/pkg_test3/parsing/calclex.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lexer = lex.lex(optimize=True, lextab='pkg_test3.generated.lextab') + + + diff --git a/test/pkg_test3/parsing/calcparse.py b/test/pkg_test3/parsing/calcparse.py new file mode 100644 index 000000000..2dcb52b3c --- /dev/null +++ b/test/pkg_test3/parsing/calcparse.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# ----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + 
('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +parser = yacc.yacc(tabmodule='pkg_test3.generated.parsetab') + + + + + diff --git a/test/pkg_test4/__init__.py b/test/pkg_test4/__init__.py new file mode 100644 index 000000000..ba9ddacf6 --- /dev/null +++ b/test/pkg_test4/__init__.py @@ -0,0 +1,25 @@ +# Tests proper handling of lextab and parsetab files in package structures +# Check of warning messages when files aren't writable + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +import ply.lex +import ply.yacc + +def patched_open(filename, mode): + if 'w' in mode: + raise IOError("Permission denied %r" % filename) + return open(filename, mode) + +ply.lex.open = patched_open +ply.yacc.open = patched_open +try: + from .parsing.calcparse import parser +finally: + del ply.lex.open + del ply.yacc.open + + diff --git a/test/pkg_test4/parsing/__init__.py b/test/pkg_test4/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test4/parsing/calclex.py b/test/pkg_test4/parsing/calclex.py new file mode 100644 index 000000000..b3c1a4d6b --- /dev/null +++ b/test/pkg_test4/parsing/calclex.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lexer = lex.lex(optimize=True) + + + diff --git a/test/pkg_test4/parsing/calcparse.py b/test/pkg_test4/parsing/calcparse.py new file mode 100644 index 000000000..c058e9f77 --- /dev/null +++ b/test/pkg_test4/parsing/calcparse.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# 
----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +parser = yacc.yacc() + + + + + diff --git a/test/pkg_test5/__init__.py b/test/pkg_test5/__init__.py new file mode 100644 index 000000000..0e195589e --- /dev/null +++ b/test/pkg_test5/__init__.py @@ -0,0 +1,9 @@ +# Tests proper handling of lextab and parsetab files in package structures + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +from .parsing.calcparse import parser + diff --git a/test/pkg_test5/parsing/__init__.py b/test/pkg_test5/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test5/parsing/calclex.py b/test/pkg_test5/parsing/calclex.py new file mode 100644 index 000000000..e8759b6f0 --- /dev/null +++ b/test/pkg_test5/parsing/calclex.py @@ -0,0 +1,48 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import os.path +lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) + + + diff --git a/test/pkg_test5/parsing/calcparse.py b/test/pkg_test5/parsing/calcparse.py new file mode 100644 index 000000000..2a1ddfe19 --- /dev/null +++ b/test/pkg_test5/parsing/calcparse.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# ----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def 
p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +import os.path +parser = yacc.yacc(outputdir=os.path.dirname(__file__)) + + + + + diff --git a/test/pkg_test6/__init__.py b/test/pkg_test6/__init__.py new file mode 100644 index 000000000..5dbe0cbd1 --- /dev/null +++ b/test/pkg_test6/__init__.py @@ -0,0 +1,9 @@ +# Tests proper sorting of modules in yacc.ParserReflect.get_pfunctions + +# Here for testing purposes +import sys +if '..' 
not in sys.path: + sys.path.insert(0, '..') + +from .parsing.calcparse import parser + diff --git a/test/pkg_test6/parsing/__init__.py b/test/pkg_test6/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/pkg_test6/parsing/calclex.py b/test/pkg_test6/parsing/calclex.py new file mode 100644 index 000000000..e8759b6f0 --- /dev/null +++ b/test/pkg_test6/parsing/calclex.py @@ -0,0 +1,48 @@ +# ----------------------------------------------------------------------------- +# calclex.py +# ----------------------------------------------------------------------------- + +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import os.path +lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) + + + diff --git a/test/pkg_test6/parsing/calcparse.py b/test/pkg_test6/parsing/calcparse.py new file mode 100644 index 000000000..6defaf974 --- /dev/null +++ b/test/pkg_test6/parsing/calcparse.py @@ -0,0 +1,33 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specifier grammar +# ----------------------------------------------------------------------------- + +from .calclex import tokens +from ply import yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +from .statement import 
* + +from .expression import * + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +import os.path +parser = yacc.yacc(outputdir=os.path.dirname(__file__)) + + + + + diff --git a/test/pkg_test6/parsing/expression.py b/test/pkg_test6/parsing/expression.py new file mode 100644 index 000000000..028f66272 --- /dev/null +++ b/test/pkg_test6/parsing/expression.py @@ -0,0 +1,31 @@ +# This file contains definitions of expression grammar + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 diff --git a/test/pkg_test6/parsing/statement.py b/test/pkg_test6/parsing/statement.py new file mode 100644 index 000000000..ef7dc55e3 --- /dev/null +++ b/test/pkg_test6/parsing/statement.py @@ -0,0 +1,9 @@ +# This file contains definitions of statement grammar + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + t[0] = t[1] diff --git a/test/testlex.py b/test/testlex.py new file mode 100755 index 000000000..3880f6f93 --- /dev/null +++ b/test/testlex.py @@ -0,0 +1,660 @@ +# testlex.py + +import unittest +try: + import StringIO +except ImportError: + import io as StringIO + +import sys +import os +import warnings +import platform + +sys.path.insert(0,"..") +sys.tracebacklimit = 0 + +import ply.lex + +try: + from 
importlib.util import cache_from_source +except ImportError: + # Python 2.7, but we don't care. + cache_from_source = None + + +def make_pymodule_path(filename, optimization=None): + path = os.path.dirname(filename) + file = os.path.basename(filename) + mod, ext = os.path.splitext(file) + + if sys.hexversion >= 0x3050000: + fullpath = cache_from_source(filename, optimization=optimization) + elif sys.hexversion >= 0x3040000: + fullpath = cache_from_source(filename, ext=='.pyc') + elif sys.hexversion >= 0x3020000: + import imp + modname = mod+"."+imp.get_tag()+ext + fullpath = os.path.join(path,'__pycache__',modname) + else: + fullpath = filename + return fullpath + +def pymodule_out_exists(filename, optimization=None): + return os.path.exists(make_pymodule_path(filename, + optimization=optimization)) + +def pymodule_out_remove(filename, optimization=None): + os.remove(make_pymodule_path(filename, optimization=optimization)) + +def implementation(): + if platform.system().startswith("Java"): + return "Jython" + elif hasattr(sys, "pypy_version_info"): + return "PyPy" + else: + return "CPython" + +test_pyo = (implementation() == 'CPython') + +def check_expected(result, expected, contains=False): + if sys.version_info[0] >= 3: + if isinstance(result,str): + result = result.encode('ascii') + if isinstance(expected,str): + expected = expected.encode('ascii') + resultlines = result.splitlines() + expectedlines = expected.splitlines() + + if len(resultlines) != len(expectedlines): + return False + + for rline,eline in zip(resultlines,expectedlines): + if contains: + if eline not in rline: + return False + else: + if not rline.endswith(eline): + return False + return True + +def run_import(module): + code = "import "+module + exec(code) + del sys.modules[module] + +# Tests related to errors and warnings when building lexers +class LexErrorWarningTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + if 
sys.hexversion >= 0x3020000: + warnings.filterwarnings('ignore',category=ResourceWarning) + + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + def test_lex_doc1(self): + self.assertRaises(SyntaxError,run_import,"lex_doc1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_doc1.py:18: No regular expression defined for rule 't_NUMBER'\n")) + def test_lex_dup1(self): + self.assertRaises(SyntaxError,run_import,"lex_dup1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup1.py:20: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + + def test_lex_dup2(self): + self.assertRaises(SyntaxError,run_import,"lex_dup2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup2.py:22: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + + def test_lex_dup3(self): + self.assertRaises(SyntaxError,run_import,"lex_dup3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup3.py:20: Rule t_NUMBER redefined. 
Previously defined on line 18\n" )) + + def test_lex_empty(self): + self.assertRaises(SyntaxError,run_import,"lex_empty") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No rules of the form t_rulename are defined\n" + "No rules defined for state 'INITIAL'\n")) + + def test_lex_error1(self): + run_import("lex_error1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No t_error rule is defined\n")) + + def test_lex_error2(self): + self.assertRaises(SyntaxError,run_import,"lex_error2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Rule 't_error' must be defined as a function\n") + ) + + def test_lex_error3(self): + self.assertRaises(SyntaxError,run_import,"lex_error3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_error3.py:20: Rule 't_error' requires an argument\n")) + + def test_lex_error4(self): + self.assertRaises(SyntaxError,run_import,"lex_error4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_error4.py:20: Rule 't_error' has too many arguments\n")) + + def test_lex_ignore(self): + self.assertRaises(SyntaxError,run_import,"lex_ignore") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_ignore.py:20: Rule 't_ignore' must be defined as a string\n")) + + def test_lex_ignore2(self): + run_import("lex_ignore2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "t_ignore contains a literal backslash '\\'\n")) + + + def test_lex_re1(self): + self.assertRaises(SyntaxError,run_import,"lex_re1") + result = sys.stderr.getvalue() + if sys.hexversion < 0x3050000: + msg = "Invalid regular expression for rule 't_NUMBER'. unbalanced parenthesis\n" + else: + msg = "Invalid regular expression for rule 't_NUMBER'. 
missing ), unterminated subpattern at position 0" + self.assert_(check_expected(result, + msg, + contains=True)) + + def test_lex_re2(self): + self.assertRaises(SyntaxError,run_import,"lex_re2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Regular expression for rule 't_PLUS' matches empty string\n")) + + def test_lex_re3(self): + self.assertRaises(SyntaxError,run_import,"lex_re3") + result = sys.stderr.getvalue() +# self.assert_(check_expected(result, +# "Invalid regular expression for rule 't_POUND'. unbalanced parenthesis\n" +# "Make sure '#' in rule 't_POUND' is escaped with '\\#'\n")) + + if sys.hexversion < 0x3050000: + msg = ("Invalid regular expression for rule 't_POUND'. unbalanced parenthesis\n" + "Make sure '#' in rule 't_POUND' is escaped with '\\#'\n") + else: + msg = ("Invalid regular expression for rule 't_POUND'. missing ), unterminated subpattern at position 0\n" + "ERROR: Make sure '#' in rule 't_POUND' is escaped with '\#'") + self.assert_(check_expected(result, + msg, + contains=True), result) + + def test_lex_rule1(self): + self.assertRaises(SyntaxError,run_import,"lex_rule1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "t_NUMBER not defined as a function or string\n")) + + def test_lex_rule2(self): + self.assertRaises(SyntaxError,run_import,"lex_rule2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_rule2.py:18: Rule 't_NUMBER' requires an argument\n")) + + def test_lex_rule3(self): + self.assertRaises(SyntaxError,run_import,"lex_rule3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_rule3.py:18: Rule 't_NUMBER' has too many arguments\n")) + + + def test_lex_state1(self): + self.assertRaises(SyntaxError,run_import,"lex_state1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "states must be defined as a tuple or list\n")) + + def test_lex_state2(self): + 
self.assertRaises(SyntaxError,run_import,"lex_state2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid state specifier 'comment'. Must be a tuple (statename,'exclusive|inclusive')\n" + "Invalid state specifier 'example'. Must be a tuple (statename,'exclusive|inclusive')\n")) + + def test_lex_state3(self): + self.assertRaises(SyntaxError,run_import,"lex_state3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State name 1 must be a string\n" + "No rules defined for state 'example'\n")) + + def test_lex_state4(self): + self.assertRaises(SyntaxError,run_import,"lex_state4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State type for state comment must be 'inclusive' or 'exclusive'\n")) + + + def test_lex_state5(self): + self.assertRaises(SyntaxError,run_import,"lex_state5") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State 'comment' already defined\n")) + + def test_lex_state_noerror(self): + run_import("lex_state_noerror") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No error rule is defined for exclusive state 'comment'\n")) + + def test_lex_state_norule(self): + self.assertRaises(SyntaxError,run_import,"lex_state_norule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No rules defined for state 'example'\n")) + + def test_lex_token1(self): + self.assertRaises(SyntaxError,run_import,"lex_token1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No token list is defined\n" + "Rule 't_NUMBER' defined for an unspecified token NUMBER\n" + "Rule 't_PLUS' defined for an unspecified token PLUS\n" + "Rule 't_MINUS' defined for an unspecified token MINUS\n" +)) + + def test_lex_token2(self): + self.assertRaises(SyntaxError,run_import,"lex_token2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "tokens must be a list or tuple\n" + "Rule 't_NUMBER' 
defined for an unspecified token NUMBER\n" + "Rule 't_PLUS' defined for an unspecified token PLUS\n" + "Rule 't_MINUS' defined for an unspecified token MINUS\n" +)) + + def test_lex_token3(self): + self.assertRaises(SyntaxError,run_import,"lex_token3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Rule 't_MINUS' defined for an unspecified token MINUS\n")) + + + def test_lex_token4(self): + self.assertRaises(SyntaxError,run_import,"lex_token4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Bad token name '-'\n")) + + + def test_lex_token5(self): + try: + run_import("lex_token5") + except ply.lex.LexError: + e = sys.exc_info()[1] + self.assert_(check_expected(str(e),"lex_token5.py:19: Rule 't_NUMBER' returned an unknown token type 'NUM'")) + + def test_lex_token_dup(self): + run_import("lex_token_dup") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Token 'MINUS' multiply defined\n")) + + + def test_lex_literal1(self): + self.assertRaises(SyntaxError,run_import,"lex_literal1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid literal '**'. Must be a single character\n")) + + def test_lex_literal2(self): + self.assertRaises(SyntaxError,run_import,"lex_literal2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid literals specification. 
literals must be a sequence of characters\n")) + +import os +import subprocess +import shutil + +# Tests related to various build options associated with lexers +class LexBuildOptionTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + try: + shutil.rmtree("lexdir") + except OSError: + pass + + def test_lex_module(self): + run_import("lex_module") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + def test_lex_object(self): + run_import("lex_object") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + def test_lex_closure(self): + run_import("lex_closure") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + def test_lex_optimize(self): + try: + os.remove("lextab.py") + except OSError: + pass + try: + os.remove("lextab.pyc") + except OSError: + pass + try: + os.remove("lextab.pyo") + except OSError: + pass + run_import("lex_optimize") + + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lextab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_optimize.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("lextab.pyo", 1)) + pymodule_out_remove("lextab.pyo", 1) + + p = subprocess.Popen([sys.executable,'-OO','lex_optimize.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + if 
test_pyo: + self.assert_(pymodule_out_exists("lextab.pyo", 2)) + try: + os.remove("lextab.py") + except OSError: + pass + try: + pymodule_out_remove("lextab.pyc") + except OSError: + pass + try: + pymodule_out_remove("lextab.pyo", 2) + except OSError: + pass + + def test_lex_optimize2(self): + try: + os.remove("opt2tab.py") + except OSError: + pass + try: + os.remove("opt2tab.pyc") + except OSError: + pass + try: + os.remove("opt2tab.pyo") + except OSError: + pass + run_import("lex_optimize2") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("opt2tab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_optimize2.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("opt2tab.pyo", 1)) + pymodule_out_remove("opt2tab.pyo", 1) + p = subprocess.Popen([sys.executable,'-OO','lex_optimize2.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("opt2tab.pyo", 2)) + try: + os.remove("opt2tab.py") + except OSError: + pass + try: + pymodule_out_remove("opt2tab.pyc") + except OSError: + pass + try: + pymodule_out_remove("opt2tab.pyo", 2) + except OSError: + pass + + def test_lex_optimize3(self): + try: + shutil.rmtree("lexdir") + except OSError: + pass + + os.mkdir("lexdir") + os.mkdir("lexdir/sub") + open("lexdir/__init__.py","w").write("") + open("lexdir/sub/__init__.py","w").write("") + run_import("lex_optimize3") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lexdir/sub/calctab.py")) + + p = 
subprocess.Popen([sys.executable,'-O','lex_optimize3.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 1)) + pymodule_out_remove("lexdir/sub/calctab.pyo", 1) + + p = subprocess.Popen([sys.executable,'-OO','lex_optimize3.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 2)) + try: + shutil.rmtree("lexdir") + except OSError: + pass + + def test_lex_opt_alias(self): + try: + os.remove("aliastab.py") + except OSError: + pass + try: + os.remove("aliastab.pyc") + except OSError: + pass + try: + os.remove("aliastab.pyo") + except OSError: + pass + run_import("lex_opt_alias") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("aliastab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_opt_alias.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + if test_pyo: + self.assert_(pymodule_out_exists("aliastab.pyo", 1)) + pymodule_out_remove("aliastab.pyo", 1) + + p = subprocess.Popen([sys.executable,'-OO','lex_opt_alias.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + if test_pyo: + self.assert_(pymodule_out_exists("aliastab.pyo", 2)) + try: + os.remove("aliastab.py") + except OSError: + pass + try: + pymodule_out_remove("aliastab.pyc") + except OSError: + pass + try: + pymodule_out_remove("aliastab.pyo", 2) + except OSError: + pass + + def test_lex_many_tokens(self): + try: 
+ os.remove("manytab.py") + except OSError: + pass + try: + os.remove("manytab.pyc") + except OSError: + pass + try: + os.remove("manytab.pyo") + except OSError: + pass + run_import("lex_many_tokens") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(TOK34,'TOK34:',1,0)\n" + "(TOK143,'TOK143:',1,7)\n" + "(TOK269,'TOK269:',1,15)\n" + "(TOK372,'TOK372:',1,23)\n" + "(TOK452,'TOK452:',1,31)\n" + "(TOK561,'TOK561:',1,39)\n" + "(TOK999,'TOK999:',1,47)\n" + )) + + self.assert_(os.path.exists("manytab.py")) + + if implementation() == 'CPython': + p = subprocess.Popen([sys.executable,'-O','lex_many_tokens.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(TOK34,'TOK34:',1,0)\n" + "(TOK143,'TOK143:',1,7)\n" + "(TOK269,'TOK269:',1,15)\n" + "(TOK372,'TOK372:',1,23)\n" + "(TOK452,'TOK452:',1,31)\n" + "(TOK561,'TOK561:',1,39)\n" + "(TOK999,'TOK999:',1,47)\n" + )) + + self.assert_(pymodule_out_exists("manytab.pyo", 1)) + pymodule_out_remove("manytab.pyo", 1) + try: + os.remove("manytab.py") + except OSError: + pass + try: + os.remove("manytab.pyc") + except OSError: + pass + try: + os.remove("manytab.pyo") + except OSError: + pass + +# Tests related to run-time behavior of lexers +class LexRunTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + + def test_lex_hedit(self): + run_import("lex_hedit") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(H_EDIT_DESCRIPTOR,'abc',1,0)\n" + "(H_EDIT_DESCRIPTOR,'abcdefghij',1,6)\n" + "(H_EDIT_DESCRIPTOR,'xy',1,20)\n")) + + def test_lex_state_try(self): + run_import("lex_state_try") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,'3',1,0)\n" + "(PLUS,'+',1,2)\n" + "(NUMBER,'4',1,4)\n" + "Entering comment state\n" + "comment body LexToken(body_part,'This is a comment 
*/',1,9)\n" + "(PLUS,'+',1,30)\n" + "(NUMBER,'10',1,32)\n" + )) + + + +unittest.main() diff --git a/test/testyacc.py b/test/testyacc.py new file mode 100644 index 000000000..7e69f099d --- /dev/null +++ b/test/testyacc.py @@ -0,0 +1,452 @@ +# testyacc.py + +import unittest +try: + import StringIO +except ImportError: + import io as StringIO + +import sys +import os +import warnings +import re +import platform + +sys.path.insert(0,"..") +sys.tracebacklimit = 0 + +import ply.yacc + +def make_pymodule_path(filename): + path = os.path.dirname(filename) + file = os.path.basename(filename) + mod, ext = os.path.splitext(file) + + if sys.hexversion >= 0x3040000: + import importlib.util + fullpath = importlib.util.cache_from_source(filename, ext=='.pyc') + elif sys.hexversion >= 0x3020000: + import imp + modname = mod+"."+imp.get_tag()+ext + fullpath = os.path.join(path,'__pycache__',modname) + else: + fullpath = filename + return fullpath + +def pymodule_out_exists(filename): + return os.path.exists(make_pymodule_path(filename)) + +def pymodule_out_remove(filename): + os.remove(make_pymodule_path(filename)) + +def implementation(): + if platform.system().startswith("Java"): + return "Jython" + elif hasattr(sys, "pypy_version_info"): + return "PyPy" + else: + return "CPython" + +# Check the output to see if it contains all of a set of expected output lines. 
+# This alternate implementation looks weird, but is needed to properly handle +# some variations in error message order that occurs due to dict hash table +# randomization that was introduced in Python 3.3 +def check_expected(result, expected): + # Normalize 'state n' text to account for randomization effects in Python 3.3 + expected = re.sub(r' state \d+', 'state ', expected) + result = re.sub(r' state \d+', 'state ', result) + + resultlines = set() + for line in result.splitlines(): + if line.startswith("WARNING: "): + line = line[9:] + elif line.startswith("ERROR: "): + line = line[7:] + resultlines.add(line) + + # Selectively remove expected lines from the output + for eline in expected.splitlines(): + resultlines = set(line for line in resultlines if not line.endswith(eline)) + + # Return True if no result lines remain + return not bool(resultlines) + +def run_import(module): + code = "import "+module + exec(code) + del sys.modules[module] + +# Tests related to errors and warnings when building parsers +class YaccErrorWarningTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + try: + os.remove("parsetab.py") + pymodule_out_remove("parsetab.pyc") + except OSError: + pass + + if sys.hexversion >= 0x3020000: + warnings.filterwarnings('ignore', category=ResourceWarning) + warnings.filterwarnings('ignore', category=DeprecationWarning) + + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + def test_yacc_badargs(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badargs") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments\n" + "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument\n" + )) + def test_yacc_badid(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badid") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + 
"yacc_badid.py:32: Illegal name 'bad&rule' in rule 'statement'\n" + "yacc_badid.py:36: Illegal rule name 'bad&rule'\n" + )) + + def test_yacc_badprec(self): + try: + run_import("yacc_badprec") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "precedence must be a list or tuple\n" + )) + def test_yacc_badprec2(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badprec2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Bad precedence table\n" + )) + + def test_yacc_badprec3(self): + run_import("yacc_badprec3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Precedence already specified for terminal 'MINUS'\n" + "Generating LALR tables\n" + + )) + + def test_yacc_badrule(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badrule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_badrule.py:24: Syntax error. Expected ':'\n" + "yacc_badrule.py:28: Syntax error in rule 'statement'\n" + "yacc_badrule.py:33: Syntax error. Expected ':'\n" + "yacc_badrule.py:42: Syntax error. Expected ':'\n" + )) + + def test_yacc_badtok(self): + try: + run_import("yacc_badtok") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "tokens must be a list or tuple\n")) + + def test_yacc_dup(self): + run_import("yacc_dup") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_dup.py:27: Function p_statement redefined. 
Previously defined on line 23\n" + "Token 'EQUALS' defined, but not used\n" + "There is 1 unused token\n" + "Generating LALR tables\n" + + )) + def test_yacc_error1(self): + try: + run_import("yacc_error1") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error1.py:61: p_error() requires 1 argument\n")) + + def test_yacc_error2(self): + try: + run_import("yacc_error2") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error2.py:61: p_error() requires 1 argument\n")) + + def test_yacc_error3(self): + try: + run_import("yacc_error3") + except ply.yacc.YaccError: + e = sys.exc_info()[1] + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "'p_error' defined, but is not a function or method\n")) + + def test_yacc_error4(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_error4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error4.py:62: Illegal rule name 'error'. 
Already defined as a token\n" + )) + + + def test_yacc_error5(self): + run_import("yacc_error5") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "Group at 3:10 to 3:12\n" + "Undefined name 'a'\n" + "Syntax error at 'b'\n" + "Syntax error at 4:18 to 4:22\n" + "Assignment Error at 2:5 to 5:27\n" + "13\n" + )) + + def test_yacc_error6(self): + run_import("yacc_error6") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "a=7\n" + "Line 3: Syntax error at '*'\n" + "c=21\n" + )) + + def test_yacc_error7(self): + run_import("yacc_error7") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "a=7\n" + "Line 3: Syntax error at '*'\n" + "c=21\n" + )) + + def test_yacc_inf(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_inf") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Token 'NUMBER' defined, but not used\n" + "There is 1 unused token\n" + "Infinite recursion detected for symbol 'statement'\n" + "Infinite recursion detected for symbol 'expression'\n" + )) + def test_yacc_literal(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_literal") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_literal.py:36: Literal token '**' in rule 'expression' may only be a single character\n" + )) + def test_yacc_misplaced(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_misplaced") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_misplaced.py:32: Misplaced '|'\n" + )) + + def test_yacc_missing1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_missing1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule\n" + )) + + def test_yacc_nested(self): + run_import("yacc_nested") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "A\n" + "A\n" + "A\n", + )) + + def 
test_yacc_nodoc(self): + run_import("yacc_nodoc") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr' (ignored)\n" + "Generating LALR tables\n" + )) + + def test_yacc_noerror(self): + run_import("yacc_noerror") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "no p_error() function is defined\n" + "Generating LALR tables\n" + )) + + def test_yacc_nop(self): + run_import("yacc_nop") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_nop.py:27: Possible grammar rule 'statement_expr' defined without p_ prefix\n" + "Generating LALR tables\n" + )) + + def test_yacc_notfunc(self): + run_import("yacc_notfunc") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "'p_statement_assign' not defined as a function\n" + "Token 'EQUALS' defined, but not used\n" + "There is 1 unused token\n" + "Generating LALR tables\n" + )) + def test_yacc_notok(self): + try: + run_import("yacc_notok") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No token list is defined\n")) + + def test_yacc_rr(self): + run_import("yacc_rr") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + "1 reduce/reduce conflict\n" + "reduce/reduce conflict in state 15 resolved using rule (statement -> NAME EQUALS NUMBER)\n" + "rejected rule (expression -> NUMBER) in state 15\n" + + )) + + def test_yacc_rr_unused(self): + run_import("yacc_rr_unused") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "no p_error() function is defined\n" + "Generating LALR tables\n" + "3 reduce/reduce conflicts\n" + "reduce/reduce conflict in state 1 resolved using rule (rule3 -> A)\n" + "rejected rule (rule4 -> A) in state 1\n" + "reduce/reduce conflict in state 1 resolved using rule (rule3 -> A)\n" + "rejected rule (rule5 -> A) in 
state 1\n" + "reduce/reduce conflict in state 1 resolved using rule (rule4 -> A)\n" + "rejected rule (rule5 -> A) in state 1\n" + "Rule (rule5 -> A) is never reduced\n" + )) + + def test_yacc_simple(self): + run_import("yacc_simple") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + )) + + def test_yacc_sr(self): + run_import("yacc_sr") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + "20 shift/reduce conflicts\n" + )) + + def test_yacc_term1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_term1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token\n" + )) + + def test_yacc_unicode_literals(self): + run_import("yacc_unicode_literals") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + )) + + def test_yacc_unused(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_unused") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule\n" + "Symbol 'COMMA' is unreachable\n" + "Symbol 'exprlist' is unreachable\n" + )) + def test_yacc_unused_rule(self): + run_import("yacc_unused_rule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_unused_rule.py:62: Rule 'integer' defined, but not used\n" + "There is 1 unused rule\n" + "Symbol 'integer' is unreachable\n" + "Generating LALR tables\n" + )) + + def test_yacc_uprec(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_uprec.py:37: Nothing known about the precedence of 'UMINUS'\n" + )) + + def test_yacc_uprec2(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec2") + result = sys.stderr.getvalue() + 
self.assert_(check_expected(result, + "yacc_uprec2.py:37: Syntax error. Nothing follows %prec\n" + )) + + def test_yacc_prec1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_prec1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Precedence rule 'left' defined for unknown symbol '+'\n" + "Precedence rule 'left' defined for unknown symbol '*'\n" + "Precedence rule 'left' defined for unknown symbol '-'\n" + "Precedence rule 'left' defined for unknown symbol '/'\n" + )) + + def test_pkg_test1(self): + from pkg_test1 import parser + self.assertTrue(os.path.exists('pkg_test1/parsing/parsetab.py')) + self.assertTrue(os.path.exists('pkg_test1/parsing/lextab.py')) + self.assertTrue(os.path.exists('pkg_test1/parsing/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + + def test_pkg_test2(self): + from pkg_test2 import parser + self.assertTrue(os.path.exists('pkg_test2/parsing/calcparsetab.py')) + self.assertTrue(os.path.exists('pkg_test2/parsing/calclextab.py')) + self.assertTrue(os.path.exists('pkg_test2/parsing/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + + def test_pkg_test3(self): + from pkg_test3 import parser + self.assertTrue(os.path.exists('pkg_test3/generated/parsetab.py')) + self.assertTrue(os.path.exists('pkg_test3/generated/lextab.py')) + self.assertTrue(os.path.exists('pkg_test3/generated/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + + def test_pkg_test4(self): + from pkg_test4 import parser + self.assertFalse(os.path.exists('pkg_test4/parsing/parsetab.py')) + self.assertFalse(os.path.exists('pkg_test4/parsing/lextab.py')) + self.assertFalse(os.path.exists('pkg_test4/parsing/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + + def test_pkg_test5(self): + from pkg_test5 import parser + self.assertTrue(os.path.exists('pkg_test5/parsing/parsetab.py')) + self.assertTrue(os.path.exists('pkg_test5/parsing/lextab.py')) + 
self.assertTrue(os.path.exists('pkg_test5/parsing/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + + def test_pkg_test6(self): + from pkg_test6 import parser + self.assertTrue(os.path.exists('pkg_test6/parsing/parsetab.py')) + self.assertTrue(os.path.exists('pkg_test6/parsing/lextab.py')) + self.assertTrue(os.path.exists('pkg_test6/parsing/parser.out')) + r = parser.parse('3+4+5') + self.assertEqual(r, 12) + +unittest.main() diff --git a/test/yacc_badargs.py b/test/yacc_badargs.py new file mode 100644 index 000000000..9a1d03f2c --- /dev/null +++ b/test/yacc_badargs.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badargs.py +# +# Rules with wrong # args +# ----------------------------------------------------------------------------- +import sys +sys.tracebacklimit = 0 +sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t,s): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def 
p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_badid.py b/test/yacc_badid.py new file mode 100644 index 000000000..e4b9f5eeb --- /dev/null +++ b/test/yacc_badid.py @@ -0,0 +1,77 @@ +# ----------------------------------------------------------------------------- +# yacc_badid.py +# +# Attempt to define a rule with a bad-identifier name +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_statement_expr2(t): + 'statement : bad&rule' + pass + +def p_badrule(t): + 'bad&rule : expression' + pass + + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + pass + +yacc.yacc() + + + + diff --git a/test/yacc_badprec.py b/test/yacc_badprec.py new file mode 100644 index 000000000..3013bb621 --- /dev/null +++ b/test/yacc_badprec.py @@ -0,0 +1,64 @@ +# 
----------------------------------------------------------------------------- +# yacc_badprec.py +# +# Bad precedence specifier +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = "blah" + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_badprec2.py b/test/yacc_badprec2.py new file mode 100644 index 000000000..83093b42d --- /dev/null +++ b/test/yacc_badprec2.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badprec2.py +# +# Bad precedence +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + 42, + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_badprec3.py b/test/yacc_badprec3.py new file mode 100644 index 000000000..d925ecd55 --- /dev/null +++ b/test/yacc_badprec3.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badprec3.py +# +# Bad precedence +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE','MINUS'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[3] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_badrule.py b/test/yacc_badrule.py new file mode 100644 index 000000000..92af6460a --- /dev/null +++ b/test/yacc_badrule.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badrule.py +# +# Syntax problems in the rule strings +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression: MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_badtok.py b/test/yacc_badtok.py new file mode 100644 index 000000000..fc4afe19e --- /dev/null +++ b/test/yacc_badtok.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badtok.py +# +# A grammar, but tokens is a bad datatype +# ----------------------------------------------------------------------------- + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +tokens = "Hello" + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_dup.py b/test/yacc_dup.py new file mode 100644 index 000000000..309ba3299 --- /dev/null +++ b/test/yacc_dup.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_dup.py +# +# Duplicated rule name +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_error1.py b/test/yacc_error1.py new file mode 100644 index 000000000..10ac6a9cd --- /dev/null +++ b/test/yacc_error1.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_error1.py +# +# Bad p_error() function +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t,s): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_error2.py b/test/yacc_error2.py new file mode 100644 index 000000000..759141809 --- /dev/null +++ b/test/yacc_error2.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_error2.py +# +# Bad p_error() function +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_error3.py b/test/yacc_error3.py new file mode 100644 index 000000000..4604a48bf --- /dev/null +++ b/test/yacc_error3.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# yacc_error3.py +# +# Bad p_error() function +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +p_error = "blah" + +yacc.yacc() + + + + diff --git a/test/yacc_error4.py b/test/yacc_error4.py new file mode 100644 index 000000000..9c550cd83 --- /dev/null +++ b/test/yacc_error4.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# yacc_error4.py +# +# Attempt to define a rule named 'error' +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error_handler(t): + 'error : NAME' + pass + +def p_error(t): + pass + +yacc.yacc() + + + + diff --git a/test/yacc_error5.py b/test/yacc_error5.py new file mode 100644 index 000000000..9eb0f8574 --- /dev/null +++ b/test/yacc_error5.py @@ -0,0 +1,94 @@ +# ----------------------------------------------------------------------------- +# yacc_error5.py +# +# Lineno and position tracking with error tokens +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_assign_error(t): + 'statement : NAME EQUALS error' + line_start, line_end = t.linespan(3) + pos_start, pos_end = t.lexspan(3) + print("Assignment Error at %d:%d to %d:%d" % (line_start,pos_start,line_end,pos_end)) + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + line_start, line_end = t.linespan(2) + pos_start, pos_end = t.lexspan(2) + print("Group at %d:%d to %d:%d" % (line_start,pos_start, line_end, pos_end)) + t[0] = t[2] + +def p_expression_group_error(t): + 'expression : LPAREN error RPAREN' + line_start, line_end = t.linespan(2) + pos_start, pos_end = t.lexspan(2) + print("Syntax error at %d:%d to %d:%d" % (line_start,pos_start, line_end, pos_end)) + t[0] = 0 + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +parser = yacc.yacc() +import calclex +calclex.lexer.lineno=1 +parser.parse(""" +a = 3 + +(4*5) + +(a b c) + ++ 6 + 7 +""", tracking=True) + + + + + + diff --git 
a/test/yacc_error6.py b/test/yacc_error6.py new file mode 100644 index 000000000..8d0ec85be --- /dev/null +++ b/test/yacc_error6.py @@ -0,0 +1,80 @@ +# ----------------------------------------------------------------------------- +# yacc_error6.py +# +# Panic mode recovery test +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +def p_statements(t): + 'statements : statements statement' + pass + +def p_statements_1(t): + 'statements : statement' + pass + +def p_statement_assign(p): + 'statement : LPAREN NAME EQUALS expression RPAREN' + print("%s=%s" % (p[2],p[4])) + +def p_statement_expr(t): + 'statement : LPAREN expression RPAREN' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_error(p): + if p: + print("Line %d: Syntax error at '%s'" % (p.lineno, p.value)) + # Scan ahead looking for a name token + while True: + tok = parser.token() + if not tok or tok.type == 'RPAREN': + break + if tok: + parser.restart() + return None + +parser = yacc.yacc() +import calclex +calclex.lexer.lineno=1 + +parser.parse(""" +(a = 3 + 4) +(b = 4 + * 5 - 6 + *) +(c = 10 + 11) +""") + + + + + + diff --git a/test/yacc_error7.py b/test/yacc_error7.py new file mode 100644 index 000000000..fb131beab --- /dev/null +++ b/test/yacc_error7.py @@ -0,0 +1,80 @@ +# 
----------------------------------------------------------------------------- +# yacc_error7.py +# +# Panic mode recovery test using deprecated functionality +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +def p_statements(t): + 'statements : statements statement' + pass + +def p_statements_1(t): + 'statements : statement' + pass + +def p_statement_assign(p): + 'statement : LPAREN NAME EQUALS expression RPAREN' + print("%s=%s" % (p[2],p[4])) + +def p_statement_expr(t): + 'statement : LPAREN expression RPAREN' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_error(p): + if p: + print("Line %d: Syntax error at '%s'" % (p.lineno, p.value)) + # Scan ahead looking for a name token + while True: + tok = yacc.token() + if not tok or tok.type == 'RPAREN': + break + if tok: + yacc.restart() + return None + +parser = yacc.yacc() +import calclex +calclex.lexer.lineno=1 + +parser.parse(""" +(a = 3 + 4) +(b = 4 + * 5 - 6 + *) +(c = 10 + 11) +""") + + + + + + diff --git a/test/yacc_inf.py b/test/yacc_inf.py new file mode 100644 index 000000000..efd3612a1 --- /dev/null +++ b/test/yacc_inf.py @@ -0,0 +1,56 @@ +# ----------------------------------------------------------------------------- +# yacc_inf.py +# +# Infinite recursion +# 
----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_literal.py b/test/yacc_literal.py new file mode 100644 index 000000000..0d628035b --- /dev/null +++ b/test/yacc_literal.py @@ -0,0 +1,69 @@ +# ----------------------------------------------------------------------------- +# yacc_literal.py +# +# Grammar with bad literal characters +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression + | expression '**' expression ''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_misplaced.py b/test/yacc_misplaced.py new file mode 100644 index 000000000..9159b0109 --- /dev/null +++ b/test/yacc_misplaced.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_misplaced.py +# +# A misplaced | in grammar rules +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + ''' | expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_missing1.py b/test/yacc_missing1.py new file mode 100644 index 000000000..d1b510592 --- /dev/null +++ b/test/yacc_missing1.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_missing1.py +# +# Grammar with a missing rule +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : location EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_nested.py b/test/yacc_nested.py new file mode 100644 index 000000000..a1b061e78 --- /dev/null +++ b/test/yacc_nested.py @@ -0,0 +1,33 @@ +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") + +from ply import lex, yacc + +t_A = 'A' +t_B = 'B' +t_C = 'C' + +tokens = ('A', 'B', 'C') + +the_lexer = lex.lex() + +def t_error(t): + pass + +def p_error(p): + pass + +def p_start(t): + '''start : A nest C''' + pass + +def p_nest(t): + '''nest : B''' + print(t[-1]) + +the_parser = yacc.yacc(debug = False, write_tables = False) + +the_parser.parse('ABC', the_lexer) +the_parser.parse('ABC', the_lexer, tracking=True) +the_parser.parse('ABC', the_lexer, tracking=True, debug=1) diff --git a/test/yacc_nodoc.py b/test/yacc_nodoc.py new file mode 100644 index 000000000..0f61920ab --- /dev/null +++ b/test/yacc_nodoc.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# yacc_nodoc.py +# +# Rule with a missing doc-string +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined 
name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_noerror.py b/test/yacc_noerror.py new file mode 100644 index 000000000..b38c7581f --- /dev/null +++ b/test/yacc_noerror.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_noerror.py +# +# No p_error() rule defined. +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + + +yacc.yacc() + + + + diff --git a/test/yacc_nop.py b/test/yacc_nop.py new file mode 100644 index 000000000..789a9cfad --- /dev/null +++ b/test/yacc_nop.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_nop.py +# +# Possible grammar rule defined without p_ prefix +# 
----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_notfunc.py b/test/yacc_notfunc.py new file mode 100644 index 000000000..5093a7448 --- /dev/null +++ b/test/yacc_notfunc.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# yacc_notfunc.py +# +# p_rule not defined as a function +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +p_statement_assign = "Blah" + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_notok.py b/test/yacc_notok.py new file mode 100644 index 000000000..cff55a8d0 --- /dev/null +++ b/test/yacc_notok.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# yacc_notok.py +# +# A grammar, but we forgot to import the tokens list +# ----------------------------------------------------------------------------- + +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_prec1.py b/test/yacc_prec1.py new file mode 100644 index 000000000..99fcd903b --- /dev/null +++ b/test/yacc_prec1.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_prec1.py +# +# Tests case where precedence specifier doesn't match up to terminals +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left', '+', '-'), + ('left', '*', '/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_rr.py b/test/yacc_rr.py new file mode 100644 index 000000000..e7336c2f0 --- /dev/null +++ b/test/yacc_rr.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# yacc_rr.py +# +# A grammar with a reduce/reduce conflict +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_assign_2(t): + 'statement : NAME EQUALS NUMBER' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_rr_unused.py b/test/yacc_rr_unused.py new file mode 100644 index 000000000..1ca5f7e5b --- /dev/null +++ b/test/yacc_rr_unused.py @@ -0,0 +1,30 @@ +# ----------------------------------------------------------------------------- +# yacc_rr_unused.py +# +# A grammar with reduce/reduce conflicts and a rule that never +# gets reduced. +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +tokens = ('A', 'B', 'C') + +def p_grammar(p): + ''' + rule1 : rule2 B + | rule2 C + + rule2 : rule3 B + | rule4 + | rule5 + + rule3 : A + + rule4 : A + + rule5 : A + ''' + +yacc.yacc() diff --git a/test/yacc_simple.py b/test/yacc_simple.py new file mode 100644 index 000000000..bd989f4d6 --- /dev/null +++ b/test/yacc_simple.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_simple.py +# +# A simple, properly specified grammar +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_sr.py b/test/yacc_sr.py new file mode 100644 index 000000000..69a1e9c7f --- /dev/null +++
b/test/yacc_sr.py @@ -0,0 +1,63 @@ +# ----------------------------------------------------------------------------- +# yacc_sr.py +# +# A grammar with shift-reduce conflicts +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_term1.py b/test/yacc_term1.py new file mode 100644 index 000000000..eaa36e9d6 --- /dev/null +++ b/test/yacc_term1.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_term1.py +# +# Terminal used on the left-hand-side +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'NUMBER : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_unicode_literals.py b/test/yacc_unicode_literals.py new file mode 100644 index 000000000..5ae4f5b8a --- /dev/null +++ b/test/yacc_unicode_literals.py @@ -0,0 +1,70 @@ +# ----------------------------------------------------------------------------- +# yacc_unicode_literals +# +# Test for unicode literals on Python 2.x +# ----------------------------------------------------------------------------- +from __future__ import unicode_literals + +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_unused.py b/test/yacc_unused.py new file mode 100644 index 000000000..55b677b1f --- /dev/null +++ b/test/yacc_unused.py @@ -0,0 +1,77 @@ +# ----------------------------------------------------------------------------- +# yacc_unused.py +# +# A grammar with an unused rule +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_expr_list(t): + 'exprlist : exprlist COMMA expression' + pass + +def p_expr_list_2(t): + 'exprlist : expression' + pass + + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_unused_rule.py b/test/yacc_unused_rule.py new file mode 100644 index 000000000..4868ef863 --- /dev/null +++ b/test/yacc_unused_rule.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# yacc_unused_rule.py +# +# Grammar with an unused rule +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_integer(t): + 'integer : NUMBER' + t[0] = t[1] + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_uprec.py b/test/yacc_uprec.py new file mode 100644 index 000000000..569adb8f9 --- /dev/null +++ b/test/yacc_uprec.py @@ -0,0 +1,63 @@ +# ----------------------------------------------------------------------------- +# yacc_uprec.py +# +# A grammar with a bad %prec specifier +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/test/yacc_uprec2.py b/test/yacc_uprec2.py new file mode 100644 index 000000000..73274bfb6 --- /dev/null +++ b/test/yacc_uprec2.py @@ -0,0 +1,63 @@ +# ----------------------------------------------------------------------------- +# yacc_uprec2.py +# +# A grammar with a bad %prec specifier +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + +