improved error handling, and some cleanup

adam j hartz 2015-03-24 20:23:50 -04:00
parent 7ad7d81420
commit 34d2367768
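
The pattern throughout this diff replaces raise Exception(...) inside the token handlers with a yielded ERRORTOKEN, so lexing keeps going and the caller decides what to do with the problem. A minimal, self-contained sketch of that idea; the Token and _new_token names below are hypothetical stand-ins for the PLY-style token factory that lives further down in lexer.py:

    import tokenize
    from collections import namedtuple

    # Hypothetical stand-ins for the PLY-style token object and factory in lexer.py.
    Token = namedtuple("Token", ["type", "value", "start"])

    def _new_token(type_, value, start):
        return Token(type_, value, start)

    def handle_dollar_sketch(token, stream):
        # Old style: raise Exception("missing token after $")
        # New style: yield an ERRORTOKEN and return, so the stream stays usable.
        try:
            n = next(stream)
        except StopIteration:
            yield _new_token("ERRORTOKEN", "missing token after $", token.start)
            return
        if n.type == tokenize.NAME:
            yield _new_token("DOLLAR_NAME", "$" + n.string, token.start)
        else:
            m = "expected NAME after $, but got {0}".format(n)
            yield _new_token("ERRORTOKEN", m, token.start)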


@@ -67,14 +67,16 @@ def handle_dollar(state, token, stream):
     try:
         n = next(stream)
     except:
-        raise Exception("missing token after $")
+        m = "missing token after $"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.start != token.end:
-        raise Exception("unexpected whitespace after $")
+        m = "unexpected whitespace after $"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.type == tokenize.NAME:
+        state['last'] = n
         yield _new_token('DOLLAR_NAME', '$' + n.string, token.start)
-        state['last'] = token
     elif n.type == tokenize.OP and n.string == '(':
         state['pymode'].append(False)
         state['last'] = n
@@ -89,13 +91,15 @@ def handle_dollar(state, token, stream):
         yield _new_token('DOLLAR_LBRACE', '${', token.start)
     else:
         e = 'expected NAME, (, [, or {{ after $, but got {0}'
-        raise Exception(e.format(n))
+        m = e.format(n)
+        yield _new_token("ERRORTOKEN", m, token.start)

 def handle_at(state, token, stream):
     try:
         n = next(stream)
     except:
-        raise Exception("missing token after @")
+        m = "missing token after @"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.type == tokenize.OP and n.string == '(' and \
             n.start == token.end:
@@ -146,7 +150,8 @@ def handle_backtick(state, token, stream):
         state['last'] = n
     else:
         e = "Could not find matching backtick for regex on line {0}"
-        raise Exception(e.format(token.start[0]))
+        m = e.format(token.start[0])
+        yield _new_token("ERRORTOKEN", m, token.start)

 def handle_newline(state, token, stream):
     try:
@@ -192,6 +197,11 @@ def handle_rbracket(state, token, stream):
     state['last'] = token
     yield _new_token('RBRACKET', ']', token.start)

+def handle_error_space(state, token, stream):
+    if state['pymode'][-1]:
+        state['last'] = token
+        yield _new_token('WS', ' ', token.start)
+
 special_handlers = {
     tokenize.ENCODING: lambda s,t,st: [],
     tokenize.NEWLINE: handle_newline,
@@ -205,8 +215,9 @@ special_handlers = {
     (tokenize.ERRORTOKEN, '`'): handle_backtick,
     (tokenize.ERRORTOKEN, '?'): handle_question,
     (tokenize.OP, '@'): handle_at,
+    (tokenize.ERRORTOKEN, ' '): handle_error_space,
     tokenize.INDENT: handle_indent,
-    tokenize.DEDENT: handle_indent
+    tokenize.DEDENT: handle_indent,
 }

 def handle_token(state, token, stream):
@@ -234,7 +245,8 @@ def handle_token(state, token, stream):
         for i in special_handlers[typ](state, token, stream):
             yield i
     else:
-        raise Exception('Unexpected token: {0}'.format(token))
+        m = "Unexpected token: {0}".format(token)
+        yield _new_token("ERRORTOKEN", m, token.start)

 def preprocess_tokens(tokstream):
     tokstream = clear_NL(tokstream)
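
For reference, handle_token (shown in context above) looks handlers up in special_handlers, which is keyed by either a bare tokenize type or a (type, string) pair, and its unexpected-token branch now emits an ERRORTOKEN as well. A rough sketch of that table-driven dispatch, with illustrative names and lookup order rather than the exact lexer.py logic:

    import tokenize

    def _passthrough(state, token, stream):
        # Placeholder handler for the sketch: just re-emit the token.
        yield token

    # Keys are either a token type or a (type, string) pair, as in special_handlers.
    handlers_sketch = {
        tokenize.NEWLINE: _passthrough,
        (tokenize.OP, '@'): _passthrough,
        (tokenize.ERRORTOKEN, ' '): _passthrough,
    }

    def dispatch_sketch(state, token, stream):
        key = (token.type, token.string)
        handler = handlers_sketch.get(key, handlers_sketch.get(token.type))
        if handler is None:
            yield ("ERRORTOKEN", "Unexpected token: {0}".format(token), token.start)
        else:
            for out in handler(state, token, stream):
                yield out
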
@@ -248,9 +260,15 @@ def clear_NL(tokstream):
         if i.type != tokenize.NL:
             yield i

+def single_error(exc):
+    yield _new_token("ERRORTOKEN", "{} (line {}, column {})".format(exc.msg, exc.lineno, exc.offset), (0,0))
+
 from io import BytesIO

 def tok(s):
-    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))
+    try:
+        return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))
+    except Exception as e:
+        return iter(single_error(e))

 #synthesize a new PLY token
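
With the try/except around tokenize.tokenize above, a failure raised before any tokens are produced (for instance an unknown coding declaration, which detect_encoding rejects up front) is converted by single_error into a one-token ERRORTOKEN stream instead of propagating. A self-contained approximation, again using a hypothetical _new_token stand-in rather than the real PLY token factory:

    import tokenize
    from io import BytesIO

    def _new_token(type_, value, start):
        # Hypothetical stand-in; the real factory builds a PLY-style token.
        return (type_, value, start)

    def single_error(exc):
        msg = "{} (line {}, column {})".format(
            getattr(exc, "msg", exc), getattr(exc, "lineno", "?"), getattr(exc, "offset", "?"))
        yield _new_token("ERRORTOKEN", msg, (0, 0))

    def tok(s):
        try:
            return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))
        except Exception as e:
            return iter(single_error(e))

    # Eager failures are converted into a one-element stream; errors raised
    # later, while iterating the generator, are not caught by this wrapper.
    print(list(tok("# -*- coding: no-such-codec -*-\nx = 1\n")))
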
@@ -289,31 +307,13 @@ class Lexer(object):
         self.errfunc = errfunc
         self.fname = ''
         self.last = None
-        self.lexer = None
-        self.indent = ''
-        self.in_py_mode = [True]

     def build(self, **kwargs):
         """Part of the PLY lexer API."""
-        self.lexer = lex.lex(object=self, **kwargs)
-        self.reset()
+        pass

     def reset(self):
-        #self.lexer.lineno = 1
-        self.indent = ''
-        self.last = None
-        self.in_py_mode = [True]
-        self.in_parens = [False]
-
-    @property
-    def lineno(self):
-        if self.lexer is not None:
-            return self.lexer.lineno
-
-    @lineno.setter
-    def lineno(self, value):
-        if self.lexer is not None:
-            self.lexer.lineno = value
+        pass

     def input(self, s):
         """Calls the lexer on the string s."""
@@ -324,24 +324,11 @@ class Lexer(object):
         """Retrieves the next token."""
         try:
             self.last = next(self.token_stream)
-            #print(self.last)
+            print(self.last)
             return self.last
-        except:
+        except StopIteration:
             return None

-    def token_col(self, token):
-        """Discovers the token column number."""
-        offset = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
-        return token.lexpos - offset
-
-    def _error(self, msg, token):
-        location = self._make_tok_location(token)
-        self.errfunc(msg, location[0], location[1])
-        self.lexer.skip(1)
-
-    def _make_tok_location(self, token):
-        return (token.lineno, self.token_col(token))
-
     def __iter__(self):
         t = self.token()
         while t is not None:
@@ -407,278 +394,3 @@ class Lexer(object):
         # Ellipsis (...)
         'ELLIPSIS',
     )
#
# Token Regexes
#
identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'
dollar = r'\$'
int_literal = '\d+'
hex_literal = '0[xX][0-9a-fA-F]+'
oct_literal = '0[oO]?[0-7]+'
bin_literal = '0[bB]?[0-1]+'
# string literals
triple_single_string = r"'''((\\(.|\n))|([^'\\])|('(?!''))|\n)*'''"
triple_double_string = r'"""((\\(.|\n))|([^"\\])|("(?!""))|\n)*"""'
single_single_string = r"'((\\(.|\n))|([^'\\]))*'"
single_double_string = r'"((\\(.|\n))|([^"\\]))*"'
triple_string = anyof(triple_single_string, triple_double_string)
single_string = anyof(single_single_string, single_double_string)
string_literal = anyof(triple_string, single_string)
raw_string_literal = '[Rr]' + string_literal
unicode_literal = '[Uu]' + string_literal
bytes_literal = '[Bb]' + string_literal
# floating point
float_exponent = r"(?:[eE][-+]?[0-9]+)"
float_mantissa = r"(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.)"
float_literal = ('((((' + float_mantissa + ')' + float_exponent +
'?)|([0-9]+' + float_exponent + ')))')
imag_literal = '(' + r'[0-9]+[jJ]' + '|' + float_literal + r'[jJ]' + ')'
#
# Rules
#
# Command line
def t_INDENT(self, t):
r'[ \t]+'
last = self.last
if not self.in_py_mode[-1]:
return t
elif last is not None and last.type != 'NEWLINE':
return # returns None to skip internal whitespace
i = self.indent
v = t.value
if len(i) > len(v):
if not i.startswith(v):
self._error("indentation level does not match previous level", t)
t.type = 'DEDENT'
elif not v.startswith(i):
self._error("indentation level does not match previous level", t)
self.indent = v
t.lexer.lineno += 1
return t
t_ENDMARKER = r'\x03'
# Newlines
def t_NEWLINE(self, t):
r'\n'
if self.in_parens[-1]:
t.lexer.lineno += 1
return None
else:
return t
#
# Ignore internal whitespace based on parentherical scope
#
def t_AT_LPAREN(self, t):
r'@\('
self.in_parens.append(True)
self.in_py_mode.append(True)
return t
def t_DOLLAR_LPAREN(self, t):
r'\$\('
self.in_parens.append(True)
self.in_py_mode.append(False)
return t
def t_LPAREN(self, t):
r'\('
self.in_parens.append(True)
self.in_py_mode.append(True)
return t
def t_RPAREN(self, t):
r'\)'
self.in_parens.pop()
self.in_py_mode.pop()
return t
def t_DOLLAR_LBRACE(self, t):
r'\$\{'
self.in_parens.append(True)
self.in_py_mode.append(True)
return t
def t_LBRACE(self, t):
r'\{'
self.in_parens.append(True)
self.in_py_mode.append(True)
return t
def t_RBRACE(self, t):
r'\}'
self.in_parens.pop()
self.in_py_mode.pop()
return t
def t_DOLLAR_LBRACKET(self, t):
r'\$\['
self.in_parens.append(True)
self.in_py_mode.append(False)
return t
def t_LBRACKET(self, t):
r'\['
self.in_parens.append(True)
self.in_py_mode.append(True)
return t
def t_RBRACKET(self, t):
r'\]'
self.in_parens.pop()
self.in_py_mode.pop()
return t
# Basic Operators
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_DOUBLEDIV = r'//'
t_MOD = r'%'
t_POW = r'\*\*'
t_PIPE = r'\|'
t_AMPERSAND = r'&'
t_TILDE = r'~'
t_XOR = r'\^'
t_LSHIFT = r'<<'
t_RSHIFT = r'>>'
#t_LOGIC_OR = r'\|\|'
#t_LOGIC_AND = r'&&'
t_LT = r'<'
t_GT = r'>'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_NE = r'!='
#t_LARROW = r'<-'
t_RARROW = r'->'
# Assignment Operators
t_EQUALS = r'='
t_PLUSEQUAL = r'\+='
t_MINUSEQUAL = r'-='
t_TIMESEQUAL = r'\*='
t_DIVEQUAL = r'/='
t_MODEQUAL = r'%='
t_POWEQUAL = r'\*\*='
t_LSHIFTEQUAL = r'<<='
t_RSHIFTEQUAL = r'>>='
t_AMPERSANDEQUAL = r'&='
t_PIPEEQUAL = r'\|='
t_XOREQUAL = r'\^='
t_DOUBLEDIVEQUAL = r'//='
t_DOLLAR = dollar
t_REGEXPATH = r'`[^`]*`'
def t_DOUBLE_QUESTION(self, t):
r'\?\?'
return t
t_QUESTION = r'\?'
# Delimeters
#t_LPAREN = r'\('
#t_RPAREN = r'\)'
#t_LBRACKET = r'\['
#t_RBRACKET = r'\]'
#t_LBRACE = r'\{'
#t_RBRACE = r'\}'
t_COMMA = r','
t_PERIOD = r'\.'
t_SEMI = r';'
t_COLON = r':'
t_AT = r'@'
t_ELLIPSIS = r'\.\.\.'
def t_COMMENT(self, t):
r'\#.*'
return
#
# Literals
#
# strings, functions to ensure correct ordering
@TOKEN(string_literal)
def t_STRING_LITERAL(self, t):
return t
# float literal must come before int literals
@TOKEN(imag_literal)
def t_IMAG_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = eval(t.value)
return t
@TOKEN(float_literal)
def t_FLOAT_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = float(t.value)
return t
# ints, functions to ensure correct ordering
@TOKEN(hex_literal)
def t_HEX_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = int(t.value, 16)
return t
@TOKEN(oct_literal)
def t_OCT_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = int(t.value, 8)
return t
@TOKEN(bin_literal)
def t_BIN_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = int(t.value, 2)
return t
@TOKEN(int_literal)
def t_INT_LITERAL(self, t):
if self.in_py_mode[-1]:
t.value = int(t.value)
return t
def t_NONE(self, t):
r'None'
if self.in_py_mode[-1]:
t.value = None
return t
def t_TRUE(self, t):
r'True'
if self.in_py_mode[-1]:
t.value = True
return t
def t_FALSE(self, t):
r'False'
if self.in_py_mode[-1]:
t.value = False
return t
# Extra
@TOKEN(identifier)
def t_NAME(self, t):
if self.in_py_mode[-1] and t.value in self.pykeyword_map:
t.type = self.pykeyword_map[t.value]
return t
def t_error(self, t):
msg = 'Invalid token {0!r}'.format(t.value[0])
self._error(msg, t)
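
The large block removed above is the hand-written PLY machinery: the token regexes, the t_* rule methods, and the in_parens/in_py_mode bookkeeping they relied on. Those classifications (strings, numbers, operators, names, newlines, indentation) are what the stdlib tokenize module already produces, which is what the tok()/preprocess_tokens pipeline earlier in the file now leans on. A quick, standalone illustration of that coverage (not code from lexer.py):

    import tokenize
    from io import BytesIO

    src = "x = {'n': 0xff, 'f': 1.5e3, 's': r'''raw'''}\n"
    for t in tokenize.tokenize(BytesIO(src.encode('utf-8')).readline):
        # tokenize already labels NAME, OP, NUMBER, STRING, NEWLINE, etc.
        print(tokenize.tok_name[t.type], repr(t.string))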