diff --git a/xonsh/lexer.py b/xonsh/lexer.py
index 905877dca..a5b033848 100644
--- a/xonsh/lexer.py
+++ b/xonsh/lexer.py
@@ -67,14 +67,16 @@ def handle_dollar(state, token, stream):
     try:
         n = next(stream)
     except:
-        raise Exception("missing token after $")
+        m = "missing token after $"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.start != token.end:
-        raise Exception("unexpected whitespace after $")
+        m = "unexpected whitespace after $"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.type == tokenize.NAME:
+        state['last'] = n
         yield _new_token('DOLLAR_NAME', '$' + n.string, token.start)
-        state['last'] = token
     elif n.type == tokenize.OP and n.string == '(':
         state['pymode'].append(False)
         state['last'] = n
@@ -89,13 +91,15 @@ def handle_dollar(state, token, stream):
         yield _new_token('DOLLAR_LBRACE', '${', token.start)
     else:
         e = 'expected NAME, (, [, or {{ after $, but got {0}'
-        raise Exception(e.format(n))
+        m = e.format(n)
+        yield _new_token("ERRORTOKEN", m, token.start)

 def handle_at(state, token, stream):
     try:
         n = next(stream)
     except:
-        raise Exception("missing token after @")
+        m = "missing token after @"
+        yield _new_token("ERRORTOKEN", m, token.start)

     if n.type == tokenize.OP and n.string == '(' and \
             n.start == token.end:
@@ -146,7 +150,8 @@ def handle_backtick(state, token, stream):
         state['last'] = n
     else:
         e = "Could not find matching backtick for regex on line {0}"
-        raise Exception(e.format(token.start[0]))
+        m = e.format(token.start[0])
+        yield _new_token("ERRORTOKEN", m, token.start)

 def handle_newline(state, token, stream):
     try:
@@ -192,6 +197,11 @@ def handle_rbracket(state, token, stream):
     state['last'] = token
     yield _new_token('RBRACKET', ']', token.start)

+def handle_error_space(state, token, stream):
+    if state['pymode'][-1]:
+        state['last'] = token
+        yield _new_token('WS', ' ', token.start)
+
 special_handlers = {
     tokenize.ENCODING: lambda s,t,st: [],
     tokenize.NEWLINE: handle_newline,
@@ -205,8 +215,9 @@ special_handlers = {
     (tokenize.ERRORTOKEN, '`'): handle_backtick,
     (tokenize.ERRORTOKEN, '?'): handle_question,
     (tokenize.OP, '@'): handle_at,
+    (tokenize.ERRORTOKEN, ' '): handle_error_space,
     tokenize.INDENT: handle_indent,
-    tokenize.DEDENT: handle_indent
+    tokenize.DEDENT: handle_indent,
 }

 def handle_token(state, token, stream):
@@ -234,7 +245,8 @@ def handle_token(state, token, stream):
         for i in special_handlers[typ](state, token, stream):
             yield i
     else:
-        raise Exception('Unexpected token: {0}'.format(token))
+        m = "Unexpected token: {0}".format(token)
+        yield _new_token("ERRORTOKEN", m, token.start)

 def preprocess_tokens(tokstream):
     tokstream = clear_NL(tokstream)
@@ -248,9 +260,15 @@ def clear_NL(tokstream):
         if i.type != tokenize.NL:
             yield i

+def single_error(exc):
+    yield _new_token("ERRORTOKEN", "{} (line {}, column {})".format(exc.msg, exc.lineno, exc.offset), (0,0))
+
 from io import BytesIO
 def tok(s):
-    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))
+    try:
+        return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))
+    except Exception as e:
+        return iter(single_error(e))


 #synthesize a new PLY token
@@ -289,31 +307,13 @@ class Lexer(object):
         self.errfunc = errfunc
         self.fname = ''
         self.last = None
-        self.lexer = None
-        self.indent = ''
-        self.in_py_mode = [True]

     def build(self, **kwargs):
         """Part of the PLY lexer API."""
-        self.lexer = lex.lex(object=self, **kwargs)
-        self.reset()
+        pass

     def reset(self):
-        #self.lexer.lineno = 1
-        self.indent = ''
-        self.last = None
-        self.in_py_mode = [True]
-        self.in_parens = [False]
-
-    @property
-    def lineno(self):
-        if self.lexer is not None:
-            return self.lexer.lineno
-
-    @lineno.setter
-    def lineno(self, value):
-        if self.lexer is not None:
-            self.lexer.lineno = value
+        pass

     def input(self, s):
         """Calls the lexer on the string s."""
@@ -324,24 +324,11 @@ class Lexer(object):
         """Retrieves the next token."""
         try:
             self.last = next(self.token_stream)
-            #print(self.last)
+            print(self.last)
             return self.last
-        except:
+        except StopIteration:
             return None

-    def token_col(self, token):
-        """Discovers the token column number."""
-        offset = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
-        return token.lexpos - offset
-
-    def _error(self, msg, token):
-        location = self._make_tok_location(token)
-        self.errfunc(msg, location[0], location[1])
-        self.lexer.skip(1)
-
-    def _make_tok_location(self, token):
-        return (token.lineno, self.token_col(token))
-
     def __iter__(self):
         t = self.token()
         while t is not None:
@@ -407,278 +394,3 @@ class Lexer(object):
         # Ellipsis (...)
         'ELLIPSIS',
         )
-
-    #
-    # Token Regexes
-    #
-    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'
-    dollar = r'\$'
-
-    int_literal = '\d+'
-    hex_literal = '0[xX][0-9a-fA-F]+'
-    oct_literal = '0[oO]?[0-7]+'
-    bin_literal = '0[bB]?[0-1]+'
-
-    # string literals
-    triple_single_string = r"'''((\\(.|\n))|([^'\\])|('(?!''))|\n)*'''"
-    triple_double_string = r'"""((\\(.|\n))|([^"\\])|("(?!""))|\n)*"""'
-    single_single_string = r"'((\\(.|\n))|([^'\\]))*'"
-    single_double_string = r'"((\\(.|\n))|([^"\\]))*"'
-    triple_string = anyof(triple_single_string, triple_double_string)
-    single_string = anyof(single_single_string, single_double_string)
-    string_literal = anyof(triple_string, single_string)
-    raw_string_literal = '[Rr]' + string_literal
-    unicode_literal = '[Uu]' + string_literal
-    bytes_literal = '[Bb]' + string_literal
-
-    # floating point
-    float_exponent = r"(?:[eE][-+]?[0-9]+)"
-    float_mantissa = r"(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.)"
-    float_literal = ('((((' + float_mantissa + ')' + float_exponent +
-                     '?)|([0-9]+' + float_exponent + ')))')
-    imag_literal = '(' + r'[0-9]+[jJ]' + '|' + float_literal + r'[jJ]' + ')'
-
-    #
-    # Rules
-    #
-
-    # Command line
-    def t_INDENT(self, t):
-        r'[ \t]+'
-        last = self.last
-        if not self.in_py_mode[-1]:
-            return t
-        elif last is not None and last.type != 'NEWLINE':
-            return  # returns None to skip internal whitespace
-        i = self.indent
-        v = t.value
-        if len(i) > len(v):
-            if not i.startswith(v):
-                self._error("indentation level does not match previous level", t)
-            t.type = 'DEDENT'
-        elif not v.startswith(i):
-            self._error("indentation level does not match previous level", t)
-        self.indent = v
-        t.lexer.lineno += 1
-        return t
-
-    t_ENDMARKER = r'\x03'
-
-    # Newlines
-    def t_NEWLINE(self, t):
-        r'\n'
-        if self.in_parens[-1]:
-            t.lexer.lineno += 1
-            return None
-        else:
-            return t
-
-    #
-    # Ignore internal whitespace based on parentherical scope
-    #
-
-    def t_AT_LPAREN(self, t):
-        r'@\('
-        self.in_parens.append(True)
-        self.in_py_mode.append(True)
-        return t
-
-    def t_DOLLAR_LPAREN(self, t):
-        r'\$\('
-        self.in_parens.append(True)
-        self.in_py_mode.append(False)
-        return t
-
-    def t_LPAREN(self, t):
-        r'\('
-        self.in_parens.append(True)
-        self.in_py_mode.append(True)
-        return t
-
-    def t_RPAREN(self, t):
-        r'\)'
-        self.in_parens.pop()
-        self.in_py_mode.pop()
-        return t
-
-    def t_DOLLAR_LBRACE(self, t):
-        r'\$\{'
-        self.in_parens.append(True)
-        self.in_py_mode.append(True)
-        return t
-
-    def t_LBRACE(self, t):
-        r'\{'
-        self.in_parens.append(True)
-        self.in_py_mode.append(True)
-        return t
-
-    def t_RBRACE(self, t):
-        r'\}'
-        self.in_parens.pop()
-        self.in_py_mode.pop()
-        return t
-
-    def t_DOLLAR_LBRACKET(self, t):
-        r'\$\['
-        self.in_parens.append(True)
-        self.in_py_mode.append(False)
-        return t
-
-    def t_LBRACKET(self, t):
-        r'\['
-        self.in_parens.append(True)
-        self.in_py_mode.append(True)
-        return t
-
-    def t_RBRACKET(self, t):
-        r'\]'
-        self.in_parens.pop()
-        self.in_py_mode.pop()
-        return t
-
-    # Basic Operators
-    t_PLUS = r'\+'
-    t_MINUS = r'-'
-    t_TIMES = r'\*'
-    t_DIVIDE = r'/'
-    t_DOUBLEDIV = r'//'
-    t_MOD = r'%'
-    t_POW = r'\*\*'
-    t_PIPE = r'\|'
-    t_AMPERSAND = r'&'
-    t_TILDE = r'~'
-    t_XOR = r'\^'
-    t_LSHIFT = r'<<'
-    t_RSHIFT = r'>>'
-    #t_LOGIC_OR = r'\|\|'
-    #t_LOGIC_AND = r'&&'
-    t_LT = r'<'
-    t_GT = r'>'
-    t_LE = r'<='
-    t_GE = r'>='
-    t_EQ = r'=='
-    t_NE = r'!='
-    #t_LARROW = r'<-'
-    t_RARROW = r'->'
-
-    # Assignment Operators
-    t_EQUALS = r'='
-    t_PLUSEQUAL = r'\+='
-    t_MINUSEQUAL = r'-='
-    t_TIMESEQUAL = r'\*='
-    t_DIVEQUAL = r'/='
-    t_MODEQUAL = r'%='
-    t_POWEQUAL = r'\*\*='
-    t_LSHIFTEQUAL = r'<<='
-    t_RSHIFTEQUAL = r'>>='
-    t_AMPERSANDEQUAL = r'&='
-    t_PIPEEQUAL = r'\|='
-    t_XOREQUAL = r'\^='
-    t_DOUBLEDIVEQUAL = r'//='
-    t_DOLLAR = dollar
-    t_REGEXPATH = r'`[^`]*`'
-
-    def t_DOUBLE_QUESTION(self, t):
-        r'\?\?'
-        return t
-
-    t_QUESTION = r'\?'
-
-    # Delimeters
-    #t_LPAREN = r'\('
-    #t_RPAREN = r'\)'
-    #t_LBRACKET = r'\['
-    #t_RBRACKET = r'\]'
-    #t_LBRACE = r'\{'
-    #t_RBRACE = r'\}'
-    t_COMMA = r','
-    t_PERIOD = r'\.'
-    t_SEMI = r';'
-    t_COLON = r':'
-    t_AT = r'@'
-    t_ELLIPSIS = r'\.\.\.'
-
-    def t_COMMENT(self, t):
-        r'\#.*'
-        return
-
-    #
-    # Literals
-    #
-
-    # strings, functions to ensure correct ordering
-
-    @TOKEN(string_literal)
-    def t_STRING_LITERAL(self, t):
-        return t
-
-    # float literal must come before int literals
-
-    @TOKEN(imag_literal)
-    def t_IMAG_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = eval(t.value)
-        return t
-
-    @TOKEN(float_literal)
-    def t_FLOAT_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = float(t.value)
-        return t
-
-    # ints, functions to ensure correct ordering
-
-    @TOKEN(hex_literal)
-    def t_HEX_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = int(t.value, 16)
-        return t
-
-    @TOKEN(oct_literal)
-    def t_OCT_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = int(t.value, 8)
-        return t
-
-    @TOKEN(bin_literal)
-    def t_BIN_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = int(t.value, 2)
-        return t
-
-    @TOKEN(int_literal)
-    def t_INT_LITERAL(self, t):
-        if self.in_py_mode[-1]:
-            t.value = int(t.value)
-        return t
-
-    def t_NONE(self, t):
-        r'None'
-        if self.in_py_mode[-1]:
-            t.value = None
-        return t
-
-    def t_TRUE(self, t):
-        r'True'
-        if self.in_py_mode[-1]:
-            t.value = True
-        return t
-
-    def t_FALSE(self, t):
-        r'False'
-        if self.in_py_mode[-1]:
-            t.value = False
-        return t
-
-    # Extra
-    @TOKEN(identifier)
-    def t_NAME(self, t):
-        if self.in_py_mode[-1] and t.value in self.pykeyword_map:
-            t.type = self.pykeyword_map[t.value]
-        return t
-
-    def t_error(self, t):
-        msg = 'Invalid token {0!r}'.format(t.value[0])
-        self._error(msg, t)
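
Note on the pattern above: the patch stops raising bare exceptions from the
token handlers and instead yields synthetic PLY-style ERRORTOKEN tokens, so
lexing errors travel through the same token stream the parser already
consumes. Below is a minimal, self-contained sketch of that pattern; the
names (Token, new_token, next_string) are illustrative stand-ins, not code
from this patch:

    from collections import namedtuple

    # Stand-in for the PLY token that the patch's _new_token() synthesizes.
    Token = namedtuple('Token', ['type', 'value', 'lineno', 'lexpos'])

    def new_token(type_, value, pos):
        # pos is a (line, column) pair, like tokenize's token.start
        return Token(type_, value, pos[0], pos[1])

    def handle_dollar(pos, next_string=None):
        if next_string is None:
            # was: raise Exception("missing token after $")
            yield new_token('ERRORTOKEN', 'missing token after $', pos)
            return
        yield new_token('DOLLAR_NAME', '$' + next_string, pos)

    # The parser sees the failure as an ordinary token rather than a crash:
    print(list(handle_dollar((1, 0))))
    # [Token(type='ERRORTOKEN', value='missing token after $', lineno=1, lexpos=0)]

One design consequence: because the handlers are generators, yielding an
ERRORTOKEN does not stop the handler by itself (note the explicit return in
the sketch); the surrounding machinery, or the parser's error path via
errfunc, is what ultimately reports the failure.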