first go at using Python's tokenizer to help with parsing

adam j hartz 2015-03-24 14:04:47 -04:00
parent 1e76fd0765
commit 37e61cb96a
4 changed files with 243 additions and 50 deletions

View file

@@ -98,11 +98,13 @@ class CtxAwareTransformer(NodeTransformer):
    def try_subproc_toks(self, node):
        """Tries to parse the line of the node as a subprocess."""
        print('trying to run as subprocess')
        line = self.lines[node.lineno - 1]
        mincol = len(line) - len(line.lstrip())
        maxcol = None if self.mode == 'eval' else node.col_offset
        maxcol = None# if self.mode == 'eval' else node.col_offset
        spline = subproc_toks(line, mincol=mincol, maxcol=maxcol,
                              returnline=False, lexer=self.parser.lexer)
        print('spline',spline)
        try:
            newnode = self.parser.parse(spline, mode=self.mode)
            newnode = newnode.body

View file

@@ -1,9 +1,203 @@
from __future__ import print_function, unicode_literals
import re
import sys
import tokenize
from keyword import kwlist

from ply import lex
from ply.lex import TOKEN
from ply.lex import TOKEN, LexToken

# mapping from tokenize to PLY
# some keys are (type, name) tuples (for specific, e.g., keywords)
# some keys are just a type, for things like strings/names
# values are always a PLY token type
token_map = {}

# keywords
for kw in kwlist:
    token_map[(tokenize.NAME, kw)] = kw.upper()

#operators
op_map = {
    # punctuation
    '(': 'LPAREN', ')': 'RPAREN', '[': 'LBRACKET', ']': 'RBRACKET',
    '{': 'LBRACE', '}': 'RBRACE', ',': 'COMMA', '.': 'PERIOD', ';': 'SEMI',
    ':': 'COLON',
    #basic operators
    '+': 'PLUS', '-': 'MINUS', '*': 'TIMES', '/': 'DIVIDE',
    '//': 'DOUBLEDIV', '%': 'MOD', '**': 'POW', '|': 'PIPE',
    '&': 'AMPERSAND', '~': 'TILDE', '^': 'XOR', '<<': 'LSHIFT',
    '>>': 'RSHIFT', '<': 'LT', '<=': 'LE', '>': 'GT', '>=': 'GE',
    '==': 'EQ', '!=': 'NE', '->': 'RARROW',
    # assignment operators
    '=': 'EQUALS', '+=': 'PLUSEQUAL', '-=': 'MINUSEQUAL',
    '*=': 'TIMESEQUAL', '/=': 'DIVEQUAL', '%=': 'MODEQUAL',
    '**=': 'POWEQUAL', '<<=': 'LSHIFTEQUAL', '>>=': 'RSHIFTEQUAL',
    '&=': 'AMPERSANDEQUAL', '^=': 'XOREQUAL', '|=': 'PIPEEQUAL',
    '//=': 'DOUBLEDIVEQUAL',
}
for (op, type) in op_map.items():
    token_map[(tokenize.OP, op)] = type

token_map[tokenize.NAME] = 'NAME'
token_map[tokenize.NUMBER] = 'NUMBER'
token_map[tokenize.STRING] = 'STRING'
token_map[tokenize.ENDMARKER] = 'ENDMARKER'
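
(Aside, not part of the diff: the finished mapping is keyed two ways, specific tokens by (type, string) and whole categories by bare type. A cut-down rebuild, with tmap as a purely illustrative name, shows how lookups resolve.)

import tokenize
from keyword import kwlist

tmap = {(tokenize.NAME, kw): kw.upper() for kw in kwlist}  # keywords by (type, string)
tmap[(tokenize.OP, '+')] = 'PLUS'                          # operators by (type, string)
tmap[tokenize.NUMBER] = 'NUMBER'                           # whole categories by type alone

print(tmap[(tokenize.NAME, 'if')])  # IF
print(tmap[(tokenize.OP, '+')])     # PLUS
print(tmap[tokenize.NUMBER])        # NUMBER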
def handle_indent(state, token, stream):
    level = len(token.string)
    if token.type == tokenize.DEDENT:
        state['indents'].pop()
        yield _new_token(state, 'DEDENT', ' '*state['indents'][-1], token.start[0], token.start[1])
    elif token.type == tokenize.INDENT:
        #moving forward
        state['indents'].append(level)
        yield _new_token(state, 'INDENT', token.string, token.start[0], token.start[1])

def handle_dollar(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after $")
    if n.start != token.end:
        raise Exception("unexpected whitespace after $")
    if n.type == tokenize.NAME:
        yield _new_token(state, 'DOLLAR_NAME', '$' + n.string, token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '(':
        yield _new_token(state, 'DOLLAR_LPAREN', '$(', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '[':
        yield _new_token(state, 'DOLLAR_LBRACKET', '$[', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '{':
        yield _new_token(state, 'DOLLAR_LBRACE', '${', token.start[0], token.start[1])
    else:
        e = 'expected NAME, (, [, or {{ after $, but got {0}'
        raise Exception(e.format(n))

def handle_at(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after @")
    if n.type == tokenize.OP and n.string == '(' and \
            n.start == token.end:
        yield _new_token(state, 'AT_LPAREN', '@(', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'AT', '@', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_question(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    if n.type == tokenize.ERRORTOKEN and n.string == '?' and \
            n.start == token.end:
        yield _new_token(state, 'DOUBLE_QUESTION', '??', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'QUESTION', '?', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_backtick(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    found_match = False
    sofar = ''
    while n is not None:
        if n.type == tokenize.ERRORTOKEN and n.string == '`':
            found_match = True
            break
        else:
            sofar += n.string
        try:
            n = next(stream)
        except:
            n = None
    if found_match:
        yield _new_token(state, 'REGEXPATH', sofar, token.start[0], token.start[1])
    else:
        e = "Could not find matching backtick for regex on line {0}"
        raise Exception(e.format(token.start[0]))

def handle_newline(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    yield _new_token(state, 'NEWLINE', '\n', token.start[0], token.start[1])
    if n is not None:
        if n.type != tokenize.ENDMARKER:
            for i in handle_token(state, n, stream):
                yield i

special_handlers = {
    tokenize.ENCODING: lambda s,t,st: [],
    tokenize.NEWLINE: handle_newline,
    (tokenize.ERRORTOKEN, '$'): handle_dollar,
    (tokenize.ERRORTOKEN, '`'): handle_backtick,
    (tokenize.ERRORTOKEN, '?'): handle_question,
    (tokenize.OP, '@'): handle_at,
    tokenize.INDENT: handle_indent,
    tokenize.DEDENT: handle_indent
}
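
(Aside, not part of the diff: the ERRORTOKEN keys work because $, ? and ` are not legal Python tokens, so the stdlib tokenizer of this era reports them as ERRORTOKEN and the handlers above stitch them into xonsh-specific tokens. A standalone check:)

import tokenize
from io import BytesIO

for t in tokenize.tokenize(BytesIO(b'$HOME\n').readline):
    print(tokenize.tok_name[t.type], repr(t.string))
# on the Python 3.4-era tokenize targeted here, this prints roughly:
# ENCODING 'utf-8', ERRORTOKEN '$', NAME 'HOME', NEWLINE '\n', ENDMARKER ''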
def handle_token(state, token, stream):
    typ = token.type
    st = token.string
    print('trying', typ, st)
    if (typ, st) in token_map:
        yield _new_token(state, token_map[(typ, st)], st, token.start[0], token.start[1])
    elif typ in token_map:
        yield _new_token(state, token_map[typ], st, token.start[0], token.start[1])
    elif (typ, st) in special_handlers:
        for i in special_handlers[(typ, st)](state, token, stream):
            yield i
    elif typ in special_handlers:
        for i in special_handlers[typ](state, token, stream):
            yield i
    else:
        raise Exception('Unexpected token: {0}'.format(token))

def preprocess_tokens(tokstream):
    #tokstream = clear_NL(tokstream)
    state = {'indents': [0], 'lexpos': 0}
    for token in tokstream:
        for i in handle_token(state, token, tokstream):
            yield i

def clear_NL(tokstream):
    for i in tokstream:
        if i.type != tokenize.NL:
            yield i

from io import BytesIO

def tok(s):
    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))

#synthesize a new PLY token
def _new_token(state, type, value, lineno, col):
    o = LexToken()
    o.type = type
    o.value = value
    o.lineno = lineno
    o.lexpos = state['lexpos']
    o.col = col
    print('col',col)
    state['lexpos'] += 1
    return o

def anyof(*regexes):
    return '(' + '|'.join(regexes) + ')'
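
(Aside, not part of the diff: _new_token can build PLY tokens by hand because PLY only expects type, value, lineno and lexpos attributes on a token. A minimal illustration:)

from ply.lex import LexToken

t = LexToken()
t.type, t.value, t.lineno, t.lexpos = 'NAME', 'ls', 1, 0
print(t)  # LexToken(NAME,'ls',1,0)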
@@ -61,12 +255,17 @@ class Lexer(object):
    def input(self, s):
        """Calls the lexer on the string s."""
        self.lexer.input(s)
        print('code:\n',repr(s))
        self.token_stream = preprocess_tokens(tok(s))

    def token(self):
        """Retrieves the next token."""
        self.last = self.lexer.token()
        return self.last
        try:
            o = next(self.token_stream)
            print(o)
            return o
        except:
            return None

    def token_col(self, token):
        """Discovers the token column number."""
@@ -108,9 +307,7 @@ class Lexer(object):
        'NONE', 'TRUE', 'FALSE',

        # literals
        'INT_LITERAL', 'HEX_LITERAL', 'OCT_LITERAL', 'BIN_LITERAL',
        'FLOAT_LITERAL', 'IMAG_LITERAL', 'STRING_LITERAL',
        'RAW_STRING_LITERAL', 'BYTES_LITERAL', 'UNICODE_LITERAL',
        'NUMBER', 'STRING',

        # Basic Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'DOUBLEDIV', 'MOD', 'POW',
@@ -137,10 +334,10 @@ class Lexer(object):
        'COMMA', 'PERIOD', # . ,
        'SEMI', 'COLON', # ; :
        'AT', # @
        'DOLLAR', # $
        'QUESTION', # ?
        'DOUBLE_QUESTION', # ??
        'AT_LPAREN', # @(
        'DOLLAR_NAME', # $NAME
        'DOLLAR_LPAREN', # $(
        'DOLLAR_LBRACE', # ${
        'DOLLAR_LBRACKET', # $[
@@ -354,18 +551,6 @@ class Lexer(object):
    def t_STRING_LITERAL(self, t):
        return t

    @TOKEN(raw_string_literal)
    def t_RAW_STRING_LITERAL(self, t):
        return t

    @TOKEN(unicode_literal)
    def t_UNICODE_LITERAL(self, t):
        return t

    @TOKEN(bytes_literal)
    def t_BYTES_LITERAL(self, t):
        return t

    # float literal must come before int literals
    @TOKEN(imag_literal)

View file

@@ -711,6 +711,14 @@ class Parser(object):
        """
        p[0] = p[1]

    def p_stmt_list(self, p):
        """stmt_list : stmt
                     | stmt_list stmt"""
        if len(p) == 2:
            p[0] = p[1]
        else:
            p[0] = p[1] + p[2]

    def p_semi_opt(self, p):
        """semi_opt : SEMI
                    | empty
@@ -1199,11 +1207,9 @@ class Parser(object):
    def p_suite(self, p):
        """suite : simple_stmt
                 | NEWLINE indented_stmt DEDENT
                 | NEWLINE indented_stmt_list
                 | NEWLINE indented_stmt_list DEDENT
                 | NEWLINE INDENT stmt_list DEDENT
        """
        p[0] = p[1] if len(p) == 2 else p[2]
        p[0] = p[1] if len(p) == 2 else p[3]
        if len(p) < 4:
            self.lineno += 1 # needs to be at the end
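
(Aside, not part of the diff: the new NEWLINE INDENT stmt_list DEDENT alternative mirrors the token shape the stdlib tokenizer itself emits for an indented block, which is what makes it the natural suite rule here:)

import tokenize
from io import BytesIO

for t in tokenize.tokenize(BytesIO(b'if x:\n    y = 1\n').readline):
    print(tokenize.tok_name[t.type], repr(t.string))
# ... OP ':'  NEWLINE '\n'  INDENT '    '  NAME 'y'  ...  DEDENT ''  ENDMARKER ''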
@@ -1505,7 +1511,7 @@ class Parser(object):
                | TRUE
                | FALSE
                | REGEXPATH
                | DOLLAR NAME
                | DOLLAR_NAME
                | DOLLAR_LBRACE test RBRACE
                | DOLLAR_LPAREN subproc RPAREN
                | DOLLAR_LBRACKET subproc RBRACKET
@@ -1576,11 +1582,7 @@ class Parser(object):
        p[0] = p0

    def p_string_literal(self, p):
        """string_literal : STRING_LITERAL
                          | RAW_STRING_LITERAL
                          | UNICODE_LITERAL
                          | BYTES_LITERAL
        """
        """string_literal : STRING"""
        s = eval(p[1])
        cls = ast.Bytes if p[1].startswith('b') else ast.Str
        p[0] = cls(s=s, lineno=self.lineno, col_offset=self.col)
@@ -1594,14 +1596,8 @@ class Parser(object):
        p[0] = p[1]

    def p_number(self, p):
        """number : INT_LITERAL
                  | HEX_LITERAL
                  | OCT_LITERAL
                  | BIN_LITERAL
                  | FLOAT_LITERAL
                  | IMAG_LITERAL
        """
        p[0] = ast.Num(n=p[1], lineno=self.lineno, col_offset=self.col)
        """number : NUMBER"""
        p[0] = ast.Num(n=eval(p[1]), lineno=self.lineno, col_offset=self.col)

    def p_testlist_comp(self, p):
        """testlist_comp : test_or_star_expr comp_for
@@ -2009,9 +2005,11 @@ class Parser(object):
    def p_subproc(self, p):
        """subproc : subproc_atoms
                   | subproc_atoms INDENT
                   | subproc_atoms ENDMARKER
                   | subproc AMPERSAND
                   | subproc subproc_special subproc_atoms
                   | subproc subproc_special subproc_atoms INDENT
                   | subproc subproc_special subproc_atoms ENDMARKER
        """
        lineno = self.lineno
        col = self.col
@@ -2047,7 +2045,7 @@ class Parser(object):
        """subproc_atom : subproc_arg
                        | string_literal
                        | REGEXPATH
                        | DOLLAR NAME
                        | DOLLAR_NAME
                        | AT_LPAREN test RPAREN
                        | DOLLAR_LBRACE test RBRACE
                        | DOLLAR_LPAREN subproc RPAREN
@@ -2075,7 +2073,7 @@ class Parser(object):
            else:
                assert False
        elif lenp == 3:
            p0 = self._envvar_by_name(p[2], lineno=self.lineno, col=self.col)
            p0 = self._envvar_by_name(p[2][1:], lineno=self.lineno, col=self.col)
            p0._cliarg_action = 'ensure_list'
        elif p1 == '@(':
            l = self.lineno
@@ -2132,11 +2130,7 @@ class Parser(object):
                        | NONE
                        | TRUE
                        | FALSE
                        | INT_LITERAL
                        | HEX_LITERAL
                        | OCT_LITERAL
                        | BIN_LITERAL
                        | FLOAT_LITERAL
                        | NUMBER
        """
        # Many tokens cannot be part of this list, such as $, ', ", ()
        # Use a string atom instead.

View file

@@ -35,24 +35,33 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
    subprocess $[] starting at a minimum column. If there are no tokens
    (ie in a comment line) this returns None.
    """
    line = line if line.endswith('\n') else (line+'\n')
    if lexer is None:
        lexer = builtins.__xonsh_execer__.parser.lexer
    if maxcol is None:
        print(len(line))
        maxcol = len(line) + 1
        print(maxcol)
    print(len(line), repr(line), maxcol)
    lexer.reset()
    lexer.input(line)
    toks = []
    end_offset = 0
    for tok in lexer:
        pos = tok.lexpos
        print('TOKEN',tok)
        pos = tok.col
        if pos >= maxcol:
            print(pos,maxcol)
            print('too far')
            break
        if len(toks) > 0 and toks[-1].type == 'SEMI':
            print('semi')
            toks.clear()
        if pos < mincol:
            print('minicol')
            continue
        toks.append(tok)
        if tok.type == 'NEWLINE':
        if tok.type in ('NEWLINE', 'ENDMARKER'):
            break
    else:
        if len(toks) == 0:
@@ -60,7 +69,7 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
        if toks[-1].type == 'SEMI':
            toks.pop()
        tok = toks[-1]
        pos = tok.lexpos
        pos = tok.col
        if isinstance(tok.value, string_types):
            end_offset = len(tok.value)
        else:
@@ -68,7 +77,10 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
                end_offset = len(el)
    if len(toks) == 0:
        return # handle comment lines
    beg, end = toks[0].lexpos, (toks[-1].lexpos + end_offset)
    print(toks)
    beg, end = toks[0].col, (toks[-1].col + end_offset)
    print('LINE,BEG,END', (line,beg,end))
    print('LINE:',line[beg:end])
    rtn = '$[' + line[beg:end] + ']'
    if returnline:
        rtn = line[:beg] + rtn + line[end:]
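
(Aside, not part of the diff: the tail of subproc_toks is just column arithmetic, slicing out the span covered by the captured tokens and wrapping it in $[...]. In isolation, with made-up begin/end columns:)

line = '    ls -l\n'
beg, end = 4, 9  # pretend columns of the first and last captured tokens
wrapped = '$[' + line[beg:end] + ']'
print(wrapped)                            # $[ls -l]
print(line[:beg] + wrapped + line[end:])  # what returnline=True splices back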