first go at using Python's tokenizer to help with parsing

adam j hartz 2015-03-24 14:04:47 -04:00
parent 1e76fd0765
commit 37e61cb96a
4 changed files with 243 additions and 50 deletions

View file

@@ -98,11 +98,13 @@ class CtxAwareTransformer(NodeTransformer):
    def try_subproc_toks(self, node):
        """Tries to parse the line of the node as a subprocess."""
        print('trying to run as subprocess')
        line = self.lines[node.lineno - 1]
        mincol = len(line) - len(line.lstrip())
        maxcol = None if self.mode == 'eval' else node.col_offset
        maxcol = None# if self.mode == 'eval' else node.col_offset
        spline = subproc_toks(line, mincol=mincol, maxcol=maxcol,
                              returnline=False, lexer=self.parser.lexer)
        print('spline',spline)
        try:
            newnode = self.parser.parse(spline, mode=self.mode)
            newnode = newnode.body

View file

@@ -1,9 +1,203 @@
from __future__ import print_function, unicode_literals
import re
import sys
import tokenize
from keyword import kwlist

from ply import lex
from ply.lex import TOKEN
from ply.lex import TOKEN, LexToken

# mapping from tokenize to PLY
# some keys are (type, name) tuples (for specific, e.g., keywords)
# some keys are just a type, for things like strings/names
# values are always a PLY token type
token_map = {}

# keywords
for kw in kwlist:
    token_map[(tokenize.NAME, kw)] = kw.upper()

#operators
op_map = {
    # punctuation
    '(': 'LPAREN', ')': 'RPAREN', '[': 'LBRACKET', ']': 'RBRACKET',
    '{': 'LBRACE', '}': 'RBRACE', ',': 'COMMA', '.': 'PERIOD', ';': 'SEMI',
    ':': 'COLON',
    #basic operators
    '+': 'PLUS', '-': 'MINUS', '*': 'TIMES', '/': 'DIVIDE',
    '//': 'DOUBLEDIV', '%': 'MOD', '**': 'POW', '|': 'PIPE',
    '&': 'AMPERSAND', '~': 'TILDE', '^': 'XOR', '<<': 'LSHIFT',
    '>>': 'RSHIFT', '<': 'LT', '<=': 'LE', '>': 'GT', '>=': 'GE',
    '==': 'EQ', '!=': 'NE', '->': 'RARROW',
    # assignment operators
    '=': 'EQUALS', '+=': 'PLUSEQUAL', '-=': 'MINUSEQUAL',
    '*=': 'TIMESEQUAL', '/=': 'DIVEQUAL', '%=': 'MODEQUAL',
    '**=': 'POWEQUAL', '<<=': 'LSHIFTEQUAL', '>>=': 'RSHIFTEQUAL',
    '&=': 'AMPERSANDEQUAL', '^=': 'XOREQUAL', '|=': 'PIPEEQUAL',
    '//=': 'DOUBLEDIVEQUAL',
}
for (op, type) in op_map.items():
    token_map[(tokenize.OP, op)] = type

token_map[tokenize.NAME] = 'NAME'
token_map[tokenize.NUMBER] = 'NUMBER'
token_map[tokenize.STRING] = 'STRING'
token_map[tokenize.ENDMARKER] = 'ENDMARKER'
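
(Aside, not part of the diff: the finished mapping is keyed two ways, specific tokens by (type, string) and whole categories by bare type. A cut-down rebuild, with tmap as a purely illustrative name, shows how lookups resolve.)

import tokenize
from keyword import kwlist

tmap = {(tokenize.NAME, kw): kw.upper() for kw in kwlist}  # keywords by (type, string)
tmap[(tokenize.OP, '+')] = 'PLUS'                          # operators by (type, string)
tmap[tokenize.NUMBER] = 'NUMBER'                           # whole categories by type alone

print(tmap[(tokenize.NAME, 'if')])  # IF
print(tmap[(tokenize.OP, '+')])     # PLUS
print(tmap[tokenize.NUMBER])        # NUMBER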
def handle_indent(state, token, stream):
    level = len(token.string)
    if token.type == tokenize.DEDENT:
        state['indents'].pop()
        yield _new_token(state, 'DEDENT', ' '*state['indents'][-1], token.start[0], token.start[1])
    elif token.type == tokenize.INDENT:
        #moving forward
        state['indents'].append(level)
        yield _new_token(state, 'INDENT', token.string, token.start[0], token.start[1])

def handle_dollar(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after $")
    if n.start != token.end:
        raise Exception("unexpected whitespace after $")
    if n.type == tokenize.NAME:
        yield _new_token(state, 'DOLLAR_NAME', '$' + n.string, token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '(':
        yield _new_token(state, 'DOLLAR_LPAREN', '$(', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '[':
        yield _new_token(state, 'DOLLAR_LBRACKET', '$[', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '{':
        yield _new_token(state, 'DOLLAR_LBRACE', '${', token.start[0], token.start[1])
    else:
        e = 'expected NAME, (, [, or {{ after $, but got {0}'
        raise Exception(e.format(n))

def handle_at(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after @")
    if n.type == tokenize.OP and n.string == '(' and \
            n.start == token.end:
        yield _new_token(state, 'AT_LPAREN', '@(', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'AT', '@', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_question(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    if n.type == tokenize.ERRORTOKEN and n.string == '?' and \
            n.start == token.end:
        yield _new_token(state, 'DOUBLE_QUESTION', '??', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'QUESTION', '?', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_backtick(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    found_match = False
    sofar = ''
    while n is not None:
        if n.type == tokenize.ERRORTOKEN and n.string == '`':
            found_match = True
            break
        else:
            sofar += n.string
        try:
            n = next(stream)
        except:
            n = None
    if found_match:
        yield _new_token(state, 'REGEXPATH', sofar, token.start[0], token.start[1])
    else:
        e = "Could not find matching backtick for regex on line {0}"
        raise Exception(e.format(token.start[0]))

def handle_newline(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None
    yield _new_token(state, 'NEWLINE', '\n', token.start[0], token.start[1])
    if n is not None:
        if n.type != tokenize.ENDMARKER:
            for i in handle_token(state, n, stream):
                yield i

special_handlers = {
    tokenize.ENCODING: lambda s,t,st: [],
    tokenize.NEWLINE: handle_newline,
    (tokenize.ERRORTOKEN, '$'): handle_dollar,
    (tokenize.ERRORTOKEN, '`'): handle_backtick,
    (tokenize.ERRORTOKEN, '?'): handle_question,
    (tokenize.OP, '@'): handle_at,
    tokenize.INDENT: handle_indent,
    tokenize.DEDENT: handle_indent
}
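
(Aside, not part of the diff: the ERRORTOKEN keys work because $, ? and ` are not legal Python tokens, so the stdlib tokenizer of this era reports them as ERRORTOKEN and the handlers above stitch them into xonsh-specific tokens. A standalone check:)

import tokenize
from io import BytesIO

for t in tokenize.tokenize(BytesIO(b'$HOME\n').readline):
    print(tokenize.tok_name[t.type], repr(t.string))
# on the Python 3.4-era tokenize targeted here, this prints roughly:
# ENCODING 'utf-8', ERRORTOKEN '$', NAME 'HOME', NEWLINE '\n', ENDMARKER ''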
def handle_token(state, token, stream):
    typ = token.type
    st = token.string
    print('trying', typ, st)
    if (typ, st) in token_map:
        yield _new_token(state, token_map[(typ, st)], st, token.start[0], token.start[1])
    elif typ in token_map:
        yield _new_token(state, token_map[typ], st, token.start[0], token.start[1])
    elif (typ, st) in special_handlers:
        for i in special_handlers[(typ, st)](state, token, stream):
            yield i
    elif typ in special_handlers:
        for i in special_handlers[typ](state, token, stream):
            yield i
    else:
        raise Exception('Unexpected token: {0}'.format(token))

def preprocess_tokens(tokstream):
    #tokstream = clear_NL(tokstream)
    state = {'indents': [0], 'lexpos': 0}
    for token in tokstream:
        for i in handle_token(state, token, tokstream):
            yield i

def clear_NL(tokstream):
    for i in tokstream:
        if i.type != tokenize.NL:
            yield i

from io import BytesIO

def tok(s):
    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))

#synthesize a new PLY token
def _new_token(state, type, value, lineno, col):
    o = LexToken()
    o.type = type
    o.value = value
    o.lineno = lineno
    o.lexpos = state['lexpos']
    o.col = col
    print('col',col)
    state['lexpos'] += 1
    return o

def anyof(*regexes):
    return '(' + '|'.join(regexes) + ')'
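
(Aside, not part of the diff: _new_token can build PLY tokens by hand because PLY only expects type, value, lineno and lexpos attributes on a token. A minimal illustration:)

from ply.lex import LexToken

t = LexToken()
t.type, t.value, t.lineno, t.lexpos = 'NAME', 'ls', 1, 0
print(t)  # LexToken(NAME,'ls',1,0)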
@@ -61,12 +255,17 @@ class Lexer(object):
    def input(self, s):
        """Calls the lexer on the string s."""
        self.lexer.input(s)
        print('code:\n',repr(s))
        self.token_stream = preprocess_tokens(tok(s))

    def token(self):
        """Retrieves the next token."""
        self.last = self.lexer.token()
        return self.last
        try:
            o = next(self.token_stream)
            print(o)
            return o
        except:
            return None

    def token_col(self, token):
        """Discovers the token column number."""
@@ -108,9 +307,7 @@ class Lexer(object):
        'NONE', 'TRUE', 'FALSE',

        # literals
        'INT_LITERAL', 'HEX_LITERAL', 'OCT_LITERAL', 'BIN_LITERAL',
        'FLOAT_LITERAL', 'IMAG_LITERAL', 'STRING_LITERAL',
        'RAW_STRING_LITERAL', 'BYTES_LITERAL', 'UNICODE_LITERAL',
        'NUMBER', 'STRING',

        # Basic Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'DOUBLEDIV', 'MOD', 'POW',
@@ -137,10 +334,10 @@ class Lexer(object):
        'COMMA', 'PERIOD', # . ,
        'SEMI', 'COLON', # ; :
        'AT', # @
        'DOLLAR', # $
        'QUESTION', # ?
        'DOUBLE_QUESTION', # ??
        'AT_LPAREN', # @(
        'DOLLAR_NAME', # $NAME
        'DOLLAR_LPAREN', # $(
        'DOLLAR_LBRACE', # ${
        'DOLLAR_LBRACKET', # $[
@@ -354,18 +551,6 @@ class Lexer(object):
    def t_STRING_LITERAL(self, t):
        return t

    @TOKEN(raw_string_literal)
    def t_RAW_STRING_LITERAL(self, t):
        return t

    @TOKEN(unicode_literal)
    def t_UNICODE_LITERAL(self, t):
        return t

    @TOKEN(bytes_literal)
    def t_BYTES_LITERAL(self, t):
        return t

    # float literal must come before int literals
    @TOKEN(imag_literal)

View file

@@ -711,6 +711,14 @@ class Parser(object):
        """
        p[0] = p[1]

    def p_stmt_list(self, p):
        """stmt_list : stmt
                     | stmt_list stmt"""
        if len(p) == 2:
            p[0] = p[1]
        else:
            p[0] = p[1] + p[2]

    def p_semi_opt(self, p):
        """semi_opt : SEMI
                    | empty
@@ -1199,11 +1207,9 @@ class Parser(object):
    def p_suite(self, p):
        """suite : simple_stmt
                 | NEWLINE indented_stmt DEDENT
                 | NEWLINE indented_stmt_list
                 | NEWLINE indented_stmt_list DEDENT
                 | NEWLINE INDENT stmt_list DEDENT
        """
        p[0] = p[1] if len(p) == 2 else p[2]
        p[0] = p[1] if len(p) == 2 else p[3]
        if len(p) < 4:
            self.lineno += 1 # needs to be at the end
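
(Aside, not part of the diff: the new NEWLINE INDENT stmt_list DEDENT alternative mirrors the token shape the stdlib tokenizer itself emits for an indented block, which is what makes it the natural suite rule here:)

import tokenize
from io import BytesIO

for t in tokenize.tokenize(BytesIO(b'if x:\n    y = 1\n').readline):
    print(tokenize.tok_name[t.type], repr(t.string))
# ... OP ':'  NEWLINE '\n'  INDENT '    '  NAME 'y'  ...  DEDENT ''  ENDMARKER ''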
@@ -1505,7 +1511,7 @@ class Parser(object):
                | TRUE
                | FALSE
                | REGEXPATH
                | DOLLAR NAME
                | DOLLAR_NAME
                | DOLLAR_LBRACE test RBRACE
                | DOLLAR_LPAREN subproc RPAREN
                | DOLLAR_LBRACKET subproc RBRACKET
@@ -1576,11 +1582,7 @@ class Parser(object):
        p[0] = p0

    def p_string_literal(self, p):
        """string_literal : STRING_LITERAL
                          | RAW_STRING_LITERAL
                          | UNICODE_LITERAL
                          | BYTES_LITERAL
        """
        """string_literal : STRING"""
        s = eval(p[1])
        cls = ast.Bytes if p[1].startswith('b') else ast.Str
        p[0] = cls(s=s, lineno=self.lineno, col_offset=self.col)
@@ -1594,14 +1596,8 @@ class Parser(object):
        p[0] = p[1]

    def p_number(self, p):
        """number : INT_LITERAL
                  | HEX_LITERAL
                  | OCT_LITERAL
                  | BIN_LITERAL
                  | FLOAT_LITERAL
                  | IMAG_LITERAL
        """
        p[0] = ast.Num(n=p[1], lineno=self.lineno, col_offset=self.col)
        """number : NUMBER"""
        p[0] = ast.Num(n=eval(p[1]), lineno=self.lineno, col_offset=self.col)

    def p_testlist_comp(self, p):
        """testlist_comp : test_or_star_expr comp_for
@@ -2009,9 +2005,11 @@ class Parser(object):
    def p_subproc(self, p):
        """subproc : subproc_atoms
                   | subproc_atoms INDENT
                   | subproc_atoms ENDMARKER
                   | subproc AMPERSAND
                   | subproc subproc_special subproc_atoms
                   | subproc subproc_special subproc_atoms INDENT
                   | subproc subproc_special subproc_atoms ENDMARKER
        """
        lineno = self.lineno
        col = self.col
@@ -2047,7 +2045,7 @@ class Parser(object):
        """subproc_atom : subproc_arg
                        | string_literal
                        | REGEXPATH
                        | DOLLAR NAME
                        | DOLLAR_NAME
                        | AT_LPAREN test RPAREN
                        | DOLLAR_LBRACE test RBRACE
                        | DOLLAR_LPAREN subproc RPAREN
@@ -2075,7 +2073,7 @@ class Parser(object):
            else:
                assert False
        elif lenp == 3:
            p0 = self._envvar_by_name(p[2], lineno=self.lineno, col=self.col)
            p0 = self._envvar_by_name(p[2][1:], lineno=self.lineno, col=self.col)
            p0._cliarg_action = 'ensure_list'
        elif p1 == '@(':
            l = self.lineno
@@ -2132,11 +2130,7 @@ class Parser(object):
                        | NONE
                        | TRUE
                        | FALSE
                        | INT_LITERAL
                        | HEX_LITERAL
                        | OCT_LITERAL
                        | BIN_LITERAL
                        | FLOAT_LITERAL
                        | NUMBER
        """
        # Many tokens cannot be part of this list, such as $, ', ", ()
        # Use a string atom instead.

View file

@@ -35,24 +35,33 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
    subprocess $[] starting at a minimum column. If there are no tokens
    (ie in a comment line) this returns None.
    """
    line = line if line.endswith('\n') else (line+'\n')
    if lexer is None:
        lexer = builtins.__xonsh_execer__.parser.lexer
    if maxcol is None:
        print(len(line))
        maxcol = len(line) + 1
        print(maxcol)
    print(len(line), repr(line), maxcol)
    lexer.reset()
    lexer.input(line)
    toks = []
    end_offset = 0
    for tok in lexer:
        pos = tok.lexpos
        print('TOKEN',tok)
        pos = tok.col
        if pos >= maxcol:
            print(pos,maxcol)
            print('too far')
            break
        if len(toks) > 0 and toks[-1].type == 'SEMI':
            print('semi')
            toks.clear()
        if pos < mincol:
            print('minicol')
            continue
        toks.append(tok)
        if tok.type == 'NEWLINE':
        if tok.type in ('NEWLINE', 'ENDMARKER'):
            break
    else:
        if len(toks) == 0:
@@ -60,7 +69,7 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
        if toks[-1].type == 'SEMI':
            toks.pop()
        tok = toks[-1]
        pos = tok.lexpos
        pos = tok.col
        if isinstance(tok.value, string_types):
            end_offset = len(tok.value)
        else:
@@ -68,7 +77,10 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
                end_offset = len(el)
    if len(toks) == 0:
        return # handle comment lines
    beg, end = toks[0].lexpos, (toks[-1].lexpos + end_offset)
    print(toks)
    beg, end = toks[0].col, (toks[-1].col + end_offset)
    print('LINE,BEG,END', (line,beg,end))
    print('LINE:',line[beg:end])
    rtn = '$[' + line[beg:end] + ']'
    if returnline:
        rtn = line[:beg] + rtn + line[end:]
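
(Aside, not part of the diff: the tail of subproc_toks is just column arithmetic, slicing out the span covered by the captured tokens and wrapping it in $[...]. In isolation, with made-up begin/end columns:)

line = '    ls -l\n'
beg, end = 4, 9  # pretend columns of the first and last captured tokens
wrapped = '$[' + line[beg:end] + ']'
print(wrapped)                            # $[ls -l]
print(line[:beg] + wrapped + line[end:])  # what returnline=True splices back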