first go at using Python's tokenizer to help with parsing

parent 1e76fd0765
commit 37e61cb96a

4 changed files with 243 additions and 50 deletions
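The gist of the change: instead of driving PLY's regex lexer directly, source text is run through Python's own tokenize module and each resulting token is remapped to a PLY-style token. A rough sketch of that idea, not part of the commit (the demo() wrapper is made up; tok() mirrors the helper added to xonsh/lexer.py below):

# Illustrative sketch of the approach: tokenize a line with Python's
# stdlib tokenizer and inspect the (type, string) pairs that the new
# lexer remaps via its token_map.  demo() is a hypothetical helper.
import tokenize
from io import BytesIO

def tok(s):
    # same pattern as the tok() helper added in this commit
    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))

def demo(line):
    for t in tok(line):
        # tokenize yields TokenInfo tuples: type, string, start, end, line
        print(tokenize.tok_name[t.type], repr(t.string), t.start)

demo('x = 42\n')
# prints roughly:
#   ENCODING 'utf-8' (0, 0)
#   NAME 'x' (1, 0)
#   OP '=' (1, 2)
#   NUMBER '42' (1, 4)
#   NEWLINE '\n' (1, 6)
#   ENDMARKER '' (2, 0)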
@@ -98,11 +98,13 @@ class CtxAwareTransformer(NodeTransformer):
    def try_subproc_toks(self, node):
        """Tries to parse the line of the node as a subprocess."""
        print('trying to run as subprocess')
        line = self.lines[node.lineno - 1]
        mincol = len(line) - len(line.lstrip())
        maxcol = None if self.mode == 'eval' else node.col_offset
        maxcol = None# if self.mode == 'eval' else node.col_offset
        spline = subproc_toks(line, mincol=mincol, maxcol=maxcol,
                              returnline=False, lexer=self.parser.lexer)
        print('spline',spline)
        try:
            newnode = self.parser.parse(spline, mode=self.mode)
            newnode = newnode.body
xonsh/lexer.py (225 changed lines)
@@ -1,9 +1,203 @@
from __future__ import print_function, unicode_literals
import re
import sys
import tokenize

from keyword import kwlist

from ply import lex
from ply.lex import TOKEN
from ply.lex import TOKEN, LexToken

# mapping from tokenize to PLY
# some keys are (type, name) tuples (for specific, e.g., keywords)
# some keys are just a type, for things like strings/names
# values are always a PLY token type
token_map = {}

# keywords
for kw in kwlist:
    token_map[(tokenize.NAME, kw)] = kw.upper()

#operators
op_map = {
    # punctuation
    '(': 'LPAREN', ')': 'RPAREN', '[': 'LBRACKET', ']': 'RBRACKET',
    '{': 'LBRACE', '}': 'RBRACE', ',': 'COMMA', '.': 'PERIOD', ';': 'SEMI',
    ':': 'COLON',
    #basic operators
    '+': 'PLUS', '-': 'MINUS', '*': 'TIMES', '/': 'DIVIDE',
    '//': 'DOUBLEDIV', '%': 'MOD', '**': 'POW', '|': 'PIPE',
    '&': 'AMPERSAND', '~': 'TILDE', '^': 'XOR', '<<': 'LSHIFT',
    '>>': 'RSHIFT', '<': 'LT', '<=': 'LE', '>': 'GT', '>=': 'GE',
    '==': 'EQ', '!=': 'NE','->': 'RARROW',
    # assignment operators
    '=': 'EQUALS', '+=': 'PLUSEQUAL', '-=': 'MINUSEQUAL',
    '*=': 'TIMESEQUAL', '/=': 'DIVEQUAL', '%=': 'MODEQUAL',
    '**=': 'POWEQUAL', '<<=': 'LSHIFTEQUAL', '>>=': 'RSHIFTEQUAL',
    '&=': 'AMPERSANDEQUAL', '^=': 'XOREQUAL', '|=': 'PIPEEQUAL',
    '//=': 'DOUBLEDIVEQUAL',
}
for (op, type) in op_map.items():
    token_map[(tokenize.OP, op)] = type

token_map[tokenize.NAME] = 'NAME'
token_map[tokenize.NUMBER] = 'NUMBER'
token_map[tokenize.STRING] = 'STRING'
token_map[tokenize.ENDMARKER] = 'ENDMARKER'

def handle_indent(state, token, stream):
    level = len(token.string)
    if token.type == tokenize.DEDENT:
        state['indents'].pop()
        yield _new_token(state, 'DEDENT', ' '*state['indents'][-1], token.start[0], token.start[1])
    elif token.type == tokenize.INDENT:
        #moving forward
        state['indents'].append(level)
        yield _new_token(state, 'INDENT', token.string, token.start[0], token.start[1])

def handle_dollar(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after $")

    if n.start != token.end:
        raise Exception("unexpected whitespace after $")

    if n.type == tokenize.NAME:
        yield _new_token(state, 'DOLLAR_NAME', '$' + n.string, token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '(':
        yield _new_token(state, 'DOLLAR_LPAREN', '$(', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '[':
        yield _new_token(state, 'DOLLAR_LBRACKET', '$[', token.start[0], token.start[1])
    elif n.type == tokenize.OP and n.string == '{':
        yield _new_token(state, 'DOLLAR_LBRACE', '${', token.start[0], token.start[1])
    else:
        e = 'expected NAME, (, [, or {{ after $, but got {0}'
        raise Exception(e.format(n))

def handle_at(state, token, stream):
    try:
        n = next(stream)
    except:
        raise Exception("missing token after @")

    if n.type == tokenize.OP and n.string == '(' and \
            n.start == token.end:
        yield _new_token(state, 'AT_LPAREN', '@(', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'AT', '@', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_question(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None

    if n.type == tokenize.ERRORTOKEN and n.string == '?' and \
            n.start == token.end:
        yield _new_token(state, 'DOUBLE_QUESTION', '??', token.start[0], token.start[1])
    else:
        yield _new_token(state, 'QUESTION', '?', token.start[0], token.start[1])
        for i in handle_token(state, n, stream):
            yield i

def handle_backtick(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None

    found_match = False
    sofar = ''
    while n is not None:
        if n.type == tokenize.ERRORTOKEN and n.string == '`':
            found_match = True
            break
        else:
            sofar += n.string
        try:
            n = next(stream)
        except:
            n = None
    if found_match:
        yield _new_token(state, 'REGEXPATH', sofar, token.start[0], token.start[1])
    else:
        e = "Could not find matching backtick for regex on line {0}"
        raise Exception(e.format(token.start[0]))

def handle_newline(state, token, stream):
    try:
        n = next(stream)
    except:
        n = None

    yield _new_token(state, 'NEWLINE', '\n', token.start[0], token.start[1])

    if n is not None:
        if n.type != tokenize.ENDMARKER:
            for i in handle_token(state, n, stream):
                yield i


special_handlers = {
    tokenize.ENCODING: lambda s,t,st: [],
    tokenize.NEWLINE: handle_newline,
    (tokenize.ERRORTOKEN, '$'): handle_dollar,
    (tokenize.ERRORTOKEN, '`'): handle_backtick,
    (tokenize.ERRORTOKEN, '?'): handle_question,
    (tokenize.OP, '@'): handle_at,
    tokenize.INDENT: handle_indent,
    tokenize.DEDENT: handle_indent
}

def handle_token(state, token, stream):
    typ = token.type
    st = token.string
    print('trying', typ, st)
    if (typ, st) in token_map:
        yield _new_token(state, token_map[(typ, st)], st, token.start[0], token.start[1])
    elif typ in token_map:
        yield _new_token(state, token_map[typ], st, token.start[0], token.start[1])
    elif (typ, st) in special_handlers:
        for i in special_handlers[(typ, st)](state, token, stream):
            yield i
    elif typ in special_handlers:
        for i in special_handlers[typ](state, token, stream):
            yield i
    else:
        raise Exception('Unexpected token: {0}'.format(token))

def preprocess_tokens(tokstream):
    #tokstream = clear_NL(tokstream)
    state = {'indents': [0], 'lexpos': 0}
    for token in tokstream:
        for i in handle_token(state, token, tokstream):
            yield i

def clear_NL(tokstream):
    for i in tokstream:
        if i.type != tokenize.NL:
            yield i

from io import BytesIO
def tok(s):
    return iter(tokenize.tokenize(BytesIO(s.encode('utf-8')).readline))


#synthesize a new PLY token
def _new_token(state, type, value, lineno, col):
    o = LexToken()
    o.type = type
    o.value = value
    o.lineno = lineno
    o.lexpos = state['lexpos']
    o.col = col
    print('col',col)
    state['lexpos'] += 1
    return o

def anyof(*regexes):
    return '(' + '|'.join(regexes) + ')'
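A rough usage sketch of the helpers above (not part of the commit; it assumes the new module-level functions are importable from xonsh.lexer as added here):

# Illustrative only: feed a line through tok() and preprocess_tokens()
# to see the synthesized PLY LexTokens.  The import path is an assumption.
from xonsh.lexer import tok, preprocess_tokens

for t in preprocess_tokens(tok('x = 42\n')):
    # each t is a ply.lex.LexToken built by _new_token, with an extra .col
    print(t.type, repr(t.value), t.lineno, t.col)
# expected token types, roughly: NAME, EQUALS, NUMBER, NEWLINE
# (xonsh-specific characters such as $ or ` reach the stream as ERRORTOKENs
#  and are routed through special_handlers, e.g. '$HOME' -> DOLLAR_NAME)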
@@ -61,12 +255,17 @@ class Lexer(object):
    def input(self, s):
        """Calls the lexer on the string s."""
        self.lexer.input(s)
        print('code:\n',repr(s))
        self.token_stream = preprocess_tokens(tok(s))

    def token(self):
        """Retrieves the next token."""
        self.last = self.lexer.token()
        return self.last
        try:
            o = next(self.token_stream)
            print(o)
            return o
        except:
            return None

    def token_col(self, token):
        """Discovers the token column number."""
@@ -108,9 +307,7 @@ class Lexer(object):
        'NONE', 'TRUE', 'FALSE',

        # literals
        'INT_LITERAL', 'HEX_LITERAL', 'OCT_LITERAL', 'BIN_LITERAL',
        'FLOAT_LITERAL', 'IMAG_LITERAL', 'STRING_LITERAL',
        'RAW_STRING_LITERAL', 'BYTES_LITERAL', 'UNICODE_LITERAL',
        'NUMBER', 'STRING',

        # Basic Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'DOUBLEDIV', 'MOD', 'POW',
@@ -137,10 +334,10 @@ class Lexer(object):
        'COMMA', 'PERIOD', # . ,
        'SEMI', 'COLON', # ; :
        'AT', # @
        'DOLLAR', # $
        'QUESTION', # ?
        'DOUBLE_QUESTION', # ??
        'AT_LPAREN', # @(
        'DOLLAR_NAME', # $NAME
        'DOLLAR_LPAREN', # $(
        'DOLLAR_LBRACE', # ${
        'DOLLAR_LBRACKET', # $[
@@ -354,18 +551,6 @@ class Lexer(object):
    def t_STRING_LITERAL(self, t):
        return t

    @TOKEN(raw_string_literal)
    def t_RAW_STRING_LITERAL(self, t):
        return t

    @TOKEN(unicode_literal)
    def t_UNICODE_LITERAL(self, t):
        return t

    @TOKEN(bytes_literal)
    def t_BYTES_LITERAL(self, t):
        return t

    # float literal must come before int literals

    @TOKEN(imag_literal)
@@ -711,6 +711,14 @@ class Parser(object):
        """
        p[0] = p[1]

    def p_stmt_list(self, p):
        """stmt_list : stmt
                     | stmt_list stmt"""
        if len(p) == 2:
            p[0] = p[1]
        else:
            p[0] = p[1] + p[2]

    def p_semi_opt(self, p):
        """semi_opt : SEMI
                    | empty
@@ -1199,11 +1207,9 @@ class Parser(object):
    def p_suite(self, p):
        """suite : simple_stmt
                 | NEWLINE indented_stmt DEDENT
                 | NEWLINE indented_stmt_list
                 | NEWLINE indented_stmt_list DEDENT
                 | NEWLINE INDENT stmt_list DEDENT
        """
        p[0] = p[1] if len(p) == 2 else p[2]
        p[0] = p[1] if len(p) == 2 else p[3]
        if len(p) < 4:
            self.lineno += 1 # needs to be at the end
@@ -1505,7 +1511,7 @@ class Parser(object):
                | TRUE
                | FALSE
                | REGEXPATH
                | DOLLAR NAME
                | DOLLAR_NAME
                | DOLLAR_LBRACE test RBRACE
                | DOLLAR_LPAREN subproc RPAREN
                | DOLLAR_LBRACKET subproc RBRACKET
@@ -1576,11 +1582,7 @@ class Parser(object):
        p[0] = p0

    def p_string_literal(self, p):
        """string_literal : STRING_LITERAL
                          | RAW_STRING_LITERAL
                          | UNICODE_LITERAL
                          | BYTES_LITERAL
        """
        """string_literal : STRING"""
        s = eval(p[1])
        cls = ast.Bytes if p[1].startswith('b') else ast.Str
        p[0] = cls(s=s, lineno=self.lineno, col_offset=self.col)
@@ -1594,14 +1596,8 @@ class Parser(object):
        p[0] = p[1]

    def p_number(self, p):
        """number : INT_LITERAL
                  | HEX_LITERAL
                  | OCT_LITERAL
                  | BIN_LITERAL
                  | FLOAT_LITERAL
                  | IMAG_LITERAL
        """
        p[0] = ast.Num(n=p[1], lineno=self.lineno, col_offset=self.col)
        """number : NUMBER"""
        p[0] = ast.Num(n=eval(p[1]), lineno=self.lineno, col_offset=self.col)

    def p_testlist_comp(self, p):
        """testlist_comp : test_or_star_expr comp_for
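With the grammar collapsed onto the tokenizer's STRING and NUMBER tokens, the literal text itself is handed to Python to interpret, as the two rules above do. A standalone sketch of that conversion (the helper names are made up; ast.Str/ast.Bytes/ast.Num as they existed at the time, now deprecated aliases in newer Python):

# Illustrative only: mirrors what p_string_literal and p_number now do
# with the raw token text.  string_to_node/number_to_node are hypothetical
# names, not functions from the commit.
import ast

def string_to_node(tok_text, lineno=1, col=0):
    s = eval(tok_text)  # "'hi'" -> 'hi', "b'hi'" -> b'hi'
    cls = ast.Bytes if tok_text.startswith('b') else ast.Str
    return cls(s=s, lineno=lineno, col_offset=col)

def number_to_node(tok_text, lineno=1, col=0):
    return ast.Num(n=eval(tok_text), lineno=lineno, col_offset=col)  # '0x1f' -> 31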
@@ -2009,9 +2005,11 @@ class Parser(object):
    def p_subproc(self, p):
        """subproc : subproc_atoms
                   | subproc_atoms INDENT
                   | subproc_atoms ENDMARKER
                   | subproc AMPERSAND
                   | subproc subproc_special subproc_atoms
                   | subproc subproc_special subproc_atoms INDENT
                   | subproc subproc_special subproc_atoms ENDMARKER
        """
        lineno = self.lineno
        col = self.col
@@ -2047,7 +2045,7 @@ class Parser(object):
        """subproc_atom : subproc_arg
                        | string_literal
                        | REGEXPATH
                        | DOLLAR NAME
                        | DOLLAR_NAME
                        | AT_LPAREN test RPAREN
                        | DOLLAR_LBRACE test RBRACE
                        | DOLLAR_LPAREN subproc RPAREN
@@ -2075,7 +2073,7 @@ class Parser(object):
            else:
                assert False
        elif lenp == 3:
            p0 = self._envvar_by_name(p[2], lineno=self.lineno, col=self.col)
            p0 = self._envvar_by_name(p[2][1:], lineno=self.lineno, col=self.col)
            p0._cliarg_action = 'ensure_list'
        elif p1 == '@(':
            l = self.lineno
@@ -2132,11 +2130,7 @@ class Parser(object):
                        | NONE
                        | TRUE
                        | FALSE
                        | INT_LITERAL
                        | HEX_LITERAL
                        | OCT_LITERAL
                        | BIN_LITERAL
                        | FLOAT_LITERAL
                        | NUMBER
        """
        # Many tokens cannot be part of this list, such as $, ', ", ()
        # Use a string atom instead.
@@ -35,24 +35,33 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
    subprocess $[] starting at a minimum column. If there are no tokens
    (ie in a comment line) this returns None.
    """
    line = line if line.endswith('\n') else (line+'\n')
    if lexer is None:
        lexer = builtins.__xonsh_execer__.parser.lexer
    if maxcol is None:
        print(len(line))
        maxcol = len(line) + 1
        print(maxcol)
    print(len(line), repr(line), maxcol)
    lexer.reset()
    lexer.input(line)
    toks = []
    end_offset = 0
    for tok in lexer:
        pos = tok.lexpos
        print('TOKEN',tok)
        pos = tok.col
        if pos >= maxcol:
            print(pos,maxcol)
            print('too far')
            break
        if len(toks) > 0 and toks[-1].type == 'SEMI':
            print('semi')
            toks.clear()
        if pos < mincol:
            print('minicol')
            continue
        toks.append(tok)
        if tok.type == 'NEWLINE':
        if tok.type in ('NEWLINE', 'ENDMARKER'):
            break
    else:
        if len(toks) == 0:

@@ -60,7 +69,7 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
        if toks[-1].type == 'SEMI':
            toks.pop()
        tok = toks[-1]
        pos = tok.lexpos
        pos = tok.col
        if isinstance(tok.value, string_types):
            end_offset = len(tok.value)
        else:

@@ -68,7 +77,10 @@ def subproc_toks(line, mincol=-1, maxcol=None, lexer=None, returnline=False):
            end_offset = len(el)
    if len(toks) == 0:
        return # handle comment lines
    beg, end = toks[0].lexpos, (toks[-1].lexpos + end_offset)
    print(toks)
    beg, end = toks[0].col, (toks[-1].col + end_offset)
    print('LINE,BEG,END', (line,beg,end))
    print('LINE:',line[beg:end])
    rtn = '$[' + line[beg:end] + ']'
    if returnline:
        rtn = line[:beg] + rtn + line[end:]
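The end product of subproc_toks is easiest to see from its final wrapping step: the span of the line covered by the collected tokens gets wrapped in $[...]. A small sketch of just that step (wrap_subproc is a made-up helper; beg/end stand in for the computed token span):

# Illustrative only: the tail end of subproc_toks, once the token span
# [beg:end] on the line has been located.  wrap_subproc is hypothetical.
def wrap_subproc(line, beg, end, returnline=False):
    rtn = '$[' + line[beg:end] + ']'
    if returnline:
        rtn = line[:beg] + rtn + line[end:]
    return rtn

print(wrap_subproc('ls -l\n', 0, 5))                   # $[ls -l]
print(wrap_subproc('ls -l\n', 0, 5, returnline=True))  # $[ls -l]  plus the trailing newline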