Merge pull request #2227 from xonsh/lex

Lexer splitter
Gil Forsyth 2017-02-20 11:13:44 -05:00 committed by GitHub
commit fbee493630
4 changed files with 107 additions and 2 deletions

news/lex.rst

@ -0,0 +1,17 @@
**Added:**

* The lexer has a new ``split()`` method which splits strings
  according to xonsh's rules for whitespace and quotes.

**Changed:** None

**Deprecated:** None

**Removed:** None

**Fixed:**

* The ``@$(cmd)`` operator now correctly splits strings according to
  xonsh semantics, rather than just on whitespace using ``str.split()``.

**Security:** None
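
A quick illustration of the behavior this entry describes (a sketch, not part of the diff; it assumes ``Lexer`` comes from ``xonsh.lexer``, as the tests below appear to use, and that a quoted span lexes as a single ``STRING`` token, as the string-literal tests suggest):

from xonsh.lexer import Lexer

line = 'echo "hello world" --go=$HOME'

# Plain str.split() breaks the quoted argument at the space.
print(line.split())   # ['echo', '"hello', 'world"', '--go=$HOME']

# Lexer.split() follows xonsh's quoting rules and re-glues adjacent
# pieces such as '--go=' and '$HOME' into a single string, so the
# expected result is ['echo', '"hello world"', '--go=$HOME'].
print(Lexer().split(line))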


@ -32,12 +32,14 @@ def ensure_tuple(x):
        raise TypeError('{0} is not a sequence'.format(x))
    return x


def tokens_equal(x, y):
    """Tests whether two tokens are equal."""
    xtup = ensure_tuple(x)
    ytup = ensure_tuple(y)
    return xtup == ytup


def assert_token_equal(x, y):
    """Asserts that two tokens are equal."""
    if not tokens_equal(x, y):
@ -45,6 +47,7 @@ def assert_token_equal(x, y):
        pytest.fail(msg)
    return True


def assert_tokens_equal(x, y):
    """Asserts that two token sequences are equal."""
    if len(x) != len(y):
@ -60,6 +63,7 @@ def assert_tokens_equal(x, y):
        pytest.fail(msg)
    return True


def check_token(inp, exp):
    l = Lexer()
    l.input(inp)
@ -70,41 +74,50 @@ def check_token(inp, exp):
        pytest.fail(msg.format(len(obs), pformat(obs)))
    return assert_token_equal(exp, obs[0])


def check_tokens(inp, exp):
    l = Lexer()
    l.input(inp)
    obs = list(l)
    return assert_tokens_equal(exp, obs)


def check_tokens_subproc(inp, exp):
    l = Lexer()
    l.input('$[{}]'.format(inp))
    obs = list(l)[1:-1]
    return assert_tokens_equal(exp, obs)


def test_int_literal():
    assert check_token('42', ['NUMBER', '42', 0])


def test_hex_literal():
    assert check_token('0x42', ['NUMBER', '0x42', 0])


def test_oct_o_literal():
    assert check_token('0o42', ['NUMBER', '0o42', 0])


def test_bin_literal():
    assert check_token('0b101010', ['NUMBER', '0b101010', 0])


def test_indent():
    exp = [('INDENT', '  \t  ', 0),
           ('NUMBER', '42', 5),
           ('DEDENT', '', 0)]
    assert check_tokens('  \t  42', exp)


def test_post_whitespace():
    inp = '42  \t  '
    exp = [('NUMBER', '42', 0)]
    assert check_tokens(inp, exp)


def test_internal_whitespace():
    inp = '42  +\t65'
    exp = [('NUMBER', '42', 0),
@ -112,6 +125,7 @@ def test_internal_whitespace():
           ('NUMBER', '65', 6),]
    assert check_tokens(inp, exp)


def test_indent_internal_whitespace():
    inp = ' 42  +\t65'
    exp = [('INDENT', ' ', 0),
@ -121,6 +135,7 @@ def test_indent_internal_whitespace():
           ('DEDENT', '', 0)]
    assert check_tokens(inp, exp)


def test_assignment():
    inp = 'x = 42'
    exp = [('NAME', 'x', 0),
@ -128,6 +143,7 @@ def test_assignment():
           ('NUMBER', '42', 4),]
    assert check_tokens(inp, exp)


def test_multiline():
    inp = 'x\ny'
    exp = [('NAME', 'x', 0),
@ -144,51 +160,67 @@ def test_atdollar_expression():
           ('RPAREN', ')', 15)]
    assert check_tokens(inp, exp)


def test_and():
    assert check_token('and', ['AND', 'and', 0])


def test_ampersand():
    assert check_token('&', ['AMPERSAND', '&', 0])


def test_atdollar():
    assert check_token('@$', ['ATDOLLAR', '@$', 0])


def test_doubleamp():
    assert check_token('&&', ['AND', 'and', 0])


def test_pipe():
    assert check_token('|', ['PIPE', '|', 0])


def test_doublepipe():
    assert check_token('||', ['OR', 'or', 0])


def test_single_quote_literal():
    assert check_token("'yo'", ['STRING', "'yo'", 0])


def test_double_quote_literal():
    assert check_token('"yo"', ['STRING', '"yo"', 0])


def test_triple_single_quote_literal():
    assert check_token("'''yo'''", ['STRING', "'''yo'''", 0])


def test_triple_double_quote_literal():
    assert check_token('"""yo"""', ['STRING', '"""yo"""', 0])


def test_single_raw_string_literal():
    assert check_token("r'yo'", ['STRING', "r'yo'", 0])


def test_double_raw_string_literal():
    assert check_token('r"yo"', ['STRING', 'r"yo"', 0])


def test_single_unicode_literal():
    assert check_token("u'yo'", ['STRING', "u'yo'", 0])


def test_double_unicode_literal():
    assert check_token('u"yo"', ['STRING', 'u"yo"', 0])


def test_single_bytes_literal():
    assert check_token("b'yo'", ['STRING', "b'yo'", 0])


def test_path_string_literal():
    assert check_token("p'/foo'", ['STRING', "p'/foo'", 0])
    assert check_token('p"/foo"', ['STRING', 'p"/foo"', 0])
@ -204,12 +236,36 @@ def test_regex_globs():
            c = '{}`{}`'.format(p,i)
            assert check_token(c, ['SEARCHPATH', c, 0])


@pytest.mark.parametrize('case', [
    '0.0', '.0', '0.', '1e10', '1.e42', '0.1e42', '0.5e-42', '5E10', '5e+42'])
def test_float_literals(case):
    assert check_token(case, ['NUMBER', case, 0])


def test_ioredir():
    cases = ['2>1', 'err>out', 'o>', 'all>', 'e>o', 'e>', 'out>', '2>&1']
    for s in cases:
        assert check_tokens_subproc(s, [('IOREDIRECT', s, 2)])


@pytest.mark.parametrize('s, exp', [
    ('', []),
    (' \t \n \t ', []),
    ('echo hello', ['echo', 'hello']),
    ('echo "hello"', ['echo', '"hello"']),
    ('![echo "hello"]', ['![echo', '"hello"]']),
    ('/usr/bin/echo hello', ['/usr/bin/echo', 'hello']),
    ('$(/usr/bin/echo hello)', ['$(/usr/bin/echo', 'hello)']),
    ('C:\\Python\\python.exe -m xonsh', ['C:\\Python\\python.exe', '-m', 'xonsh']),
    ('print("""I am a triple string""")', ['print("""I am a triple string""")']),
    ('print("""I am a \ntriple string""")', ['print("""I am a \ntriple string""")']),
    ('echo $HOME', ['echo', '$HOME']),
    ('echo -n $HOME', ['echo', '-n', '$HOME']),
    ('echo --go=away', ['echo', '--go=away']),
    ('echo --go=$HOME', ['echo', '--go=$HOME']),
])
def test_lexer_split(s, exp):
    lexer = Lexer()
    obs = lexer.split(s)
    assert exp == obs


@ -839,8 +839,13 @@ def subproc_captured_stdout(*cmds):
def subproc_captured_inject(*cmds):
    """Runs a subprocess, capturing the output. Returns a list of
-    whitespace-separated strings in the stdout that was produced."""
-    return [i.strip() for i in run_subproc(cmds, captured='stdout').split()]
+    whitespace-separated strings of the stdout that was produced.
+    The string is split using xonsh's lexer, rather than Python's str.split()
+    or shlex.split().
+    """
+    s = run_subproc(cmds, captured='stdout')
+    toks = builtins.__xonsh_execer__.parser.lexer.split(s)
+    return toks


def subproc_captured_object(*cmds):

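For context, here is a minimal sketch of what the new ``subproc_captured_inject`` path does, with ``run_subproc`` replaced by a stub returning an invented stdout string; ``fake_run_subproc`` and ``captured_inject_sketch`` are hypothetical names, and the real implementation reaches the lexer through ``builtins.__xonsh_execer__`` rather than building a ``Lexer`` directly:

from xonsh.lexer import Lexer

def fake_run_subproc(cmds, captured='stdout'):
    # Stand-in for xonsh's run_subproc(); this output string is made up.
    return 'ls "my docs" -l'

def captured_inject_sketch(*cmds):
    s = fake_run_subproc(cmds, captured='stdout')
    # The same splitting step the patch adds, minus the execer plumbing.
    return Lexer().split(s)

# With the old str.split() this would come back as ['ls', '"my', 'docs"', '-l'];
# with the lexer-based split the quoted span should stay together:
# ['ls', '"my docs"', '-l']
print(captured_inject_sketch('ls'))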

@ -334,6 +334,33 @@ class Lexer(object):
            yield t
            t = self.token()

    def split(self, s):
        """Splits a string into a list of strings which are whitespace-separated
        tokens.
        """
        vals = []
        self.input(s)
        l = c = -1
        ws = 'WS'
        nl = '\n'
        for t in self:
            if t.type == ws:
                continue
            elif l < t.lineno:
                vals.append(t.value)
            elif len(vals) > 0 and c == t.lexpos:
                vals[-1] = vals[-1] + t.value
            else:
                vals.append(t.value)
            nnl = t.value.count(nl)
            if nnl == 0:
                l = t.lineno
                c = t.lexpos + len(t.value)
            else:
                l = t.lineno + nnl
                c = len(t.value.rpartition(nl)[-1])
        return vals

#
# All the tokens recognized by the lexer
#
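
To make the line/column bookkeeping in ``split()`` concrete, here is a self-contained sketch of the same gluing rule run over a hand-built token stream; the token type names below are just labels for the example, not necessarily the lexer's real token names:

from collections import namedtuple

Tok = namedtuple('Tok', ['type', 'value', 'lineno', 'lexpos'])

def split_like(tokens, nl='\n'):
    """Reproduces the gluing rule from Lexer.split() on a ready-made token list."""
    vals = []
    l = c = -1
    for t in tokens:
        if t.type == 'WS':
            continue                       # whitespace only separates pieces
        elif l < t.lineno:
            vals.append(t.value)           # first token on a new line
        elif vals and c == t.lexpos:
            vals[-1] = vals[-1] + t.value  # starts where the last token ended: glue
        else:
            vals.append(t.value)           # same line, but whitespace in between
        # remember the line and column where this token ends
        nnl = t.value.count(nl)
        if nnl == 0:
            l = t.lineno
            c = t.lexpos + len(t.value)
        else:
            l = t.lineno + nnl
            c = len(t.value.rpartition(nl)[-1])
    return vals

# 'echo --go=$HOME': 'echo' is followed by whitespace, while the pieces of
# '--go=$HOME' start exactly where the previous piece ends, so they are merged
# back into one string, matching the ('echo --go=$HOME', ['echo', '--go=$HOME'])
# test case above.
toks = [Tok('NAME', 'echo', 1, 0), Tok('WS', ' ', 1, 4),
        Tok('MINUS', '-', 1, 5), Tok('MINUS', '-', 1, 6),
        Tok('NAME', 'go', 1, 7), Tok('EQUALS', '=', 1, 9),
        Tok('DOLLAR_NAME', '$HOME', 1, 10)]
assert split_like(toks) == ['echo', '--go=$HOME']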