# -*- coding: utf-8 -*-
"""Tests the xonsh lexer."""
from __future__ import unicode_literals, print_function
import os
import sys
try:
    from collections.abc import Sequence  # Python 3.3+
except ImportError:
    from collections import Sequence  # Python 2 fallback

sys.path.insert(0, os.path.abspath('..'))  # FIXME
from pprint import pformat

try:
    from ply.lex import LexToken
except ImportError:
    from xonsh.ply.lex import LexToken

from xonsh.lexer import Lexer

LEXER_ARGS = {'lextab': 'lexer_test_table', 'debug': 0}
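# `lextab` and `debug` are standard PLY lex() options: the module name used
# for the generated lexer table and the debugging flag. LEXER_ARGS is not
# consumed by the checks below, so presumably other lexer tests forward it.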


def ensure_tuple(x):
    """Converts a token, or any token-like sequence, into a comparable tuple."""
    if isinstance(x, LexToken):
        # line numbers can no longer be solely determined from the lexer
        # x = (x.type, x.value, x.lineno, x.lexpos)
        x = (x.type, x.value, x.lexpos)
    elif isinstance(x, tuple):
        pass
    elif isinstance(x, Sequence):
        x = tuple(x)
    else:
        raise TypeError('{0} is not a sequence'.format(x))
    return x


def tokens_equal(x, y):
    """Tests whether two tokens are equal."""
    xtup = ensure_tuple(x)
    ytup = ensure_tuple(y)
    return xtup == ytup


def assert_token_equal(x, y):
    """Asserts that two tokens are equal."""
    if not tokens_equal(x, y):
        msg = 'The tokens differ: {0!r} != {1!r}'.format(x, y)
        raise AssertionError(msg)
    return True


def assert_tokens_equal(x, y):
    """Asserts that two token sequences are equal."""
    if len(x) != len(y):
        msg = 'The token sequences have different lengths: {0!r} != {1!r}\n'
        msg += '# x\n{2}\n\n# y\n{3}'
        raise AssertionError(msg.format(len(x), len(y), pformat(x), pformat(y)))
    diffs = [(a, b) for a, b in zip(x, y) if not tokens_equal(a, b)]
    if len(diffs) > 0:
        msg = ['The token sequences differ: ']
        for a, b in diffs:
            msg += ['', '- ' + repr(a), '+ ' + repr(b)]
        msg = '\n'.join(msg)
        raise AssertionError(msg)
    return True


def check_token(inp, exp):
    """Lexes the input and asserts that it produces exactly one token,
    equal to the expected one.
    """
    l = Lexer()
    l.input(inp)
    obs = list(l)
    if len(obs) != 1:
        msg = 'The observed sequence does not have length 1: {0!r} != 1\n'
        msg += '# obs\n{1}'
        raise AssertionError(msg.format(len(obs), pformat(obs)))
    return assert_token_equal(exp, obs[0])


def check_tokens(inp, exp):
    """Lexes the input and asserts that the resulting token stream equals
    the expected sequence.
    """
    l = Lexer()
    l.input(inp)
    obs = list(l)
    return assert_tokens_equal(exp, obs)


def check_tokens_subproc(inp, exp):
    """Like check_tokens, but lexes the input in subprocess mode by wrapping
    it in $[...]. The wrapper's own bracket tokens are dropped, so expected
    lexpos values are offset by the two characters of '$['.
    """
    l = Lexer()
    l.input('$[{}]'.format(inp))
    obs = list(l)[1:-1]
    return assert_tokens_equal(exp, obs)
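

# A quick illustration of the comparison model above (a sketch, not an
# executed test): expectations may be given as plain tuples or as any other
# sequence, since ensure_tuple() normalizes both sides before comparing.
#
#     check_token('42', ['NUMBER', '42', 0])  # list form
#     check_token('42', ('NUMBER', '42', 0))  # tuple form, equivalent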


def test_int_literal():
    assert check_token('42', ['NUMBER', '42', 0])


def test_hex_literal():
    assert check_token('0x42', ['NUMBER', '0x42', 0])


def test_oct_o_literal():
    assert check_token('0o42', ['NUMBER', '0o42', 0])


def test_bin_literal():
    assert check_token('0b101010', ['NUMBER', '0b101010', 0])


def test_indent():
    exp = [('INDENT', '  \t  ', 0),
           ('NUMBER', '42', 5),
           ('DEDENT', '', 0)]
    assert check_tokens('  \t  42', exp)


def test_post_whitespace():
    inp = '42  \t  '
    exp = [('NUMBER', '42', 0)]
    assert check_tokens(inp, exp)


def test_internal_whitespace():
    inp = '42  +\t65'
    exp = [('NUMBER', '42', 0),
           ('PLUS', '+', 4),
           ('NUMBER', '65', 6)]
    assert check_tokens(inp, exp)


def test_indent_internal_whitespace():
    inp = ' 42  +\t65'
    exp = [('INDENT', ' ', 0),
           ('NUMBER', '42', 1),
           ('PLUS', '+', 5),
           ('NUMBER', '65', 7),
           ('DEDENT', '', 0)]
    assert check_tokens(inp, exp)
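

# Note on the whitespace tests above: the lexer emits synthetic INDENT/DEDENT
# tokens around indented code; DEDENT carries an empty value and, in these
# cases, lexpos 0.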


def test_assignment():
    inp = 'x = 42'
    exp = [('NAME', 'x', 0),
           ('EQUALS', '=', 2),
           ('NUMBER', '42', 4)]
    assert check_tokens(inp, exp)


def test_multiline():
    inp = 'x\ny'
    exp = [('NAME', 'x', 0),
           ('NEWLINE', '\n', 1),
           ('NAME', 'y', 0)]
    assert check_tokens(inp, exp)


def test_atdollar_expression():
    inp = '@$(which python)'
    exp = [('ATDOLLAR_LPAREN', '@$(', 0),
           ('NAME', 'which', 3),
           ('WS', ' ', 8),
           ('NAME', 'python', 9),
           ('RPAREN', ')', 15)]
    assert check_tokens(inp, exp)
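

# The case above covers xonsh's @$(...) command substitution, which splits
# command output into arguments; note that the opening '@$(' lexes as a
# single ATDOLLAR_LPAREN token while the closing ')' is an ordinary RPAREN.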


def test_and():
    assert check_token('and', ['AND', 'and', 0])


def test_ampersand():
    assert check_token('&', ['AMPERSAND', '&', 0])


def test_atdollar():
    assert check_token('@$', ['ATDOLLAR', '@$', 0])


def test_doubleamp():
    assert check_token('&&', ['AND', 'and', 0])


def test_pipe():
    assert check_token('|', ['PIPE', '|', 0])


def test_doublepipe():
    assert check_token('||', ['OR', 'or', 0])


def test_single_quote_literal():
    assert check_token("'yo'", ['STRING', "'yo'", 0])


def test_double_quote_literal():
    assert check_token('"yo"', ['STRING', '"yo"', 0])


def test_triple_single_quote_literal():
    assert check_token("'''yo'''", ['STRING', "'''yo'''", 0])


def test_triple_double_quote_literal():
    assert check_token('"""yo"""', ['STRING', '"""yo"""', 0])


def test_single_raw_string_literal():
    assert check_token("r'yo'", ['STRING', "r'yo'", 0])


def test_double_raw_string_literal():
    assert check_token('r"yo"', ['STRING', 'r"yo"', 0])


def test_single_unicode_literal():
    assert check_token("u'yo'", ['STRING', "u'yo'", 0])


def test_double_unicode_literal():
    assert check_token('u"yo"', ['STRING', 'u"yo"', 0])


def test_single_bytes_literal():
    assert check_token("b'yo'", ['STRING', "b'yo'", 0])


def test_regex_globs():
    for i in ('.*', r'\d*', '.*#{1,2}'):
        for p in ('', 'r', 'g', '@somethingelse'):
            c = '{}`{}`'.format(p, i)
            assert check_token(c, ['SEARCHPATH', c, 0])
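

# The SEARCHPATH cases above exercise xonsh's backtick search-path syntax:
# an optional prefix (bare or 'r' for regex matching, 'g' for glob matching,
# or '@<name>' for a custom matcher function) followed by a backtick-delimited
# pattern, with the whole construct lexing as a single SEARCHPATH token.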


def test_float_literals():
    cases = ['0.0', '.0', '0.', '1e10', '1.e42', '0.1e42', '0.5e-42',
             '5E10', '5e+42']
    for s in cases:
        assert check_token(s, ['NUMBER', s, 0])


def test_ioredir():
    cases = ['2>1', 'err>out', 'o>', 'all>', 'e>o', 'e>', 'out>', '2>&1']
    for s in cases:
        # lexpos is 2 because check_tokens_subproc wraps the input in '$[...]'
        assert check_tokens_subproc(s, [('IOREDIRECT', s, 2)])