consolidate tokenizers into a single file

2025-03-04 08:24:40 +01:00 · 2016-05-23 01:13:20 -04:00 · 2016-05-23 01:13:20 -04:00 · 60ad616344
commit 60ad616344
parent da8299b3b4
5 changed files with 58 additions and 745 deletions
--- a/setup.py
+++ b/setup.py
@ -171,7 +171,7 @@ def main():
        url='https://github.com/scopatz/xonsh',
        platforms='Cross Platform',
        classifiers=['Programming Language :: Python :: 3'],
-        packages=['xonsh', 'xonsh.ptk', 'xonsh.parsers', 'xonsh.xoreutils', 'xontrib', 'xonsh.tokenize'],
+        packages=['xonsh', 'xonsh.ptk', 'xonsh.parsers', 'xonsh.xoreutils', 'xontrib'],
        package_dir={'xonsh': 'xonsh', 'xontrib': 'xontrib'},
        package_data={'xonsh': ['*.json'], 'xontrib': ['*.xsh']},
        cmdclass=cmdclass
--- a/xonsh/environ.py
+++ b/xonsh/environ.py
@ -213,7 +213,7 @@ DEFAULT_VALUES = {
    'XONSH_HISTORY_FILE': os.path.expanduser('~/.xonsh_history.json'),
    'XONSH_HISTORY_SIZE': (8128, 'commands'),
    'XONSH_LOGIN': False,
-    'XONSH_SHOW_TRACEBACK': False,
+    'XONSH_SHOW_TRACEBACK': True,
    'XONSH_STORE_STDIN': False,
    'XONSH_STORE_STDOUT': False,
 }
--- a/xonsh/tokenize/tokenize_35.py
+++ b/xonsh/tokenize/tokenize_35.py
@ -1,11 +1,14 @@
 """Tokenization help for xonsh programs.

-This file is a modified version of tokenize.py form the Python 3.5 standard
-library (licensed under the Python Software Foundation License, version 2),
-which provides tokenization help for Python programs.
+This file is a modified version of tokenize.py from the Python 3.4 and 3.5
+standard libraries (licensed under the Python Software Foundation License,
+version 2), which provides tokenization help for Python programs.

-It is modified to properly tokenize xonsh's backtick operator and to support
-the @$ operator.
+It is modified to properly tokenize xonsh code, including backtick regex
+paths and several xonsh-specific operators.
+
+A few pieces of this file are specific to the version of Python being used.
+To find these pieces, search for PY35.

 Original file credits:
   __author__ = 'Ka-Ping Yee <ping@lfw.org>'
@ -23,15 +26,27 @@ import re
 import sys
 from token import *

+from xonsh.platform import PYTHON_VERSION_INFO
+
+
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

 import token
 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo",
-                           "REGEXPATH", "ATDOLLAR", "TokenError"]
+                           "TokenError", 'REGEXPATH', 'ATDOLLAR', 'ATEQUAL', '_tokenize']
 del token

+PY35 = PYTHON_VERSION_INFO >= (3, 5, 0)
+if PY35:
+    AUGASSIGN_OPS = r"[+\-*/%&@|^=<>]=?"
+    ADDSPACE_TOKS = (NAME, NUMBER, ASYNC, AWAIT)
+else:                 
+    AUGASSIGN_OPS = r"[+\-*/%&|^=<>]=?"
+    ADDSPACE_TOKS = (NAME, NUMBER)
+    
+
 COMMENT = N_TOKENS
 tok_name[COMMENT] = 'COMMENT'
 NL = N_TOKENS + 1
@ -40,10 +55,13 @@ ENCODING = N_TOKENS + 2
 tok_name[ENCODING] = 'ENCODING'
 N_TOKENS += 3
 REGEXPATH = N_TOKENS
-tok_name[REGEXPATH] = 'REGEXPATH'
+tok_name[N_TOKENS] = 'REGEXPATH'
 N_TOKENS += 1
 ATDOLLAR = N_TOKENS
-tok_name[ATDOLLAR] = 'ATDOLLAR'
+tok_name[N_TOKENS] = 'ATDOLLAR'
+N_TOKENS += 1
+ATEQUAL = N_TOKENS
+tok_name[N_TOKENS] = 'ATEQUAL'
 N_TOKENS += 1
 EXACT_TOKEN_TYPES = {
    '(':   LPAR,
@ -152,7 +170,7 @@ RegexPath = r"`[^\n`\\]*(?:\\.[^\n`\\]*)*`"
 # recognized as two instances of =).
 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->", r"@\$",
-                 r"[+\-*/%&@|^=<>]=?",
+                 AUGASSIGN_OPS,
                 r"~")

 Bracket = '[][(){}]'
@ -292,7 +310,7 @@ class Untokenizer:
                self.encoding = tokval
                continue

-            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
+            if toknum in ADDSPACE_TOKS:
                tokval += ' '

            # Insert a space between two consecutive strings
@ -464,34 +482,6 @@ def open(filename):
        raise


-def tokenize(readline):
-    """
-    The tokenize() generator requires one argment, readline, which
-    must be a callable object which provides the same interface as the
-    readline() method of built-in file objects.  Each call to the function
-    should return one line of input as bytes.  Alternately, readline
-    can be a callable function terminating with StopIteration:
-        readline = open(myfile, 'rb').__next__  # Example of alternate readline
-
-    The generator produces 5-tuples with these members: the token type; the
-    token string; a 2-tuple (srow, scol) of ints specifying the row and
-    column where the token begins in the source; a 2-tuple (erow, ecol) of
-    ints specifying the row and column where the token ends in the source;
-    and the line on which the token was found.  The line passed is the
-    logical line; continuation lines are included.
-
-    The first token sequence will always be an ENCODING token
-    which tells you which encoding was used to decode the bytes stream.
-    """
-    # This import is here to avoid problems when the itertools module is not
-    # built yet and tokenize is imported.
-    from itertools import chain, repeat
-    encoding, consumed = detect_encoding(readline)
-    rl_gen = iter(readline, b"")
-    empty = repeat(b"")
-    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
-
-
 def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
@ -711,6 +701,34 @@ def _tokenize(readline, encoding):
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


+def tokenize(readline):
+    """
+    The tokenize() generator requires one argument, readline, which
+    must be a callable object which provides the same interface as the
+    readline() method of built-in file objects.  Each call to the function
+    should return one line of input as bytes.  Alternately, readline
+    can be a callable function terminating with StopIteration:
+        readline = open(myfile, 'rb').__next__  # Example of alternate readline
+
+    The generator produces 5-tuples with these members: the token type; the
+    token string; a 2-tuple (srow, scol) of ints specifying the row and
+    column where the token begins in the source; a 2-tuple (erow, ecol) of
+    ints specifying the row and column where the token ends in the source;
+    and the line on which the token was found.  The line passed is the
+    logical line; continuation lines are included.
+
+    The first token sequence will always be an ENCODING token
+    which tells you which encoding was used to decode the bytes stream.
+    """
+    # This import is here to avoid problems when the itertools module is not
+    # built yet and tokenize is imported.
+    from itertools import chain, repeat
+    encoding, consumed = detect_encoding(readline)
+    rl_gen = iter(readline, b"")
+    empty = repeat(b"")
+    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
+
+
 # An undocumented, backwards compatible, API for all the places in the standard
 # library that expect to be able to use tokenize with strings
 def generate_tokens(readline):
@ -775,6 +793,3 @@ def main():
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise
-
-if __name__ == "__main__":
-    main()
--- a/xonsh/tokenize/init.py
+++ b/xonsh/tokenize/init.py
@ -1,6 +0,0 @@
-from xonsh.platform import PYTHON_VERSION_INFO
-
-if PYTHON_VERSION_INFO >= (3, 5, 0):
-    from xonsh.tokenize.tokenize_35 import *
-else:
-    from xonsh.tokenize.tokenize_34 import *
--- a/xonsh/tokenize/tokenize_34.py
+++ b/xonsh/tokenize/tokenize_34.py
@ -1,696 +0,0 @@
-"""Tokenization help for xonsh programs.
-
-This file is a modified version of tokenize.py form the Python 3.4 standard
-library (licensed under the Python Software Foundation License, version 2),
-which provides tokenization help for Python programs.
-
-It is modified to properly tokenize xonsh's backtick operator and to support
-the @$ operator.
-
-Original file credits:
-   __author__ = 'Ka-Ping Yee <ping@lfw.org>'
-   __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
-                  'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
-                  'Michael Foord')
-"""
-
-from codecs import lookup, BOM_UTF8
-import collections
-from io import TextIOWrapper
-from itertools import chain
-import re
-import sys
-from token import *
-
-cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-
-import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo",
-                           "REGEXPATH", "ATDOLLAR", "TokenError"]
-del token
-
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
-REGEXPATH = N_TOKENS
-tok_name[REGEXPATH] = 'REGEXPATH'
-N_TOKENS += 1
-ATDOLLAR = N_TOKENS
-tok_name[ATDOLLAR] = 'ATDOLLAR'
-N_TOKENS += 1
-EXACT_TOKEN_TYPES = {
-    '(':   LPAR,
-    ')':   RPAR,
-    '[':   LSQB,
-    ']':   RSQB,
-    ':':   COLON,
-    ',':   COMMA,
-    ';':   SEMI,
-    '+':   PLUS,
-    '-':   MINUS,
-    '*':   STAR,
-    '/':   SLASH,
-    '|':   VBAR,
-    '&':   AMPER,
-    '<':   LESS,
-    '>':   GREATER,
-    '=':   EQUAL,
-    '.':   DOT,
-    '%':   PERCENT,
-    '{':   LBRACE,
-    '}':   RBRACE,
-    '==':  EQEQUAL,
-    '!=':  NOTEQUAL,
-    '<=':  LESSEQUAL,
-    '>=':  GREATEREQUAL,
-    '~':   TILDE,
-    '^':   CIRCUMFLEX,
-    '<<':  LEFTSHIFT,
-    '>>':  RIGHTSHIFT,
-    '**':  DOUBLESTAR,
-    '+=':  PLUSEQUAL,
-    '-=':  MINEQUAL,
-    '*=':  STAREQUAL,
-    '/=':  SLASHEQUAL,
-    '%=':  PERCENTEQUAL,
-    '&=':  AMPEREQUAL,
-    '|=':  VBAREQUAL,
-    '^=': CIRCUMFLEXEQUAL,
-    '<<=': LEFTSHIFTEQUAL,
-    '>>=': RIGHTSHIFTEQUAL,
-    '**=': DOUBLESTAREQUAL,
-    '//':  DOUBLESLASH,
-    '//=': DOUBLESLASHEQUAL,
-    '@':   AT,
-    '@$':  ATDOLLAR,
-}
-
-class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
-    def __repr__(self):
-        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
-        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
-                self._replace(type=annotated_type))
-
-    @property
-    def exact_type(self):
-        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
-            return EXACT_TOKEN_TYPES[self.string]
-        else:
-            return self.type
-
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Xonsh-specific Regular Expression Glob Syntax
-RegexPath = r"`[^\n`\\]*(?:\\.[^\n`\\]*)*`"
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&|^=<>]=?",
-                 r"~", r"@\$")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
-
-PlainToken = group(Number, Funny, String, Name, RegexPath)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple, RegexPath)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-endpats = {"'": Single, '"': Double,
-           "'''": Single3, '"""': Double3,
-           "r'''": Single3, 'r"""': Double3,
-           "b'''": Single3, 'b"""': Double3,
-           "R'''": Single3, 'R"""': Double3,
-           "B'''": Single3, 'B"""': Double3,
-           "br'''": Single3, 'br"""': Double3,
-           "bR'''": Single3, 'bR"""': Double3,
-           "Br'''": Single3, 'Br"""': Double3,
-           "BR'''": Single3, 'BR"""': Double3,
-           "rb'''": Single3, 'rb"""': Double3,
-           "Rb'''": Single3, 'Rb"""': Double3,
-           "rB'''": Single3, 'rB"""': Double3,
-           "RB'''": Single3, 'RB"""': Double3,
-           "u'''": Single3, 'u"""': Double3,
-           "R'''": Single3, 'R"""': Double3,
-           "U'''": Single3, 'U"""': Double3,
-           'r': None, 'R': None, 'b': None, 'B': None,
-           'u': None, 'U': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "rB'''", 'rB"""',
-          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          ):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "b'", 'b"', "B'", 'B"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"' ,
-          "rb'", 'rb"', "rB'", 'rB"',
-          "Rb'", 'Rb"', "RB'", 'RB"' ,
-          "u'", 'u"', "U'", 'U"',
-          ):
-    single_quoted[t] = t
-
-tabsize = 8
-
-class TokenError(Exception): pass
-
-class StopTokenizing(Exception): pass
-
-
-class Untokenizer:
-
-    def __init__(self):
-        self.tokens = []
-        self.prev_row = 1
-        self.prev_col = 0
-        self.encoding = None
-
-    def add_whitespace(self, start):
-        row, col = start
-        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
-            raise ValueError("start ({},{}) precedes previous end ({},{})"
-                             .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
-        col_offset = col - self.prev_col
-        if col_offset:
-            self.tokens.append(" " * col_offset)
-
-    def untokenize(self, iterable):
-        it = iter(iterable)
-        for t in it:
-            if len(t) == 2:
-                self.compat(t, it)
-                break
-            tok_type, token, start, end, line = t
-            if tok_type == ENCODING:
-                self.encoding = token
-                continue
-            if tok_type == ENDMARKER:
-                break
-            self.add_whitespace(start)
-            self.tokens.append(token)
-            self.prev_row, self.prev_col = end
-            if tok_type in (NEWLINE, NL):
-                self.prev_row += 1
-                self.prev_col = 0
-        return "".join(self.tokens)
-
-    def compat(self, token, iterable):
-        indents = []
-        toks_append = self.tokens.append
-        startline = token[0] in (NEWLINE, NL)
-        prevstring = False
-
-        for tok in chain([token], iterable):
-            toknum, tokval = tok[:2]
-            if toknum == ENCODING:
-                self.encoding = tokval
-                continue
-
-            if toknum in (NAME, NUMBER):
-                tokval += ' '
-
-            # Insert a space between two consecutive strings
-            if toknum == STRING:
-                if prevstring:
-                    tokval = ' ' + tokval
-                prevstring = True
-            else:
-                prevstring = False
-
-            if toknum == INDENT:
-                indents.append(tokval)
-                continue
-            elif toknum == DEDENT:
-                indents.pop()
-                continue
-            elif toknum in (NEWLINE, NL):
-                startline = True
-            elif startline and indents:
-                toks_append(indents[-1])
-                startline = False
-            toks_append(tokval)
-
-
-def untokenize(iterable):
-    """Transform tokens back into Python source code.
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
-
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
-
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
-
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
-    """
-    ut = Untokenizer()
-    out = ut.untokenize(iterable)
-    if ut.encoding is not None:
-        out = out.encode(ut.encoding)
-    return out
-
-
-def _get_normal_name(orig_enc):
-    """Imitates get_normal_name in tokenizer.c."""
-    # Only care about the first 12 characters.
-    enc = orig_enc[:12].lower().replace("_", "-")
-    if enc == "utf-8" or enc.startswith("utf-8-"):
-        return "utf-8"
-    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
-       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
-        return "iso-8859-1"
-    return orig_enc
-
-def detect_encoding(readline):
-    """
-    The detect_encoding() function is used to detect the encoding that should
-    be used to decode a Python source file.  It requires one argument, readline,
-    in the same way as the tokenize() generator.
-
-    It will call readline a maximum of twice, and return the encoding used
-    (as a string) and a list of any lines (left as bytes) it has read in.
-
-    It detects the encoding from the presence of a utf-8 bom or an encoding
-    cookie as specified in pep-0263.  If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
-    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
-    'utf-8-sig' is returned.
-
-    If no encoding is specified, then the default of 'utf-8' will be returned.
-    """
-    try:
-        filename = readline.__self__.name
-    except AttributeError:
-        filename = None
-    bom_found = False
-    encoding = None
-    default = 'utf-8'
-    def read_or_stop():
-        try:
-            return readline()
-        except StopIteration:
-            return b''
-
-    def find_cookie(line):
-        try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
-        except UnicodeDecodeError:
-            msg = "invalid or missing encoding declaration"
-            if filename is not None:
-                msg = '{} for {!r}'.format(msg, filename)
-            raise SyntaxError(msg)
-
-        match = cookie_re.match(line_string)
-        if not match:
-            return None
-        encoding = _get_normal_name(match.group(1))
-        try:
-            codec = lookup(encoding)
-        except LookupError:
-            # This behaviour mimics the Python interpreter
-            if filename is None:
-                msg = "unknown encoding: " + encoding
-            else:
-                msg = "unknown encoding for {!r}: {}".format(filename,
-                        encoding)
-            raise SyntaxError(msg)
-
-        if bom_found:
-            if encoding != 'utf-8':
-                # This behaviour mimics the Python interpreter
-                if filename is None:
-                    msg = 'encoding problem: utf-8'
-                else:
-                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
-                raise SyntaxError(msg)
-            encoding += '-sig'
-        return encoding
-
-    first = read_or_stop()
-    if first.startswith(BOM_UTF8):
-        bom_found = True
-        first = first[3:]
-        default = 'utf-8-sig'
-    if not first:
-        return default, []
-
-    encoding = find_cookie(first)
-    if encoding:
-        return encoding, [first]
-    if not blank_re.match(first):
-        return default, [first]
-
-    second = read_or_stop()
-    if not second:
-        return default, [first]
-
-    encoding = find_cookie(second)
-    if encoding:
-        return encoding, [first, second]
-
-    return default, [first, second]
-
-
-_builtin_open = open
-
-def open(filename):
-    """Open a file in read only mode using the encoding detected by
-    detect_encoding().
-    """
-    buffer = _builtin_open(filename, 'rb')
-    encoding, lines = detect_encoding(buffer.readline)
-    buffer.seek(0)
-    text = TextIOWrapper(buffer, encoding, line_buffering=True)
-    text.mode = 'r'
-    return text
-
-
-def tokenize(readline):
-    """
-    The tokenize() generator requires one argment, readline, which
-    must be a callable object which provides the same interface as the
-    readline() method of built-in file objects.  Each call to the function
-    should return one line of input as bytes.  Alternately, readline
-    can be a callable function terminating with StopIteration:
-        readline = open(myfile, 'rb').__next__  # Example of alternate readline
-
-    The generator produces 5-tuples with these members: the token type; the
-    token string; a 2-tuple (srow, scol) of ints specifying the row and
-    column where the token begins in the source; a 2-tuple (erow, ecol) of
-    ints specifying the row and column where the token ends in the source;
-    and the line on which the token was found.  The line passed is the
-    logical line; continuation lines are included.
-
-    The first token sequence will always be an ENCODING token
-    which tells you which encoding was used to decode the bytes stream.
-    """
-    # This import is here to avoid problems when the itertools module is not
-    # built yet and tokenize is imported.
-    from itertools import chain, repeat
-    encoding, consumed = detect_encoding(readline)
-    rl_gen = iter(readline, b"")
-    empty = repeat(b"")
-    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    while True:             # loop over lines in stream
-        try:
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    nl_pos = pos + len(comment_token)
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield TokenInfo(NL, line[nl_pos:],
-                           (lnum, nl_pos), (lnum, len(line)), line)
-                else:
-                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                  # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
-                           token, spos, epos, line)
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-                elif initial == '`':
-                    # Xonsh-specific Regex Globbing
-                    yield TokenInfo(REGEXPATH, token, spos, epos, line)
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-                elif initial in single_quoted or \
-                    token[:2] in single_quoted or \
-                    token[:3] in single_quoted:
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        endprog = _compile(endpats[initial] or
-                                           endpats[token[1]] or
-                                           endpats[token[2]])
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
-
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
-def generate_tokens(readline):
-    return _tokenize(readline, None)
-
-def main():
-    import argparse
-
-    # Helper error handling routines
-    def perror(message):
-        print(message, file=sys.stderr)
-
-    def error(message, filename=None, location=None):
-        if location:
-            args = (filename,) + location + (message,)
-            perror("%s:%d:%d: error: %s" % args)
-        elif filename:
-            perror("%s: error: %s" % (filename, message))
-        else:
-            perror("error: %s" % message)
-        sys.exit(1)
-
-    # Parse the arguments and options
-    parser = argparse.ArgumentParser(prog='python -m tokenize')
-    parser.add_argument(dest='filename', nargs='?',
-                        metavar='filename.py',
-                        help='the file to tokenize; defaults to stdin')
-    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
-                        help='display token names using the exact type')
-    args = parser.parse_args()
-
-    try:
-        # Tokenize the input
-        if args.filename:
-            filename = args.filename
-            with _builtin_open(filename, 'rb') as f:
-                tokens = list(tokenize(f.readline))
-        else:
-            filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
-
-        # Output the tokenization
-        for token in tokens:
-            token_type = token.type
-            if args.exact:
-                token_type = token.exact_type
-            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
-            print("%-20s%-15s%-15r" %
-                  (token_range, tok_name[token_type], token.string))
-    except IndentationError as err:
-        line, column = err.args[1][1:3]
-        error(err.args[0], filename, (line, column))
-    except TokenError as err:
-        line, column = err.args[1]
-        error(err.args[0], filename, (line, column))
-    except SyntaxError as err:
-        error(err, filename)
-    except OSError as err:
-        error(err)
-    except KeyboardInterrupt:
-        print("interrupted\n")
-    except Exception as err:
-        perror("unexpected error: %s" % err)
-        raise
-
-if __name__ == "__main__":
-    main()