#!/usr/bin/env python3

# Dump a Python source file as a normalized token stream, so that two source
# files can be compared by tokens, ignoring whitespace (other than
# indentation) and comments

import sys
import re


class PyToken:
    """A single token of Python source, identified by its `type`.

    `type` is either one of the numeric constants below or, for operators
    and punctuation, the symbol text itself (e.g. '+=').  `n_line` is the
    line number supplied by whoever created the token; it is informational
    only and is deliberately ignored when comparing tokens.
    """

    # Structural tokens
    INDENT = 1
    OUTDENT = 2
    ENDLINE = 3
    # Tokens that carry a value (see the subclasses)
    WORD = 100
    INT = 101
    FLOAT = 102
    STRING = 103

    def __init__(self, type, n_line):
        self.type = type
        self.n_line = n_line

    def __str__(self):
        """Return a printable form: a placeholder for structural tokens,
        otherwise the string form of `type`."""
        if self.type == PyToken.INDENT:
            return '<INDENT>'
        if self.type == PyToken.OUTDENT:
            return '<OUTDENT>'
        if self.type == PyToken.ENDLINE:
            return '<EOL>'
        return str(self.type)

    def __eq__(self, other):
        """Two tokens are equal when their types match.

        Comparing against something that is not a token yields False
        instead of failing on the missing `type` attribute.  A plain bool
        is returned (rather than NotImplemented) because the subclasses
        test this result with `not`.
        """
        if not isinstance(other, PyToken):
            return False
        return self.type == other.type

    def __hash__(self):
        # Keep hashing consistent with __eq__, which only looks at `type`.
        # Subclasses that override __eq__ without defining __hash__ remain
        # unhashable, as before.
        return hash(self.type)


class WordToken(PyToken):
    """An identifier-like token.

    Keywords and ordinary names are not told apart; both are stored as a
    WORD token carrying the matched text in `word`.
    """

    def __init__(self, word, n_line):
        super().__init__(PyToken.WORD, n_line)
        self.word = word

    def __str__(self):
        return self.word

    def __eq__(self, other):
        # Same token type first, then the same text
        return super().__eq__(other) and self.word == other.word


class IntToken(PyToken):
    """An integer literal, stored as its numeric `value`.

    Digit-group underscores are dropped before conversion, and the radix
    is taken from the literal's own prefix (base 0 conversion).
    """

    def __init__(self, value, n_line):
        super().__init__(PyToken.INT, n_line)
        digits = value.replace('_', '')
        try:
            self.value = int(digits, 0)
        except ValueError:
            # Anything not starting with '0' is a genuine error
            if not value.startswith('0'):
                raise
            # Support Python 2.x octal literals
            self.value = int(digits, 8)

    def __str__(self):
        return str(self.value)

    def __eq__(self, other):
        # Same token type first, then the same numeric value
        return super().__eq__(other) and self.value == other.value


class FloatToken(PyToken):
    """A floating point literal, stored as its numeric `value`.

    Digit-group underscores are dropped before the text is converted
    with float().
    """

    def __init__(self, value, n_line):
        super().__init__(PyToken.FLOAT, n_line)
        digits = value.replace('_', '')
        self.value = float(digits)

    def __str__(self):
        return str(self.value)

    def __eq__(self, other):
        # TODO: Might need some fuzz -- values are compared exactly for now
        return super().__eq__(other) and self.value == other.value


class StringToken(PyToken):
    """A string literal, reduced to a normalized prefix and content.

    The normalization lets literals that differ only in prefix
    case/order or in quote/whitespace escaping compare equal.
    """

    # Replacements applied to the content, in this exact order: unescape
    # then re-escape single quotes, unescape double quotes, and spell out
    # literal tab / newline / carriage return characters as escapes.
    _NORMALIZE_STEPS = (
        ("\\'", "'"),
        ("'", "\\'"),
        ('\\"', '"'),
        ('\t', '\\t'),
        ('\n', '\\n'),
        ('\r', '\\r'),
    )

    def __init__(self, prefix, content, n_line):
        super().__init__(PyToken.STRING, n_line)

        # Lower-case and sort the prefix letters so e.g. 'Rb' == 'br'
        self.prefix = ''.join(sorted(prefix.lower()))

        normalized = content
        for old, new in StringToken._NORMALIZE_STEPS:
            normalized = normalized.replace(old, new)
        self.content = normalized

    def __str__(self):
        return "{}'{}'".format(self.prefix, self.content)

    def __eq__(self, other):
        # Same token type first, then the same normalized prefix and content
        return super().__eq__(other) \
            and (self.prefix, self.content) == (other.prefix, other.content)


RE_WHITESPACE = re.compile(r'\s+')
RE_WORD = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
# The prefixed forms (hex, binary, octal) must come before the plain decimal
# alternative: regex alternation takes the first branch that matches, and the
# decimal branch would otherwise consume just the leading '0' of '0x1F'.
RE_INT = re.compile(r'0[Xx][0-9A-Fa-f_]+|0[Bb][0-1_]+|0[Oo][0-7_]+|[0-9][0-9_]*')
# Either a number containing a decimal point (with an optional exponent), or
# digits followed directly by an exponent (e.g. '1e5'), which has no point.
RE_FLOAT = re.compile(
    r'(([0-9][0-9_]*)?\.[0-9][0-9_]*|[0-9][0-9_]*\.)([eE][+-]?[0-9][0-9_]*)?'
    r'|[0-9][0-9_]*[eE][+-]?[0-9][0-9_]*')
# Group 1 is the optional string prefix, group 2 the opening quote sequence
RE_START_STRING = re.compile(r'([rR][fFbB]?|[uU]|[fF][rR]?|[bB][rR]?)?(\'\'\'|\'|"""|")')

# Note, tokens sharing a common prefix should be entered in order from
# longest to shortest, so we don't mismatch a long token as a sequence
# of shorter tokens
SYMBOLIC_TOKENS = (
    '<<=', '>>=', '**=', '//=', '...', '.',
    '+=', '-=', '*=', '@=', '/=', '%=', '&=', '|=', '^=',
    '<>', '<<', '<=', '<', '>>', '>=', '>', '!=', '==', '=',
    ',', ';', ':=', ':', '->', '~', '`',
    '+', '-', '**', '*', '@', '//', '/', '%', '&', '|', '^',
    '(', ')', '{', '}', '[', ']',
)
def symbolic_token(line, n_line):
    """Return a PyToken for the operator/punctuation at the start of `line`.

    The candidates in SYMBOLIC_TOKENS are tried in their listed order and
    the first one that `line` starts with wins; the token's type is the
    symbol text itself.  Returns None when `line` starts with no known
    symbol.
    """
    matched = next((sym for sym in SYMBOLIC_TOKENS if line.startswith(sym)), None)
    if matched is None:
        return None
    return PyToken(matched, n_line)


def string_token(line, n_line, pysrc):
    """Read a string literal starting at the beginning of `line`.

    line   -- remaining text of the current source line
    n_line -- number of the current source line
    pysrc  -- source object; further lines are pulled from it with
              readline() when the closing quotes are not on `line`

    Returns None when `line` does not start with a (possibly prefixed)
    string literal.  Otherwise returns a StringToken carrying two extra
    attributes for the caller: `rem_line`, the text following the closing
    quotes, and `end_line`, the number of the line on which the string
    ended (also used as the token's own line number).

    Raises RuntimeError if the input ends before the closing quotes.
    """
    match = RE_START_STRING.match(line)
    if not match:
        return None

    # Look for the end of the string
    prefix = match.group(1) or ''
    quotes = match.group(2)
    start = len(prefix) + len(quotes)
    pieces = []
    while True:
        end = line.find(quotes, start)
        if end >= 0:
            # A quote is escaped only when preceded by an odd number of
            # backslashes; an even run (e.g. the literal '\\') is made of
            # escaped backslashes and leaves the quote free to close.
            backslashes = 0
            while end - backslashes > start and line[end - backslashes - 1] == '\\':
                backslashes += 1
            if backslashes % 2 == 1:
                # Keep the escaped quote as content and search on past it
                pieces.append(line[start:end + 1])
                start = end + 1
                continue
            pieces.append(line[start:end])
            break

        # Read in a new line
        pieces.append(line[start:])
        line = pysrc.readline()
        n_line += 1
        start = 0
        if not line:
            raise RuntimeError('Reached EOF while looking for {}'.format(repr(quotes)))

    token = StringToken(prefix, ''.join(pieces), n_line)
    token.rem_line = line[end + len(quotes):]
    token.end_line = n_line
    return token


def read_tokens(pysrc):
    """Generate the tokens of the Python source read from `pysrc`.

    `pysrc` must provide readline(), returning an empty string at end of
    input.  Blank lines and comments are skipped.  Outside of brackets, a
    change of leading whitespace yields INDENT / OUTDENT tokens and each
    line is closed with an ENDLINE token; inside (), [] or {} neither is
    produced.  A line ending in a backslash is joined with the next
    non-blank, non-comment line: no ENDLINE is produced for it and the
    following line's indentation is not examined.

    Raises RuntimeError on inconsistent indentation, mismatched brackets,
    text that matches no token, or (via string_token) an unterminated
    string.
    """
    openers = {'(', '{', '['}
    # Each closing bracket and the opening bracket it has to pair with
    closers = {')': '(', '}': '{', ']': '['}

    indent_stack = [0]
    context_stack = []
    n_line = 0
    continued = False   # previous line ended with a backslash continuation

    while True:
        line = pysrc.readline()
        n_line += 1
        if not line:
            break

        if not line.strip() or line.lstrip().startswith('#'):
            continue

        # Look for indentation changes
        if not context_stack and not continued:
            indent = len(line) - len(line.lstrip())
            if indent > indent_stack[-1]:
                indent_stack.append(indent)
                yield PyToken(PyToken.INDENT, n_line)
            while indent < indent_stack[-1]:
                indent_stack.pop()
                yield PyToken(PyToken.OUTDENT, n_line)
            if indent != indent_stack[-1]:
                raise RuntimeError('Incorrect indentation on line {}'.format(n_line))
        continued = False

        while True:
            line = line.lstrip()
            if not line:
                break
            if line[0] == '#':
                # The rest of this line is a comment
                break
            if line[0] == '\\' and not line[1:].strip():
                # Explicit line continuation: the logical line carries on
                continued = True
                break

            # Floats are tried before symbols; otherwise the '.' symbol
            # would split a literal such as '.5' into '.' and '5'.
            match = RE_FLOAT.match(line)
            if match:
                yield FloatToken(match.group(), n_line)
                line = line[match.end():]
                continue

            token = symbolic_token(line, n_line)
            if token:
                if token.type in openers:
                    context_stack.append(token.type)
                elif token.type in closers:
                    if not context_stack or context_stack[-1] != closers[token.type]:
                        raise RuntimeError('Mismatched token at {} on line {}'.format(line, n_line))
                    context_stack.pop()
                yield token
                line = line[len(token.type):]
                continue

            match = RE_INT.match(line)
            if match:
                yield IntToken(match.group(), n_line)
                line = line[match.end():]
                continue

            token = string_token(line, n_line, pysrc)
            if token:
                # The string may have spanned several lines; resume after it
                line = token.rem_line
                n_line = token.end_line
                yield token
                continue

            match = RE_WORD.match(line)
            if match:
                yield WordToken(match.group(), n_line)
                line = line[match.end():]
                continue

            raise RuntimeError('Error: Unrecognized tokens: "{}" at line {}'.format(line, n_line))

        if not context_stack and not continued:
            yield PyToken(PyToken.ENDLINE, n_line)


if __name__ == '__main__':
    if '--help' in sys.argv:
        print('Usage: token_dump <file>.py')
        sys.exit(0)

    # Read from the file named on the command line, or from stdin if none
    if len(sys.argv) >= 2:
        pysrc = open(sys.argv[1], 'r')
    else:
        pysrc = sys.stdin

    # try/finally so the input is closed even when tokenizing raises
    try:
        for tok in read_tokens(pysrc):
            # Structural tokens end the output line; all other tokens are
            # printed side by side, separated by a space
            if tok.type in {PyToken.ENDLINE, PyToken.INDENT, PyToken.OUTDENT}:
                print(tok)
            else:
                print(tok, end=' ')
    finally:
        pysrc.close()
