import re
from typing import List

# Punctuation marks; each character is emitted as its own token by
# TetunStandardTokenizer (the string is folded into a regex character class).
PUNCTUATIONS = '.,:;?!-"“”/\\<>()[]{}'
# Symbol characters, handled the same way as PUNCTUATIONS.
# NOTE(review): the trailing '<<>>' only repeats '<' and '>' (already present in
# PUNCTUATIONS); possibly intended to be the guillemets « » -- confirm.
SPECIAL_CHARS = '#$€%@*&_|=+^`~<<>>'


class TetunRegexTokenizer:
    """Tokenize text using a regular expression.

    The expression describes either the tokens themselves (``split=False``,
    applied with ``re.findall``) or the delimiters between the tokens
    (``split=True``, applied with ``re.split``).
    """

    def __init__(self, patterns: str, split: bool = False) -> None:
        """
        :param patterns: a regular expression to match the tokens
            (or the delimiters, when ``split`` is True).
        :param split: if True, use re.split() to tokenize text, else use re.findall().
        """
        self.patterns = patterns
        self.split = split

    def tokenize(self, text: str) -> List[str]:
        """
        :param text: the text to be tokenized.
        :return: a list of tokens. In split mode, empty pieces are discarded.
        """
        if not self.split:
            return re.findall(self.patterns, text)
        # re.split() yields '' when the text is empty or starts/ends with a
        # delimiter, and None for capturing groups that did not take part in
        # a match; neither is a meaningful token, so drop them.
        return [piece for piece in re.split(self.patterns, text) if piece]


class TetunStandardTokenizer(TetunRegexTokenizer):
    """ Tokenize text into words, numbers, punctuation marks and special characters. """

    def __init__(self) -> None:
        patterns = (
            # Words, optionally joined by a hyphen or an apostrophe,
            # e.g.: Área, área, ne'e, Ne'ebé, kompañia, ida-ne'e, ida-ne'ebé, etc.
            # Uppercase accented letters are listed explicitly; without them
            # "Área" would be tokenized as "rea".
            r"[A-Za-zÁÉÍÓÚÑáéíóúñ]+(?:[-’'][A-Za-zÁÉÍÓÚÑáéíóúñ]+)*"
            r"|"
            # Numbers with optional '.'/',' separated digit groups. A separator
            # must be followed by a digit, so the full stop in "2020." stays a
            # separate punctuation token.
            r"\d+(?:[.,]\d+)*"
            r"|"
            # Any single punctuation mark or special character.
            r"[" + re.escape(PUNCTUATIONS + SPECIAL_CHARS) + r"]"
        )
        super().__init__(patterns)


class TetunWhiteSpaceTokenizer(TetunRegexTokenizer):
    """ Split text on runs of whitespace characters. """

    def __init__(self) -> None:
        # The pattern describes the delimiter, hence split mode.
        super().__init__(r"\s+", split=True)


class TetunBlankLineTokenizer(TetunRegexTokenizer):
    """ Tokenize a text, treating any sequence of blank lines as a delimiter. """

    def __init__(self, split: bool = True) -> None:
        """
        :param split: accepted for backward compatibility only and ignored;
            the pattern describes a delimiter, so split mode is always used.
        """
        # Two newlines, optionally surrounded/separated by other whitespace.
        # The leading "\s*" must be escaped: a bare "s*" would swallow literal
        # 's' characters at the end of the preceding paragraph.
        patterns = r"\s*\n\s*\n\s*"
        super().__init__(patterns, split=True)


class TetunSimpleTokenizer(TetunRegexTokenizer):
    """ Tokenize strings and numbers and ignore punctuations and special characters. """

    def __init__(self) -> None:
        patterns = (
            # Words, optionally joined by a hyphen or an apostrophe
            # (e.g.: Área, ne'e, ida-ne'ebé). Uppercase accented letters are
            # listed explicitly so that capitalised words are kept whole.
            r"[A-Za-zÁÉÍÓÚÑáéíóúñ]+(?:[-’'][A-Za-zÁÉÍÓÚÑáéíóúñ]+)*"
            r"|"
            # Numbers with optional '.'/',' separated digit groups. A separator
            # must be followed by a digit, so trailing punctuation such as the
            # full stop in "2020." is not absorbed into the number token.
            r"\d+(?:[.,]\d+)*"
        )
        super().__init__(patterns)


class TetunWordTokenizer(TetunRegexTokenizer):
    """ Tokenize strings and ignore numbers, punctuations and special characters. """

    def __init__(self) -> None:
        # Words, optionally joined by a hyphen or an apostrophe
        # (e.g.: Área, ne'e, ida-ne'ebé). Uppercase accented letters are
        # listed explicitly; without them "Área" would be tokenized as "rea".
        patterns = (
            r"[A-Za-zÁÉÍÓÚÑáéíóúñ]+(?:[-’'][A-Za-zÁÉÍÓÚÑáéíóúñ]+)*"
        )
        super().__init__(patterns)
