#!/usr/bin/env python

"""AttaCut: Fast and Reasonably Accurate Word Tokenizer for Thai

Usage:
  attacut-cli <src> [--dest=<dest>] [--model=<model>]
  attacut-cli (-v | --version)
  attacut-cli (-h | --help)

Options:
  -h --help         Show this screen.
  --model=<model>   Model to be used [default: attacut-sc].
  --dest=<dest>     If not specified, it'll be <src>-tokenized-by-<model>.txt
  --version         Show version

"""

import os
import torch

from docopt import docopt
from tqdm import tqdm

from attacut import utils, artifacts, preprocessing, Tokenizer, __version__

# Delimiter placed between tokens when a tokenized line is written out.
SEP = "|"

def get_argument(dict, name, default):
    """Look up ``name`` in ``dict``, substituting ``default`` for ``None``.

    Only a missing key or an explicit ``None`` value triggers the fallback;
    other falsy values (``""``, ``0``, ``False``) are returned unchanged.

    Args:
        dict: Mapping to read from (anything exposing ``.get``).
        name: Key to look up.
        default: Value returned when the key is absent or maps to ``None``.

    Returns:
        The stored value, or ``default`` when that value is ``None``.
    """
    value = dict.get(name)
    if value is None:
        return default
    return value
  
if __name__ == "__main__":
    # The module docstring doubles as the CLI specification parsed by docopt.
    arguments = docopt(__doc__)

    if arguments["--version"] or arguments["-v"]:
        print(f"AttaCut: version {__version__}")
        # Same effect as sys.exit(0), without reaching through the
        # undocumented ``os.sys`` attribute.
        raise SystemExit(0)

    src = arguments["<src>"]
    model = arguments["--model"]

    # For a custom model given as a path, use the last directory's name.
    # Trailing slashes are stripped first so "path/to/model/" yields "model"
    # instead of an empty name (which produced "<src>-tokenized-by-.txt").
    model_name = model.rstrip("/").split("/")[-1]

    # Fall back to a destination derived from the source path and model name
    # when --dest is not given.
    dest = get_argument(
        arguments,
        "--dest",
        utils.add_suffix_to_file_path(
            src,
            f"-tokenized-by-{model_name}"
        )
    )

    print(f"Tokenizing {src}")
    print(f"Using {model}")
    print(f"Output: {dest}")

    tokenizer = Tokenizer(model)
    # Only used as the progress bar's total.
    total_lines = utils.wc_l(src)

    # NOTE(review): both files are opened with the platform default encoding;
    # confirm whether an explicit encoding="utf-8" is wanted for Thai text.
    with torch.no_grad(), \
        open(src, "r") as fin, \
        open(dest, "w") as fout:
        # TODO: dataloader generator
        # TODO: tokenize by batch and write results
        for line in tqdm(fin, total=total_lines):

            # Presumably strips trailing whitespace (including the newline);
            # see preprocessing.TRAILING_SPACE_RX.
            line = preprocessing.TRAILING_SPACE_RX.sub("", line)

            # Keep empty lines so output line numbers match the input.
            if not line:
                fout.write("\n")
                continue

            words = tokenizer.tokenize(line)
            tokenized_line = SEP.join(words)

            fout.write(f"{tokenized_line}\n")
