#!/usr/bin/env python3

import itermae

import argparse
import re
import sys
import regex
import time

# Using argparse module to define the arguments

# Name and description of this program
# Build the command-line interface. Every option below is optional as far
# as argparse is concerned; the checks that operations and outputs were
# actually supplied happen after parsing.
# Note that argparse's default help formatter re-wraps the description, so
# the blank line embedded below is not preserved in the rendered help.
parser = argparse.ArgumentParser(description=""+
    "itermae - Tool for iteratively chopping up each FASTQ read using "+
    "fuzzy regular expressions, for many many reads."+
    "\n\n"+
    "For best performance, we recommend you use this with GNU parallel, "+
    "see examples in the README.")

# Input determination 
parser.add_argument("--input",default="STDIN",
    help="We expect this flag is normally NOT USED, and so I will expect "+
        "FASTQ(Z) input on the STDIN. However, you can specify a path "+
        "using this flag to read from a file. This is not parallelized, "+
        "so it's slower. But it's there. "+
        "Importantly, ALL SEQUENCE is coerced to uppercase letters before "+
        "matching, for speed. So keep it uppercase in the regex.")
# Is it gzip'd ?
parser.add_argument("-z","--gzipped",action="store_true",
    help="This flag indicates that input, be it STDIN or a file path, "+
        "is GZIPPED FASTQ. Note that this implies that any STDIN usage is "+
        "not parallel, so we do not expect this to be used in combination "+
        "with the STDIN input. But it's there.")

# Output determination
parser.add_argument("--output",default="STDOUT",
    help="we expect this flag is normally NOT USED, and so output would "+
        "be sent to STDOUT. However, it's here in case you'd like to write "+
        "it to a file. Note I do not add a suffix for you.")
parser.add_argument("-of","--output-format",default='sam',
    help="The output format specification. Default is an unmapped sam "+
        "('sam') so that the tabular nature can permit joining with other "+
        "parsed output from the reverse read, but you can also specify "+
        "'fastq' to get a FASTQ back.")
parser.add_argument("--failed",default=None,
    help="Optional name of output file for failed reads, for debugging. "+
        "If you say 'STDOUT' then it'll go there, so you could use that "+
        "to collect failed and passed reads in the same 'sam' file for "+
        "example.")
# NOTE(review): the help below refers to a '--limit' argument, but no such
# option is defined on this parser in this file -- confirm whether it
# should be added or the reference removed.
parser.add_argument("--report",default=None,
    help="Add this flag with a filename to print a report of per-read "+
        "statistics. That's a lot of disk writes btw, but "+
        "would be good in combination with a '--limit' argument "+
        "so that you can spec out the kinds of noise you got "+
        "in your data and debug the running.")

# verbosity, counted so that repeating -v raises the level
parser.add_argument("-v","--verbose",action="count",default=0,
    help="How much debugging issues should I pipe out to STDERR? "+
        "None is nothing, "+
        "-v is setup messages and start-stop messages, "+
        "-v -v is worker-level details, "+
        "-v -v -v is chop-level details, "+
        "-v -v -v -v is each operation level details. "+
        "Keep in mind these are going to STDERR.")

### CLI operation and output specification
# Operations, repeatable and kept in the order given
parser.add_argument("-o","--operation",action="append",
    help="The pattern to match and extract. This has a specific and sort "+
        "of complicated syntax. Copy and tweak examples, and/or refer to "+
        "the documentation via the README.md file, or to the original "+
        "regex module documentation. Each operation is done in the order "+
        "you specify, so later operations can use previous matching "+
        "groups as inputs. But, you can't name a group 'input' or "+
        "'dummyspacer', I'm using those names ! "+
        "Importantly, WRITE ALL DNA BASES AS UPPERCASE. I won't coerce "+
        "it in the regex because that messes up group names.")
# Filter specification, repeatable
parser.add_argument("-f","--filter",action="append",
    help="Filters for eliminating reads that don't pass some criteria. "+
        "You write these based on the groups captured in the operations "+
        "above. So a group named 'group' would have a start at "+
        "'group_start', same for the 'group_end' and 'group_length'. "+
        "You're welcome. Also, `statistics` package is loaded, so you "+
        "can use those expressions for means, medians, etc.")
# Outputs, repeatable; IDs and seqs are paired up by position
parser.add_argument("-oid","--output-id",action="append",
    help="A list of output ID definitions, in the same order as for "+
        "output-seq (see that one, below probably). This is evaluated for "+
        "reads that pass filter. You can access 'input.id' to get the "+
        "original FASTQ read ID, and 'group.seq' to get the sequence of a "+
        "particular group, so for example you could do "+
        "'input.id+\"_\"+index.seq' to append the index sequence to the "+
        "FASTQ ID. Unlike output-seq, this is a string not a Sequence "+
        "object.")
parser.add_argument("-oseq","--output-seq",action="append",
    help="A list of output seq definitions, in the same order as for "+
        "output-ids. This is evaluated on the groups being Biopython "+
        "SeqRecords, so you can paste them together like "+
        "'sample+barcode' or 'sample+dummyspacer+barcode+dummyspacer+umi' "+
        "if you'd like a spacer X character to delimit them specifically. "+
        "You can't access .id or _length properties or the like, just "+
        "groups. Unlike output-id, this is a Sequence object not a string.")

### Parse the arguments, check them.

args = parser.parse_args()

# Operations and outputs are each kept as an ordered list of two-element
# lists: [input name, compiled regex] for operations, and
# [output ID expression, output sequence expression] for outputs.
operations_array = []
outputs_array = []

# Shared complaint for any operation we cannot make sense of.
operations_complaint = (
    "Wait a second, I don't understand the operations to be done! "+
    "Are there any? Maybe there's small part I'm choking on? Maybe "+
    "try adding steps in one at a time in an interactive context with "+
    "'--limit' set, to debug easier. Exiting...")

# With no '--operation' given at all, argparse leaves this as None.
if args.operation is None:
    print(operations_complaint,file=sys.stderr)
    sys.exit(1)

# Here we read on through to add the operations on, and complain loudly if
# someone tries to use our reserved names.
for each in args.operation:

    # These group names are used internally, so they can't be captured
    # into. The exit happens outside of any try block, so that only this
    # specific complaint is shown.
    for reserved_name in ("dummyspacer", "input"):
        if "<"+reserved_name+">" in each:
            print("Hey, you can't name a capture group "+
                "'"+reserved_name+"', I'm using that! "+
                "Pick a different name.",
                file=sys.stderr)
            sys.exit(1)

    try:
        # Here, we use the first ` > ` to specify the flow of input to
        # the regex. Splitting only once means a later ` > ` inside the
        # regex itself stays part of the regex.
        (input_string, regex_string) = re.split(
            r"\s>\s", each.strip(), maxsplit=1)
        compiled_regex = regex.compile(
            regex_string.strip(), # We use this regex
            regex.BESTMATCH # And we use the BESTMATCH strategy, I think
            )
    except Exception as error:
        # A missing ` > ` fails the unpacking above, and a malformed
        # pattern fails to compile. Catching Exception rather than
        # everything lets SystemExit and KeyboardInterrupt pass through.
        print("Trouble with the operation '"+each+"' : "+str(error),
            file=sys.stderr)
        print(operations_complaint,file=sys.stderr)
        sys.exit(1)

    # append that to the operations array
    operations_array.append( [input_string.strip(), compiled_regex] )

# Next we build the array of outputs, by pairing each output seq
# specification with the output ID specification at the same position.
# We need at least one output sequence, and if output IDs are given at all
# then there has to be one for each output sequence.
if args.output_seq is None or (
        args.output_id is not None and
        len(args.output_id) < len(args.output_seq)
        ):
    print("Wait a second, I don't understand the outputs to be done! "+
        "Are there any? Maybe there's small part I'm choking on? Maybe "+
        "try adding steps in one at a time in an interactive context with "+
        "'--limit' set, to debug easier. Exiting...",file=sys.stderr)
    sys.exit(1)

if args.output_id is None:
    # No IDs specified, so every output uses the default of the input ID
    output_id_specs = ["input.id"] * len(args.output_seq)
else:
    output_id_specs = args.output_id

# zip stops at the end of the output seqs, so any surplus output IDs are
# left unused.
for id_spec, seq_spec in zip(output_id_specs, args.output_seq):
    outputs_array.append( [id_spec, seq_spec] )

# At verbosity 1 and up, describe the planned run on STDERR: the
# operations, then the filters, then the outputs.
if args.verbose >= 1:

    print("\n["+str(time.time())+"] : "+
        "I'm reading in something, applying these operations of "+
        "alignment:\n",file=sys.stderr)
    for input_name, compiled_regex in operations_array:
        print("  - from : "+input_name+"\n"+
            "    extract groups with regex : '"+str(compiled_regex)+"'",
            file=sys.stderr)

    print("\n["+str(time.time())+"] : ...and with these filters:\n",
        file=sys.stderr)
    # With no '--filter' given, argparse leaves this as None
    if args.filter is None:
        print("  ( no filters defined )",file=sys.stderr)
    else:
        for each_filter in args.filter:
            print("  - "+each_filter,file=sys.stderr)

    print("\n["+str(time.time())+"] : "+
        "Then I'm going to construct outputs that look like:\n",
        file=sys.stderr)
    for id_spec, seq_spec in outputs_array:
        print("  - With ID of : "+id_spec+"\n"+
            "    and the sequence is the group(s) : "+seq_spec,
            file=sys.stderr)

# If it's omitted, we believe that means no filter, and we make it an
# expression that is always true, because the filters are handed on to the
# reader as strings to be evaluated.
if args.filter is None:
    args.filter = ["True == True"]

# At verbosity 1 and up, say where the output (and any report) is going.
if args.verbose >= 1:
    print("\n["+str(time.time())+"] : Then, I'm going to write out a "+
        args.output_format+" format file to "+
        args.output+"",file=sys.stderr)
    if args.report is not None:
        print("\n["+str(time.time())+"] : and a report to '"+
            args.report+"'.",file=sys.stderr)

# I should implement the below ... but really?
#    # checking file existance for outputs, zipping together the 
#    # output base with each of the three. 
#    exit_flag = 0
#    for each in zip( [vars(args)["output-base"]]*20,            \
#                    ["_fail.fastq", "_pass.fastq",              \
#                        "_report.fastq", "_report.csv" ] ):
#        # At this stage, the tuple is joined to make the filename
#        this_path = ''.join(each)
#        # If the write-report flag is off and the path is the report,
#        # then this won't trip True for that path existing
#import os.path
#        if os.path.isfile(this_path) and              \
#                not( not(args.write_report) and       \
#                    (this_path.find("_report")>=0) ):
#            print("\n"+"["+str(time.time())+"]"+" : "+"File "+this_path+
#                " exits, so I'm quitting before you ask me to do "+
#                "something you might regret.")
#            exit_flag = 1
#    if exit_flag == 1:
#        exit(1)

# We begin
if args.verbose >= 1:
    print("\n["+str(time.time())+"] : BEGIN RUNNING",file=sys.stderr)

# Hand everything that was parsed and built above over to the reader. The
# output format name is passed along in uppercase.
itermae.reader(
    input_file=args.input, is_gzipped=args.gzipped,
    operations_array=operations_array, filters=args.filter,
    outputs_array=outputs_array,
    out_format=args.output_format.upper(),
    output_file=args.output, failed_file=args.failed,
    report_file=args.report,
    verbosity=args.verbose
    )

if args.verbose >= 1:
    print("\n"+"["+str(time.time())+"]"+" : "+
        "All worked 'till the work is done --- or some fatal error.",
        file=sys.stderr)

# sys.exit rather than the bare exit() helper, which is only installed by
# the site module for interactive use.
sys.exit(0)
