#!/bin/bash
# Stop at the first failing command, including failures inside a pipeline.
set -e -o pipefail

# ---- Defaults; most can be overridden by the command-line options ----

# Become true once the mandatory -r / -i options have been seen.
rflag=false
iflag=false

# Options that carry a value.
PREFIX=assm        # -p: prefix for every output file
THREADS=1          # -t: alignment threads
CHUNK=100000       # -c: chunk size, 0 disables chunking
MIN_INDEL_LEN=0    # -l: minimum indel length to list, 0 skips the search
BED=""             # -b: once given, holds "--bed <file>"
ALIGN_OPTS=""      # extra options handed to mini_align

# Plain on/off switches.
bed_flag=false             # -b
catalogue_flag=false       # -C
homopolymer_flag=false     # -H
trim_flag=false            # -T
exclude_indel_flag=false   # -e

# Help text, printed for -h and when a required option is missing.
# The defaults shown are expanded when this assignment runs, i.e. before
# any command-line option has been parsed.
usage="$(basename "$0") [-h] -r <reference> -i <fastq>

Calculate accuracy statistics for an assembly.

    -h  show this help text.
    -r  reference, should be a fasta file. If corresponding minimap2 indices
        do not exist they will be created. (required).
    -i  fastq/a input assembly (required).
    -c  chunk size. Input reads/contigs will be broken into chunks
        prior to alignment, 0 will not chunk (default ${CHUNK}).
    -C  catalogue errors.
    -H  count homopolymers.
    -t  alignment threads (default: ${THREADS}).
    -p  output file prefix (default: ${PREFIX}).
    -T  trim consensus to primary alignments of truth to assembly.
    -b  .bed file of reference regions to assess.
    -l  list all indels at least this long (default: ${MIN_INDEL_LEN}, set to 0 to skip searching for indels).
    -e  use with -l option to create a .bed file to exclude indels. If the -b option is used, the two bed files will be combined."



# Parse the command line. The leading ':' in the option string puts getopts
# in silent mode, so unknown options and missing arguments are reported by
# the '?' and ':' arms below rather than by getopts itself.
while getopts ':hr:i:p:c:CHTt:b:l:e' option; do
  case "$option" in
    h)
      echo "$usage" >&2
      exit
      ;;
    r)
      REFERENCE=$OPTARG
      rflag=true
      ;;
    i)
      INPUT=$OPTARG
      iflag=true
      ;;
    b)
      BED="--bed $OPTARG"
      bed_flag=true
      ;;
    p) PREFIX=$OPTARG ;;
    c) CHUNK=$OPTARG ;;
    t) THREADS=$OPTARG ;;
    l) MIN_INDEL_LEN="$OPTARG" ;;
    C) catalogue_flag=true ;;
    H) homopolymer_flag=true ;;
    T) trim_flag=true ;;
    e) exclude_indel_flag=true ;;
    \?)
      echo "Invalid option: -${OPTARG}." >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac

  # -C, -H, -T, -b and -e all additionally make mini_align run with -m.
  case "$option" in
    C | H | T | b | e) ALIGN_OPTS="-m" ;;
  esac
done
# Drop the parsed options, leaving only positional arguments.
shift "$((OPTIND - 1))"

# Both the input (-i) and the reference (-r) are mandatory; print the help
# text and bail out unless both were supplied.
if ! { $iflag && $rflag; }; then
    {
        echo "$usage"
        echo "-i and -r must be specified."
    } >&2
    exit 1
fi

# Forward the chunk size to mini_align only when chunking is enabled
# (a chunk size of 0 means "do not chunk").
if (( $CHUNK > 0 )); then
    ALIGN_OPTS+=" -c ${CHUNK}"
fi

# Optional trimming step (-T): align the reference to the assembly, write
# ref_seqs_from_bam's output for that alignment to a fasta file, and use that
# file as the input from here on.
#
# Paths are quoted so that references, inputs and prefixes containing
# whitespace or glob characters reach the tools as single arguments.
# ALIGN_OPTS is deliberately left unquoted: it holds several
# space-separated options that must be split into separate words.
if $trim_flag; then
    mini_align -i "$REFERENCE" -r "$INPUT" -p "${PREFIX}_trim" -t "$THREADS" $ALIGN_OPTS
    ref_seqs_from_bam "${PREFIX}_trim.bam" > "${PREFIX}_trim.fasta"
    INPUT="${PREFIX}_trim.fasta"
fi

# Main alignment of the (possibly trimmed) input against the reference.
mini_align -i "$INPUT" -r "$REFERENCE" -p "$PREFIX" -t "$THREADS" $ALIGN_OPTS

# Optional indel handling.
#   -l N (N > 0): list indels of at least N bases with find_indels.
#   -e          : additionally subtract those indels from the assessed
#                 regions and point BED at the resulting bed file.
# -e without a positive -l cannot do anything, so that case is reported
# instead of being silently ignored.
if (( MIN_INDEL_LEN > 0 )); then
    indel_base="${PREFIX}_indel_ge${MIN_INDEL_LEN}"
    echo "Writing list of indels ${MIN_INDEL_LEN} bases and longer to ${indel_base}.txt."
    # find indels and write them to a text list and a .bed file
    find_indels -m "${MIN_INDEL_LEN}" "${PREFIX}.bam" -o "${indel_base}.txt" -b "${indel_base}.bed"

    if $exclude_indel_flag; then
        if ! $bed_flag; then
            # no bed file was given: build one with a "<name> 0 <length>"
            # line per idxstats record, skipping records containing '*'
            BEDFILE="${PREFIX}_ref_contigs.bed"
            samtools idxstats "${PREFIX}.bam" | awk '{if(!/\*/){print $1"\t0\t"$2}}' > "${BEDFILE}"
            echo "Creating bed file to mask out indels ${MIN_INDEL_LEN} bases and longer."
        else
            echo "Subtracting indels ${MIN_INDEL_LEN} bases and longer from provided bed file."
            # BED holds "--bed <file>"; strip the option name to get the path
            BEDFILE=${BED#--bed }
        fi
        # subtract the indel intervals from the starting bed file (given or
        # generated) and use the result for all later steps
        bedtools subtract -a "${BEDFILE}" -b "${indel_base}.bed" > "${PREFIX}.bed"
        # NOTE: BED is expanded unquoted by later steps, so a prefix
        # containing whitespace is still not supported there.
        BED="--bed ${PREFIX}.bed"
    fi
elif $exclude_indel_flag; then
    echo "Warning: -e has no effect unless -l is set to a value greater than 0." >&2
fi


# Compute per-alignment statistics and summarise them.
# stats_from_bam is run single-threaded unless a bed file is in use, in which
# case it is given the same thread count as the alignment.
STATS_THREADS=1
if [[ -n "$BED" ]]; then
    STATS_THREADS=$THREADS
fi
# Prefix-derived paths are quoted so a prefix with whitespace stays a single
# argument. BED is deliberately unquoted: it is either empty or
# "--bed <file>" and must split into two words.
stats_from_bam "${PREFIX}.bam" -o "${PREFIX}_stats.txt" -t "${STATS_THREADS}" ${BED}

summary_from_stats -i "${PREFIX}_stats.txt" -pr -o "${PREFIX}_summ.txt"

# Optional error catalogue (-C), written to a directory named after the prefix.
if $catalogue_flag; then
    OUTDIR="${PREFIX}_error_catalogue"
    echo "Running catalogue_errors, saving data to ${OUTDIR}"
    # Paths are quoted so a prefix with whitespace stays a single argument.
    # BED is deliberately unquoted: it is either empty or "--bed <file>".
    catalogue_errors count "${PREFIX}.bam" -t "$THREADS" -o "${OUTDIR}" ${BED}
fi

# Echo the first "Percentage Errors" and "Q Scores" sections of the summary
# (the heading line plus the 7 lines that follow it) to stdout.
# Options are placed before the pattern so the call does not depend on grep
# permuting its arguments, and the path is quoted so a prefix with
# whitespace stays a single argument.
grep -A 7 -m 1 -- 'Percentage Errors' "${PREFIX}_summ.txt"
grep -A 7 -m 1 -- 'Q Scores' "${PREFIX}_summ.txt"

# Optional homopolymer analysis (-H), written to a directory named after the
# prefix.
if $homopolymer_flag; then
    OUTDIR="${PREFIX}_homopolymer"
    echo "Running homopolymer count, saving data to ${OUTDIR}"
    # BED is empty when no regions apply, so it can be passed unconditionally.
    # It is also set by the -e indel masking when -b was not given; passing it
    # here keeps this step restricted to the same regions as stats_from_bam
    # and catalogue_errors instead of dropping the mask in that case.
    # BED is deliberately unquoted: it is either empty or "--bed <file>".
    assess_homopolymers count "${PREFIX}.bam" -o "${OUTDIR}" ${BED}
fi


# Final report. The indel list is only written when -l was greater than 0,
# so it is only mentioned in that case.
if (( MIN_INDEL_LEN > 0 )); then
    echo "All done, output written to ${PREFIX}_stats.txt, ${PREFIX}_summ.txt and ${PREFIX}_indel_ge${MIN_INDEL_LEN}.txt"
else
    echo "All done, output written to ${PREFIX}_stats.txt and ${PREFIX}_summ.txt"
fi
