#!/bin/bash
# Stop at the first failing command; a pipeline fails if any stage fails.
set -eo pipefail

# Defaults for values the user can override on the command line.
PREFIX='assm'              # -p: prefix for every output file
THREADS=1                  # -t: alignment threads
CHUNK=100000               # -c: chunk size, 0 disables chunking
MIN_INDEL_LEN=0            # -l: 0 skips the indel search
ACCUMULATE_OPTS='10,100'   # -a: comma-separated accumulation values

# Option strings handed on to the helper programs.
ALIGN_OPTS=''              # extra options for mini_align
STATS_OPTS=''              # extra options for stats_from_bam
SUMM_OPTS='-pr'            # options for summary_from_stats
BED=''                     # empty, or "--bed <file>" once a bed file applies

# Switches recording which command-line options were seen.
rflag=false                # -r given (reference)
iflag=false                # -i given (input assembly)
bed_flag=false             # -b given
catalogue_flag=false       # -C given
homopolymer_flag=false     # -H given
exclude_indel_flag=false   # -e given

# Help text, printed for -h and when a required option is missing.
# The defaults shown are interpolated from the variables set above, so
# this assignment has to come after them. The command substitution strips
# the trailing newline that the here-document adds.
usage=$(cat <<EOF
$(basename "$0") [-h] -r <reference> -i <fastq>

Calculate accuracy statistics for an assembly.

    -h  show this help text.
    -r  reference, should be a fasta file. If correspondng minimap2 indices
        do not exist they will be created. (required).
    -i  fastq/a input assembly (required).
    -d  set the minimap2 preset, e.g. map-ont, asm5, asm10, asm20 [default: map-ont].
    -c  chunk size. Input reads/contigs will be broken into chunks
        prior to alignment, 0 will not chunk (default ${CHUNK}).
    -C  catalogue errors.
    -H  count homopolymers.
    -t  alignment threads (default: ${THREADS}).
    -p  output file prefix (default: ${PREFIX}).
    -b  .bed file of reference regions to assess.
    -l  list all indels at least this long (default: ${MIN_INDEL_LEN}, set to 0 to skip searching for indels).
    -e  use with -l option to create a .bed file to exclude indels. If the -b option is used, the two bed files will be combined.
    -y  include supplementary alignments.
    -a  accumulate the stats over a number of chunks, can include multiple values separated by comma,
        one summary file will be generated for each value [default: ${ACCUMULATE_OPTS}].
EOF
)



#######################################
# Parse the command-line options into the script's global settings.
# Globals (written): rflag, REFERENCE, iflag, INPUT, ALIGN_OPTS, PREFIX,
#                    CHUNK, catalogue_flag, homopolymer_flag, THREADS,
#                    bed_flag, BED, MIN_INDEL_LEN, exclude_indel_flag,
#                    STATS_OPTS, ACCUMULATE_OPTS, OPTIND
# Globals (read):    usage
# Arguments: the script's command-line arguments ("$@")
# Exits:     0 after printing the help for -h; 1 on an unknown option or
#            a missing option argument.
#######################################
parse_args() {
  local option
  # Restart getopts so the function can be called more than once.
  OPTIND=1
  # -e is a plain switch (see the help text), so it must NOT be followed
  # by ':' in the option string; otherwise it swallows the next argument.
  while getopts ':hr:i:d:p:c:CHt:b:l:eya:' option; do
    case "$option" in
      h  ) echo "$usage" >&2; exit;;
      r  ) rflag=true; REFERENCE=$OPTARG;;
      i  ) iflag=true; INPUT=$OPTARG;;
      d  ) ALIGN_OPTS="${ALIGN_OPTS} -d ${OPTARG}";;
      p  ) PREFIX=$OPTARG;;
      c  ) CHUNK=$OPTARG;;
      C  ) catalogue_flag=true;;
      H  ) homopolymer_flag=true;;
      t  ) THREADS=$OPTARG;;
      b  ) bed_flag=true; BED="--bed $OPTARG";;
      l  ) MIN_INDEL_LEN="$OPTARG";;
      e  ) exclude_indel_flag=true;;
      # Supplementary alignments are requested from both the aligner
      # wrapper (-y) and stats_from_bam (-a).
      y  ) ALIGN_OPTS="${ALIGN_OPTS} -y"; STATS_OPTS="-a";;
      a  ) ACCUMULATE_OPTS="${OPTARG}";;
      \? ) echo "Invalid option: -${OPTARG}." >&2; exit 1;;
      :  ) echo "Option -$OPTARG requires an argument." >&2; exit 1;;
    esac
  done
}

parse_args "$@"
# Drop the parsed options, leaving any remaining positional arguments.
shift $((OPTIND - 1))

# Both -i and -r are mandatory: unless the two flags were both set while
# parsing the options, show the help plus a short reason and give up.
if ! { $iflag && $rflag; }; then
  {
    echo "$usage"
    echo "-i and -r must be specified."
  } >&2
  exit 1
fi

# With -H, work out the homopolymer output directory up front and stop
# before any work is done if a directory of that name is already present.
if $homopolymer_flag; then
    HP_OUTDIR="${PREFIX}_homopolymer"
    [[ ! -d ${HP_OUTDIR} ]] || {
        echo "Directory ${HP_OUTDIR} already exists, delete or rename it." >&2
        exit 1
    }
fi

echo "Assessing $INPUT against $REFERENCE."  >&2

# Any analysis beyond the plain stats (error catalogue, homopolymers, bed
# restriction, indel masking) adds -m to the aligner options.
# NOTE(review): -m presumably makes mini_align fill the MD tag - confirm
# against mini_align's own help.
if $catalogue_flag || $homopolymer_flag || $bed_flag || $exclude_indel_flag; then
    ALIGN_OPTS="$ALIGN_OPTS -m"
fi

# A chunk size of 0 means "do not chunk", so -c is only passed when positive.
if (($CHUNK > 0)); then
    ALIGN_OPTS="$ALIGN_OPTS -c $CHUNK"
fi

# Paths are quoted so that spaces or glob characters in them survive.
# ALIGN_OPTS is a whitespace-separated list of options and is therefore
# left unquoted on purpose so that it splits into separate arguments.
# shellcheck disable=SC2086
mini_align -i "$INPUT" -r "$REFERENCE" -p "$PREFIX" -t "$THREADS" $ALIGN_OPTS

# Optional indel handling, only active for a positive -l value:
#   1. list every indel of at least MIN_INDEL_LEN bases (.txt and .bed);
#   2. with -e, subtract those intervals from the regions to assess and
#      point BED at the resulting ${PREFIX}.bed for the later steps.
if (($MIN_INDEL_LEN > 0)); then
    indel_base="${PREFIX}_indel_ge${MIN_INDEL_LEN}"
    echo "Writing list of indels ${MIN_INDEL_LEN} bases and longer to ${indel_base}.txt."  >&2
    # find indels and write to a .bed file
    find_indels -m "${MIN_INDEL_LEN}" "${PREFIX}.bam" -o "${indel_base}.txt" -b "${indel_base}.bed"

    if $exclude_indel_flag; then
        if ! $bed_flag; then
            # if no bedfile has been given, create one spanning the genome:
            # one "<name> 0 <length>" line per idxstats record, skipping
            # the record whose line contains '*'.
            BEDFILE="${PREFIX}_ref_contigs.bed"
            samtools idxstats "${PREFIX}.bam" | awk '{if(!/\*/){print $1"\t0\t"$2}}' > "${BEDFILE}"
            echo "Creating bed file to mask out indels ${MIN_INDEL_LEN} bases and longer."  >&2
        else
            echo "Subtracting indels ${MIN_INDEL_LEN} bases and longer from provided bed file."  >&2
            # BED was stored as "--bed <file>"; strip that prefix to get the path
            BEDFILE="${BED#--bed }"
        fi
        # The output redirection below truncates ${PREFIX}.bed before
        # bedtools reads its inputs, so the input must be a different file.
        if [[ "${BEDFILE}" -ef "${PREFIX}.bed" ]]; then
            echo "Bed file ${BEDFILE} would be overwritten by ${PREFIX}.bed, use a different -p or -b." >&2
            exit 1
        fi
        # subtract the indel intervals from the chosen bed file
        bedtools subtract -a "${BEDFILE}" -b "${indel_base}.bed" > "${PREFIX}.bed"
        BED="--bed ${PREFIX}.bed"
    fi
fi


# Compute per-alignment statistics. The user's thread count is only
# passed on when a bed file is in effect; otherwise a single thread is used.
STATS_THREADS=1
if [[ -n "$BED" ]]; then
    STATS_THREADS=$THREADS
fi
# File names are quoted so unusual prefixes survive. STATS_OPTS and BED
# each hold zero or more whitespace-separated arguments and are left
# unquoted on purpose so that they split (or vanish when empty).
# shellcheck disable=SC2086
stats_from_bam ${STATS_OPTS} "${PREFIX}.bam" -o "${PREFIX}_stats.txt" -t "${STATS_THREADS}" ${BED}

#######################################
# Print the headline tables of a summary file to stdout: the first
# 'Percentage Errors' line and the first 'Q Scores' line, each followed
# by up to seven lines of trailing context.
# Arguments: $1 - path to a file written by summary_from_stats
# Returns:   status of the last grep (non-zero if 'Q Scores' is absent)
#######################################
print_main_summary() {
    grep -A 7 -m 1 'Percentage Errors' "$1"
    grep -A 7 -m 1 'Q Scores' "$1"
}

# Generate summary without accumulation
output_file="${PREFIX}_summ.txt"
# SUMM_OPTS is a whitespace-separated option list, left unquoted on purpose.
# shellcheck disable=SC2086
summary_from_stats -i "${PREFIX}_stats.txt" -o "${output_file}" ${SUMM_OPTS}
# Print the main summary to stdout
print_main_summary "${output_file}"

# Generating summary files for each accumulation parameter.
# Commas become spaces and the unquoted expansion is split into the
# individual accumulation values.
for n in ${ACCUMULATE_OPTS//,/ }; do
    echo "======================> Accumulating over $n chunks"
    output_file="${PREFIX}_summ_accm${n}.txt"
    # shellcheck disable=SC2086
    summary_from_stats -i "${PREFIX}_stats.txt" -o "${output_file}" -a "$n" ${SUMM_OPTS}
    # Print the main summary to stdout
    print_main_summary "${output_file}"
done

# With -C, run the error catalogue on the alignments, restricted to the
# bed regions when BED is set.
if $catalogue_flag; then
    CAT_OUTDIR="${PREFIX}_error_catalogue"
    echo "Running catalogue_errors, saving data to ${CAT_OUTDIR}" >&2
    # Paths are quoted; BED is empty or "--bed <file>" and is left
    # unquoted on purpose so it splits into two arguments (or vanishes).
    # shellcheck disable=SC2086
    catalogue_errors count "${PREFIX}.bam" -t "$THREADS" -o "${CAT_OUTDIR}" ${BED}
fi

# With -H, count homopolymers in the alignments, writing to the directory
# chosen during the earlier pre-check (HP_OUTDIR).
if $homopolymer_flag; then
    echo "Running homopolymer count, saving data to ${HP_OUTDIR}" >&2
    # BED is passed whenever it is set, exactly as for stats_from_bam and
    # catalogue_errors. Keying this on bed_flag instead would ignore the
    # indel mask that -e builds when no -b file was supplied. BED is empty
    # or "--bed <file>" and is left unquoted on purpose so it splits into
    # two arguments (or vanishes).
    # shellcheck disable=SC2086
    assess_homopolymers count "${PREFIX}.bam" -t "$THREADS" -o "${HP_OUTDIR}" ${BED}
fi


# Closing message. The indel list only exists when the indel search ran
# (positive -l), so it is only mentioned in that case.
if (($MIN_INDEL_LEN > 0)); then
    echo "All done, output written to ${PREFIX}_stats.txt, ${PREFIX}_summ.txt and ${PREFIX}_indel_ge${MIN_INDEL_LEN}.txt"  >&2
else
    echo "All done, output written to ${PREFIX}_stats.txt and ${PREFIX}_summ.txt"  >&2
fi
