#!/bin/bash
# Assemble long reads and polish the result with racon.
#
# Pipeline, in order: copy the input reads into a fresh output folder as
# gzipped FASTA (seqkit), optionally trim adapters (porechop), optionally
# error-correct the longest reads (minimap2 + racon), build a draft with
# minimap2 + miniasm (or take the -r reference as the draft), then run racon
# polishing rounds, optionally over several read shuffles that are combined
# at the end. The result is written uncompressed to <output>/<prefix>_final.fa.

# Abort on the first failing command or failing pipeline stage.
set -eo pipefail

# Help text shown for -h and whenever a required option is missing.
usage="$(basename "$0") [-h] -i <fastq>

Assemble fastq/fasta formatted reads and perform POA consensus.

    -h  show this help text.
    -i  fastx input reads (required).
    -r  reference fasta for reference-guided consensus (instead of de novo assembly)
    -o  output folder (default: assm).
    -p  output file prefix (default: reads).
    -t  number of minimap and racon threads (default: 1).
    -m  number of racon rounds (default: 4).
    -n  number of racon shuffles (default: 1. If not 1, should be >10).
    -w  racon window length (default: 500).
    -k  keep intermediate files (default: delete).
    -K  minimap's -K option (default: 500M).
    -c  trim adapters from reads prior to everything else.
    -e  error correct longest e% of reads prior to assembly, or an estimated coverage (e.g. 2x).
        For an estimated coverage, the -l option must be a fastx or a length (e.g. 4.8mb).
    -l  Reference length, either as a number (e.g. 4.8mb) or fastx from which length will be calculated.
    -x  log all commands before running."

# Default settings; each one may be overridden by the matching option.
OUTPUT="assm"
NAME="reads"
THREADS=1
ROUNDS=4
SHUFFLES=1
MIN_SHUFFLES=10
WINDOW_LEN=500
MINIMAP_K="500M"
CHOP=false
# Switches hold the command names true/false so they can be used directly
# as conditions (e.g. `if $kflag; then`).
iflag=false
rflag=false
eflag=false
lflag=false
xflag=false
kflag=false

#######################################
# Parse the command line into the global settings above.
# Globals:   usage (read); INPUT REF OUTPUT NAME THREADS ROUNDS SHUFFLES
#            WINDOW_LEN MINIMAP_K CHOP ERRCORR REFLEN and the *flag
#            switches (written); OPTIND is advanced past the options.
# Arguments: the command-line arguments to parse.
# Outputs:   help text and error messages on stderr.
# Returns:   exits 0 after -h; exits 1 on an unknown option, a missing
#            option argument, or a -r file that cannot be found.
#######################################
parse_args() {
  local option ref_dir
  while getopts ':hi:q:r:o:p:t:m:n:w:K:kce:l:x' option; do
    case "$option" in
      h  ) echo "$usage" >&2; exit;;
      i  ) iflag=true; INPUT=$OPTARG;;
      # -q takes a value but nothing reads it; it is still accepted so that
      # existing invocations passing it keep working.
      # NOTE(review): presumably a retired option - confirm before removing.
      q  ) ;;
      r  )
        # Fail now rather than after the (slow) read preparation steps.
        if [[ ! -f "$OPTARG" ]]; then
          echo "Reference file not found: ${OPTARG}" >&2; exit 1
        fi
        # The script later changes into the output folder, so keep an
        # absolute path. `&&` (not `;`) stops a failed cd from silently
        # yielding the current directory; CDPATH is cleared so cd neither
        # jumps elsewhere nor prints into the substitution.
        ref_dir=$(CDPATH= cd -- "$(dirname -- "$OPTARG")" && pwd) || {
          echo "Cannot resolve directory of reference: ${OPTARG}" >&2; exit 1
        }
        rflag=true
        REF="${ref_dir}/$(basename -- "$OPTARG")"
        ;;
      o  ) OUTPUT=$OPTARG;;
      p  ) NAME=$OPTARG;;
      t  ) THREADS=$OPTARG;;
      m  ) ROUNDS=$OPTARG;;
      n  ) SHUFFLES=$OPTARG;;
      w  ) WINDOW_LEN=$OPTARG;;
      K  ) MINIMAP_K=$OPTARG;;
      k  ) kflag=true;;
      c  ) CHOP=true;;
      e  ) eflag=true; ERRCORR=$OPTARG;;
      l  ) lflag=true; REFLEN=$OPTARG;;
      x  ) xflag=true;;
      \? ) echo "Invalid option: -${OPTARG}." >&2; exit 1;;
      :  ) echo "Option -$OPTARG requires an argument." >&2; exit 1;;
    esac
  done
}

parse_args "$@"
# Drop the parsed options from the script's own positional parameters.
shift $((OPTIND - 1))

# -i is the only mandatory option.
if ! $iflag; then
  echo "$usage" >&2;
  echo "-i must be specified." >&2;
  exit 1;
fi

# Check the numeric options before any work is done or anything is created on
# disk; a non-numeric value would otherwise only surface as a failing test or
# tool invocation further down.
for numeric_opt in "t:${THREADS}" "m:${ROUNDS}" "n:${SHUFFLES}" "w:${WINDOW_LEN}"; do
  if [[ ! ${numeric_opt#*:} =~ ^[0-9]+$ ]]; then
    echo "Error: -${numeric_opt%%:*} must be a non-negative integer (got '${numeric_opt#*:}')." >&2; exit 1;
  fi
done

# Shuffles must be exactly 1 or at least MIN_SHUFFLES. 10# forces base 10 so a
# zero-padded value is not read as octal. Zero is rejected as well: with no
# shuffle the polishing loop would never run.
if (( 10#${SHUFFLES} < 1 || (10#${SHUFFLES} > 1 && 10#${SHUFFLES} < 10#${MIN_SHUFFLES}) )); then
    echo "Error: the number of shuffles n should be 1 or >=10, exiting." >&2; exit 1;
fi

# Refuse to reuse an existing output location. This is checked before the
# coverage estimate below so that a doomed run fails immediately.
if [[ -e ${OUTPUT} ]]; then
  echo "Output ${OUTPUT} already exists." >&2; exit 1
fi

# -e given as a coverage (e.g. "2x"): convert it into the percentage of
# longest reads that yields that coverage.
if $eflag && [[ $ERRCORR = *x ]]; then
  if ! $lflag; then
    echo "$usage" >&2;
    echo "If -e is a coverage, -l must be specified." >&2;
    exit 1;
  fi
  ERRCOV=${ERRCORR%x}
  # The percentage is taken from the last field of the 'reads required' line
  # printed by coverage_from_fastx. If that line is missing, grep fails the
  # pipeline (pipefail); report it instead of exiting without a message.
  ERRCORR=$(coverage_from_fastx "${INPUT}" "${REFLEN}" --coverage "${ERRCOV}" --longest | grep 'reads required' | awk '{print $NF}') || {
    echo "Failed to estimate the percentage of reads needed for ${ERRCOV}x coverage." >&2; exit 1;
  }
  echo "Will error-correct the longest ${ERRCORR} % of reads (${ERRCOV}x)"
fi


if $xflag; then
  set -x;
fi

# All validation passed: only now create the workspace, so that a rejected
# invocation does not leave an empty folder that blocks the corrected rerun.
mkdir -p "${OUTPUT}"

# Stage the input inside the workspace as gzipped FASTA. INPUT, OUTPUT and
# NAME come straight from the command line, so every expansion is quoted to
# survive whitespace or glob characters in paths.
FASTX="${OUTPUT}/${NAME}.fa.gz"
echo "Copying FASTX input to workspace: ${INPUT} > ${FASTX}"
seqkit fq2fa "${INPUT}" | gzip -1 > "${FASTX}"  # this works fine for fasta input

# From here on all file names are relative to the output folder.
cd "${OUTPUT}"
READS=$(basename -- "${FASTX}")

if $CHOP; then
  echo "Trimming reads..."
  TRIMMED="${NAME}_trimmed.fa.gz"
  porechop -i "${READS}" -o "${TRIMMED}" --format fasta.gz --threads "${THREADS}" --check_reads 1000 --discard_middle
  if ! $kflag; then
    # Deleted in the background; the script waits for all such jobs at the end.
    rm -- "${READS}" &
  fi
  READS=${TRIMMED}
else
  echo "Skipped adapter trimming."
fi

# Note, these could/should be made dependent on read correction
# They are option *strings*: wherever they are used they are expanded
# unquoted on purpose so that they split into separate arguments.
BASERACONOPTS="-m 8 -x -6 -g -8"
OVLOPTS="-x ava-ont -K ${MINIMAP_K}"
ASMOPTS="-s 100 -e 3"
MOCKQ=10  # NOTE(review): not referenced anywhere else in the visible script

if $eflag; then
  echo "Extracting longest ${ERRCORR}% of reads..."
  # long_fastx doesn't support gz files, so the reads are decompressed for the
  # split and recompressed straight afterwards. Only the trailing ".gz" is
  # stripped; a ".gz" elsewhere in the prefix must stay in the name, otherwise
  # it would not match the file gunzip actually produces.
  PLAIN_READS=${READS%.gz}
  LONG_READS="long_${PLAIN_READS}"
  OTHERS="other_${PLAIN_READS}"
  gunzip "${READS}"
  long_fastx "${PLAIN_READS}" "${LONG_READS}" --longest "${ERRCORR}" --others "${OTHERS}"
  gzip -1 "${PLAIN_READS}"

  echo "Correcting long reads..."
  LONG_PAF=long_corr.paf.gz
  CORR_READS="${NAME}_corrected.fa.gz"
  # Map all reads onto the long reads, let racon correct the long reads, then
  # append the remaining (uncorrected) reads to form the new read set.
  minimap2 ${OVLOPTS} -t "${THREADS}" "${LONG_READS}" "${READS}" | gzip -1 > "${LONG_PAF}"
  racon ${BASERACONOPTS} -w "${WINDOW_LEN}" -t "${THREADS}" -f -q -1 "${READS}" "${LONG_PAF}" "${LONG_READS}" | \
      cat - "${OTHERS}" | gzip -1 > "${CORR_READS}"

  if ! $kflag; then
    # Deleted in the background; the script waits for all such jobs at the end.
    rm -- "${LONG_READS}" "${OTHERS}" "${LONG_PAF}" "${READS}" &
  fi
  READS=${CORR_READS}
else
  echo "Skipped pre-assembly correction." 
fi

# Choose the draft sequence to polish: a de novo assembly of the reads, or the
# reference supplied with -r.
if ! $rflag; then
  echo "Overlapping reads..."
  READSPAF="${NAME}.paf.gz"
  # OVLOPTS / ASMOPTS are option strings and are left unquoted on purpose so
  # that they split into words; file names are quoted so that a prefix with
  # whitespace or glob characters is passed through intact.
  minimap2 ${OVLOPTS} -t "${THREADS}" "${READS}" "${READS}" | gzip -1 > "${READSPAF}"
  echo "Assembling graph..."
  DRAFT="${NAME}.gfa.fa.gz"
  # Turn every line starting with "S" in the miniasm output into a FASTA
  # record: field 2 becomes the header, field 3 the sequence.
  miniasm ${ASMOPTS} -f "${READS}" "${READSPAF}" | awk '/^S/{print ">"$2"\n"$3}' | gzip -1 > "${DRAFT}"
  if ! $kflag; then
    # Deleted in the background; the script waits for all such jobs at the end.
    rm -- "${READSPAF}" &
  fi
else
  echo "Using supplied reference to perform reference-guided consensus."
  echo "Warning: Parts of this reference might not be polished by racon and such parts will be present in the final consensus unmodified."
  DRAFT=${REF}
fi

#######################################
# Print the smallest prime strictly greater than the given number.
# The search starts at input+1, so a prime input yields the *next* prime
# rather than itself; repeated calls therefore always produce a new value.
# Primality is checked by trial division in shell arithmetic, so no external
# tool is needed.
# Arguments: $1 - non-negative decimal integer (leading zeros are allowed)
# Outputs:   the prime, on stdout
# Returns:   0
#######################################
find_larger_prime(){
  local candidate divisor is_prime
  # 10# forces base 10 so a zero-padded value is not read as octal.
  candidate=$(( 10#$1 + 1 ))
  while true; do
    if (( candidate < 2 )); then
      is_prime=false
    else
      is_prime=true
      # A composite number has a divisor no larger than its square root.
      for (( divisor = 2; divisor * divisor <= candidate; divisor++ )); do
        if (( candidate % divisor == 0 )); then
          is_prime=false
          break
        fi
      done
    fi
    if $is_prime; then
      echo "${candidate}"
      return 0
    fi
    candidate=$(( candidate + 1 ))
  done
}


# Polish the draft: for every shuffle run ROUNDS rounds of minimap2 + racon,
# each round polishing the output of the previous one.
MOD_WINDOW_LEN=${WINDOW_LEN}

# Start from a known-empty value so that nothing leaks in from the caller's
# environment.
ADDRACONOPTS=""
if (( 10#${SHUFFLES} != 1 )); then
  # don't allow racon to trim as this can interfere with our mischievous
  #   trimming tricks below
  ADDRACONOPTS="--include-unpolished"
fi

# One random base per run; adding the shuffle index gives every shuffle its own
# seed. (Taking only the first two digits of $RANDOM allows at most 100 seeds,
# so with ten or more shuffles two of them could easily end up identical.)
SEED_BASE=${RANDOM}

# seq -w zero-pads the counters; they are used verbatim in file names, and
# forced to base 10 (10#) wherever they are used as numbers.
for SHUF in $(seq -w 1 "${SHUFFLES}"); do
  echo "Running racon read shuffle ${SHUF}..."

  if (( 10#${SHUF} != 1 )); then
    echo "Shuffling reads..."
    # use a prime window length for the shuffled runs
    MOD_WINDOW_LEN=$(find_larger_prime "${MOD_WINDOW_LEN}")  # might explode at larger lengths...
    SHUFREADS="shuffled_${SHUF}_${READS}"
    # set an explicit -s (random seed) for each shuffle iteration
    # (else seed adopts default value of 23, and we do n identical shuffles)
    seqkit shuffle -s "$(( SEED_BASE + 10#${SHUF} ))" "${READS}" | gzip -1 > "${SHUFREADS}"
    # as well as shuffling reads we shift the backbone to force
    #   racon to use different boundaries
    SCAFFOLD="shuffled_${SHUF}_backbone.fa.gz"
    PADLEN=$(( (10#${SHUF} - 1) * (10#${WINDOW_LEN} / 10#${SHUFFLES}) ))
    echo "Trimming backbones by ${PADLEN}."
    # remove the first PADLEN characters of every sequence
    seqkit replace -p "^.{${PADLEN}}" -r "" -s "${DRAFT}" | gzip -1 > "${SCAFFOLD}"
  else
    # the first shuffle uses the reads and the draft as they are
    SHUFREADS=${READS}
    SCAFFOLD=${DRAFT}
    if (( 10#${SHUFFLES} != 1 )); then
      # give the first shuffle the same file naming as the others
      ln -s "${READS}" "shuffled_${SHUF}_${READS}"
      ln -s "${SCAFFOLD}" "shuffled_${SHUF}_backbone.fa.gz"
    fi
  fi

  for ROUND in $(seq -w 1 "${ROUNDS}"); do
    echo "Running round ${ROUND} consensus..."
    READS2TIGS="reads2contigs_${SHUF}_${ROUND}.paf.gz"
    NEWSCAF="racon_${SHUF}_${ROUND}.fa.gz"

    minimap2 -K "${MINIMAP_K}" -t "${THREADS}" "${SCAFFOLD}" "${SHUFREADS}" | gzip -1 > "${READS2TIGS}"
    # the two option strings are expanded unquoted on purpose (word splitting)
    racon ${BASERACONOPTS} ${ADDRACONOPTS} -w "${MOD_WINDOW_LEN}" -t "${THREADS}" -q -1 "${SHUFREADS}" "${READS2TIGS}" "${SCAFFOLD}" | gzip -1 > "${NEWSCAF}"

    if ! $kflag; then
      # deleted in the background; the script waits for all such jobs at the end
      rm -- "${READS2TIGS}" &
    fi
    # the next round polishes this round's output
    SCAFFOLD=${NEWSCAF}
  done

done
# The working read set is no longer needed once all polishing rounds are done.
if ! $kflag; then
  rm -- "${READS}" &
fi

# we elect to write uncompressed output
FINAL="${NAME}_final.fa"
if (( 10#${SHUFFLES} == 1 )); then
  gunzip -c "${SCAFFOLD}" > "${FINAL}"
else
  # One last compilation step
  echo "Combining consensus shuffles."
  COMBINED=racon_combined_shuffles.fa.gz
  for SHUF in $(seq -w 1 "${SHUFFLES}"); do
      # Tag every sequence name with its shuffle so that names stay unique in
      # the combined file.
      gunzip -c "racon_${SHUF}_${ROUNDS}.fa.gz" | sed "s/\(>[^[:space:]]\+\)/\1_shuffle_${SHUF}/" | gzip -1 >> "${COMBINED}"
  done

  # Treat the per-shuffle consensus sequences as "reads" and polish the last
  # scaffold with them.
  READS2TIGS=combined2contigs.paf.gz
  minimap2 -K "${MINIMAP_K}" -t "${THREADS}" "${SCAFFOLD}" "${COMBINED}" | gzip -1 > "${READS2TIGS}"
  # use a slightly longer window here for good measure
  # BASERACONOPTS carries the scoring options used by every other racon call
  # in this script (RACONOPTS, used here before, is never defined anywhere, so
  # this call silently ran without them). It is expanded unquoted on purpose
  # so that it splits into words.
  racon ${BASERACONOPTS} -w "$(( 3 * 10#${WINDOW_LEN} / 2 ))" -t "${THREADS}" -q -1 "${COMBINED}" "${READS2TIGS}" "${SCAFFOLD}" > "${FINAL}"
  if ! $kflag; then
    rm -- "${READS2TIGS}" &
  fi
fi

if ! $kflag; then
  #TODO: move these earlier?
  # The patterns are broad, so the final assembly is skipped explicitly: a
  # prefix containing "paf" or "gfa", or starting with "shuffled", would
  # otherwise match and the result would be deleted.
  (
    for leftover in racon_*_*.fa.gz shuffled* *paf* *gfa*; do
      if [[ "${leftover}" != "${FINAL}" ]]; then
        rm -rf -- "${leftover}"
      fi
    done
  ) &
fi
echo "Waiting for cleanup."
# Block until every background deletion started above has finished.
wait
echo "Final assembly written to ${OUTPUT}/${FINAL}. Have a nice day."
