################################################################################
# Copyright (C) 2014, 2022 GenAP, McGill University and Genome Quebec Innovation Centre
#
# This file is part of MUGQIC Pipelines.
#
# MUGQIC Pipelines is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MUGQIC Pipelines is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MUGQIC Pipelines.  If not, see <http://www.gnu.org/licenses/>.
################################################################################

# Python Standard Modules
import logging
import os
import re

# MUGQIC Modules
from ...bfx import ballgown
from ...bfx import bedtools
from ...bfx import bvatools
from ...bfx import bwa
from ...bfx import cufflinks
from ...bfx import differential_expression
from ...bfx import gq_seq_utils
from ...bfx import htseq
from ...bfx import metrics
from ...bfx import picard
from ...bfx import rmarkdown
from ...bfx import samtools
from ...bfx import star
from ...bfx import stringtie
from ...bfx import tools
from ...bfx import ucsc
from ...core.config import global_conf, _raise, SanitycheckError
from ...core.job import Job, concat_jobs, pipe_jobs
from .. import common
from ... import utils

log = logging.getLogger(__name__)

class RnaSeqRaw(common.Illumina):
    """
    RNA-Seq Pipeline
    ================

    The standard MUGQIC RNA-Seq pipeline is based on the use of the [STAR aligner](https://code.google.com/p/rna-star/)
    to align reads to the reference genome. These alignments are used during
    downstream analysis to determine genes and transcripts differential expression. The
    [Cufflinks](http://cufflinks.cbcb.umd.edu/) suite is used for the transcript analysis whereas
    [DESeq](http://bioconductor.org/packages/release/bioc/html/DESeq.html) and
    [edgeR](http://bioconductor.org/packages/release/bioc/html/edgeR.html) are used for the gene analysis.

    The RNA-Seq pipeline requires a design file, which is used to define the group comparisons
    in the differential analyses. The design file format is described
    [here](https://bitbucket.org/mugqic/mugqic_pipelines/src#markdown-header-design-file).

    The differential gene analysis is followed by a Gene Ontology (GO) enrichment analysis.
    This analysis uses the [goseq approach](http://bioconductor.org/packages/release/bioc/html/goseq.html).
    The goseq approach is based on the use of non-native GO terms (see details in section 5 of
    [the corresponding vignette](http://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf)).

    Finally, a summary html report is automatically generated by the pipeline at the end of the analysis.
    This report contains description
    of the sequencing experiment as well as a detailed presentation of the pipeline steps and results.
    Various Quality Control (QC) summary statistics are included in the report and additional QC analysis
    is accessible for download directly through the report. The report includes also the main references
    of the software tools and methods used during the analysis, together with the full list of parameters
    that have been passed to the pipeline main script.

    An example of the RNA-Seq report for an analysis on Public Coriell CEPH B-cell is available for illustration
    purposes only: [RNA-Seq report](http://gqinnovationcenter.com/services/bioinformatics/tools/rnaReport/index.html).

    [Here](https://bitbucket.org/mugqic/mugqic_pipelines/downloads/MUGQIC_Bioinfo_RNA-Seq.pptx) is more
    information about the RNA-Seq pipeline that you may find interesting.
    """

    def star(self):
        """
        The filtered reads are aligned to a reference genome. The alignment is done per readset of sequencing
        using the [STAR](https://code.google.com/p/rna-star/) software. It generates a Binary Alignment Map file (.bam).

        The step is organised in three stages: a first-pass alignment per readset, a single project-wide job
        that concatenates the per-readset `SJ.out.tab` junction files and builds a genome index from them,
        and a second-pass alignment per readset against that index. A final job renders the report section.

        This step takes as input files:

        1. Trimmed FASTQ files if available
        2. Else, FASTQ files from the readset file if available
        3. Else, FASTQ output files from previous picard_sam_to_fastq conversion of BAM files
        """

        jobs = []
        project_index_directory = "reference.Merged"
        project_junction_file = os.path.join("alignment_1stPass", "AllSamples.SJ.out.tab")
        individual_junction_list = []

        # Read-group settings come from the configuration and do not depend on the readset,
        # so they are looked up once for both alignment passes.
        rg_platform = global_conf.get('star_align', 'platform', required=False) or ""
        rg_center = global_conf.get('star_align', 'sequencing_center', required=False) or ""

        ######
        # Pass 1 - alignment
        for readset in self.readsets:
            trim_file_prefix = os.path.join("trim", readset.sample.name, readset.name + ".trim.")
            alignment_1stPass_directory = os.path.join("alignment_1stPass", readset.sample.name, readset.name)
            individual_junction_list.append(os.path.join(alignment_1stPass_directory, "SJ.out.tab"))

            if readset.run_type == "PAIRED_END":
                candidate_input_files = [[trim_file_prefix + "pair1.fastq.gz", trim_file_prefix + "pair2.fastq.gz"]]
                if readset.fastq1 and readset.fastq2:
                    candidate_input_files.append([readset.fastq1, readset.fastq2])
                if readset.bam:
                    # Raw strings keep the regex escape "\." valid on every Python version
                    candidate_input_files.append([re.sub(r"\.bam$", ".pair1.fastq.gz", readset.bam), re.sub(r"\.bam$", ".pair2.fastq.gz", readset.bam)])
                [fastq1, fastq2] = self.select_input_files(candidate_input_files)
            elif readset.run_type == "SINGLE_END":
                candidate_input_files = [[trim_file_prefix + "single.fastq.gz"]]
                if readset.fastq1:
                    candidate_input_files.append([readset.fastq1])
                if readset.bam:
                    candidate_input_files.append([re.sub(r"\.bam$", ".single.fastq.gz", readset.bam)])
                [fastq1] = self.select_input_files(candidate_input_files)
                fastq2 = None
            else:
                _raise(SanitycheckError("Error: run type \"" + readset.run_type +
                "\" is invalid for readset \"" + readset.name + "\" (should be PAIRED_END or SINGLE_END)!"))

            job = star.align(
                reads1=fastq1,
                reads2=fastq2,
                output_directory=alignment_1stPass_directory,
                genome_index_folder=None,
                rg_id=readset.name,
                rg_sample=readset.sample.name,
                rg_library=readset.library if readset.library else "",
                rg_platform_unit=readset.run + "_" + readset.lane if readset.run and readset.lane else "",
                rg_platform=rg_platform,
                rg_center=rg_center
            )
            job.name = "star_align.1." + readset.name
            job.samples = [readset.sample]
            jobs.append(job)

        ######
        jobs.append(concat_jobs([
            # Pass 1 - concatenate junctions
            Job(samples=self.samples),
            star.concatenate_junction(
                input_junction_files_list=individual_junction_list,
                output_junction_file=project_junction_file
            ),
            # Pass 1 - genome indexing
            star.index(
                genome_index_folder=project_index_directory,
                junction_file=project_junction_file
        )], name="star_index.AllSamples", samples=self.samples))

        ######
        # Pass 2 - alignment
        for readset in self.readsets:
            trim_file_prefix = os.path.join("trim", readset.sample.name, readset.name + ".trim.")
            alignment_2ndPass_directory = os.path.join("alignment", readset.sample.name, readset.name)

            if readset.run_type == "PAIRED_END":
                candidate_input_files = [[trim_file_prefix + "pair1.fastq.gz", trim_file_prefix + "pair2.fastq.gz"]]
                if readset.fastq1 and readset.fastq2:
                    candidate_input_files.append([readset.fastq1, readset.fastq2])
                if readset.bam:
                    candidate_input_files.append([re.sub(r"\.bam$", ".pair1.fastq.gz", readset.bam), re.sub(r"\.bam$", ".pair2.fastq.gz", readset.bam)])
                [fastq1, fastq2] = self.select_input_files(candidate_input_files)
            elif readset.run_type == "SINGLE_END":
                candidate_input_files = [[trim_file_prefix + "single.fastq.gz"]]
                if readset.fastq1:
                    candidate_input_files.append([readset.fastq1])
                if readset.bam:
                    candidate_input_files.append([re.sub(r"\.bam$", ".single.fastq.gz", readset.bam)])
                [fastq1] = self.select_input_files(candidate_input_files)
                fastq2 = None
            else:
                _raise(SanitycheckError("Error: run type \"" + readset.run_type +
                "\" is invalid for readset \"" + readset.name + "\" (should be PAIRED_END or SINGLE_END)!"))

            job = star.align(
                reads1=fastq1,
                reads2=fastq2,
                output_directory=alignment_2ndPass_directory,
                genome_index_folder=project_index_directory,
                rg_id=readset.name,
                rg_sample=readset.sample.name,
                rg_library=readset.library if readset.library else "",
                rg_platform_unit=readset.run + "_" + readset.lane if readset.run and readset.lane else "",
                rg_platform=rg_platform,
                rg_center=rg_center,
                create_wiggle_track=True,
                search_chimeres=True,
                cuff_follow=True,
                sort_bam=True
            )
            job.samples = [readset.sample]
            # Make the second pass depend on the index built from the merged junctions
            job.input_files.append(os.path.join(project_index_directory, "SAindex"))

            # If this readset is unique for this sample, further BAM merging is not necessary.
            # Thus, create a sample BAM symlink to the readset BAM.
            # "ln -s -f" replaces an existing link, so re-running the step does not fail.
            if len(readset.sample.readsets) == 1:
                readset_bam = os.path.join(alignment_2ndPass_directory, "Aligned.sortedByCoord.out.bam")
                sample_bam = os.path.join("alignment", readset.sample.name, readset.sample.name + ".sorted.bam")
                job = concat_jobs([
                    job,
                    Job([readset_bam], [sample_bam], command="ln -s -f " + os.path.relpath(readset_bam, os.path.dirname(sample_bam)) + " " + sample_bam, removable_files=[sample_bam])])

            job.name = "star_align.2." + readset.name
            jobs.append(job)

        # Report section, rendered with pandoc once every second-pass BAM exists
        report_file = os.path.join("report", "RnaSeq.star.md")
        jobs.append(
            Job(
                [os.path.join("alignment", readset.sample.name, readset.name, "Aligned.sortedByCoord.out.bam") for readset in self.readsets],
                [report_file],
                [['star', 'module_pandoc']],
                command="""\
mkdir -p report && \\
pandoc --to=markdown \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable scientific_name="{scientific_name}" \\
  --variable assembly="{assembly}" \\
  {report_template_dir}/{basename_report_file} \\
  > {report_file}""".format(
                    scientific_name=global_conf.get('star', 'scientific_name'),
                    assembly=global_conf.get('star', 'assembly'),
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="star_report",
                samples=self.samples
            )
        )

        return jobs

    def picard_merge_sam_files(self):
        """
        Merge the readset BAM files of each sample into a single BAM file per sample.
        The merge is done using [Picard](http://broadinstitute.github.io/picard/).
        """

        merge_jobs = []
        for sample in self.samples:
            # Samples with a single readset are skipped: the align step already
            # created a symlink standing in for the merged BAM.
            if len(sample.readsets) <= 1:
                continue

            sample_directory = os.path.join("alignment", sample.name)
            readset_bams = []
            for readset in sample.readsets:
                readset_bams.append(os.path.join(sample_directory, readset.name, "Aligned.sortedByCoord.out.bam"))
            merged_bam = os.path.join(sample_directory, sample.name + ".sorted.bam")

            merge_job = picard.merge_sam_files(readset_bams, merged_bam)
            merge_job.name = "picard_merge_sam_files." + sample.name
            merge_job.samples = [sample]
            merge_jobs.append(merge_job)

        return merge_jobs

    def picard_sort_sam(self):
        """
        Reorder each sample alignment file by read name (QueryName) using
        [Picard](http://broadinstitute.github.io/picard/). The QueryName-sorted BAM files
        will be used to determine raw read counts.
        """

        sort_jobs = []
        for sample in self.samples:
            sample_prefix = os.path.join("alignment", sample.name, sample.name)
            coordinate_sorted_bam = sample_prefix + ".sorted.bam"
            queryname_sorted_bam = sample_prefix + ".QueryNameSorted.bam"

            sort_job = picard.sort_sam(coordinate_sorted_bam, queryname_sorted_bam, "queryname")
            sort_job.name = "picard_sort_sam." + sample.name
            sort_job.samples = [sample]
            sort_jobs.append(sort_job)

        return sort_jobs

    def picard_mark_duplicates(self):
        """
        Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
        (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
        will be marked as a duplicate in the BAM file. Marking duplicates is done using
        [Picard](http://broadinstitute.github.io/picard/).
        """

        mdup_jobs = []
        for sample in self.samples:
            sorted_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted")
            sorted_bam = sorted_prefix + ".bam"
            marked_bam = sorted_prefix + ".mdup.bam"
            metrics_file = sorted_prefix + ".mdup.metrics"

            # The input is passed as a one-element list of BAM files
            mdup_job = picard.mark_duplicates([sorted_bam], marked_bam, metrics_file)
            mdup_job.name = "picard_mark_duplicates." + sample.name
            mdup_job.samples = [sample]
            mdup_jobs.append(mdup_job)

        return mdup_jobs

    def bam_hard_clip(self):
        """
        Generate a hard-clipped version of each sample BAM for the tuxedo suite, which does not
        support soft clipping even though it is an official SAM feature.
        """

        clip_jobs = []
        for sample in self.samples:
            mdup_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup")
            alignment_input = mdup_prefix + ".bam"
            alignment_output = mdup_prefix + ".hardClip.bam"

            # Stage 1: stream the BAM as SAM text, header included
            sam_stream_job = samtools.view(
                alignment_input,
                None,
                "-h"
            )
            # Stage 2: awk to transform soft clip into hard clip for tuxedo suite
            hard_clip_job = Job(
                [None],
                [alignment_output],
                command="""\
awk 'BEGIN {{OFS="\\t"}} {{if (substr($1,1,1)=="@") {{print;next}}; split($6,C,/[0-9]*/); split($6,L,/[SMDIN]/); if (C[2]=="S") {{$10=substr($10,L[1]+1); $11=substr($11,L[1]+1)}}; if (C[length(C)]=="S") {{L1=length($10)-L[length(L)-1]; $10=substr($10,1,L1); $11=substr($11,1,L1); }}; gsub(/[0-9]*S/,"",$6); print}}' """.format()
            )
            # Stage 3: convert the rewritten SAM stream back to BAM
            bam_writer_job = samtools.view(
                "-",
                alignment_output,
                "-hbS"
            )

            clip_job = pipe_jobs([sam_stream_job, hard_clip_job, bam_writer_job])
            clip_job.name = "tuxedo_hard_clip." + sample.name
            clip_job.samples = [sample]
            clip_jobs.append(clip_job)

        return clip_jobs

    def rnaseqc(self):
        """
        Computes a series of quality control metrics using [RNA-SeQC](https://www.broadinstitute.org/cancer/cga/rna-seqc).

        Two jobs are created:

        1. `rnaseqc`: writes a tab-separated sample table (Sample, BamFile, Note) listing the
           duplicate-marked BAM of every sample, runs RNA-SeQC on it and zips the output directory.
        2. `rnaseqc_report`: builds the report alignment table from the RNA-SeQC metrics, merges
           it with the trimming metrics table when that file exists, and renders the report section with pandoc.
        """

        jobs = []
        # Sample table consumed by RNA-SeQC: one row per sample (name, BAM path, note)
        sample_file = os.path.join("alignment", "rnaseqc.samples.txt")
        sample_rows = [[sample.name, os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.bam"), "RNAseq"] for sample in self.samples]
        input_bams = [sample_row[1] for sample_row in sample_rows]
        output_directory = os.path.join("metrics", "rnaseqRep")
        # Use GTF with transcript_id only otherwise RNASeQC fails
        gtf_transcript_id = global_conf.get('rnaseqc', 'gtf_transcript_id', param_type='filepath')

        # The four sub-jobs run as one job named "rnaseqc": mkdir, write the sample table, run RNA-SeQC, zip the results
        jobs.append(concat_jobs([
            Job(command="mkdir -p " + output_directory, removable_files=[output_directory], samples=self.samples),
            Job(input_bams, [sample_file], command="""\
echo "Sample\tBamFile\tNote
{sample_rows}" \\
  > {sample_file}""".format(sample_rows="\n".join(["\t".join(sample_row) for sample_row in sample_rows]), sample_file=sample_file)),
            # The third positional argument is True when the pipeline run type is SINGLE_END
            metrics.rnaseqc(sample_file, output_directory, self.run_type == "SINGLE_END", gtf_file=gtf_transcript_id, reference=global_conf.get('rnaseqc', 'genome_fasta', param_type='filepath'), ribosomal_interval_file=global_conf.get('rnaseqc', 'ribosomal_fasta', param_type='filepath')),
            Job([], [output_directory + ".zip"], command="zip -r {output_directory}.zip {output_directory}".format(output_directory=output_directory))
        ], name="rnaseqc"))

        # Files used by the report job. The trim metrics table is optional: the shell command
        # tests for its existence at run time, so it is not declared as a job input.
        trim_metrics_file = os.path.join("metrics", "trimSampleTable.tsv")
        metrics_file = os.path.join("metrics", "rnaseqRep", "metrics.tsv")
        report_metrics_file = os.path.join("report", "trimAlignmentTable.tsv")
        report_file = os.path.join("report", "RnaSeq.rnaseqc.md")
        # NOTE(review): the embedded `python -c` script below uses print-statement syntax,
        # so the interpreter loaded through 'module_python' presumably has to be Python 2 - confirm.
        jobs.append(
            Job(
                [metrics_file],
                [report_file, report_metrics_file],
                [['rnaseqc', 'module_python'], ['rnaseqc', 'module_pandoc']],
                # Ugly awk to merge sample metrics with trim metrics if they exist; knitr may do this better
                command="""\
mkdir -p report && \\
cp {output_directory}.zip report/reportRNAseqQC.zip && \\
python -c 'import csv; csv_in = csv.DictReader(open("{metrics_file}"), delimiter="\t")
print "\t".join(["Sample", "Aligned Reads", "Alternative Alignments", "%", "rRNA Reads", "Coverage", "Exonic Rate", "Genes"])
print "\\n".join(["\t".join([
    line["Sample"],
    line["Mapped"],
    line["Alternative Aligments"],
    str(float(line["Alternative Aligments"]) / float(line["Mapped"]) * 100),
    line["rRNA"],
    line["Mean Per Base Cov."],
    line["Exonic Rate"],
    line["Genes Detected"]
]) for line in csv_in])' \\
  > {report_metrics_file}.tmp && \\
if [[ -f {trim_metrics_file} ]]
then
  awk -F"\t" 'FNR==NR{{raw_reads[$1]=$2; surviving_reads[$1]=$3; surviving_pct[$1]=$4; next}}{{OFS="\t"; if ($2=="Aligned Reads"){{surviving_pct[$1]="%"; aligned_pct="%"; rrna_pct="%"}} else {{aligned_pct=($2 / surviving_reads[$1] * 100); rrna_pct=($5 / surviving_reads[$1] * 100)}}; printf $1"\t"raw_reads[$1]"\t"surviving_reads[$1]"\t"surviving_pct[$1]"\t"$2"\t"aligned_pct"\t"$3"\t"$4"\t"$5"\t"rrna_pct; for (i = 6; i<= NF; i++) {{printf "\t"$i}}; print ""}}' \\
  {trim_metrics_file} \\
  {report_metrics_file}.tmp \\
  > {report_metrics_file}
else
  cp {report_metrics_file}.tmp {report_metrics_file}
fi && \\
rm {report_metrics_file}.tmp && \\
trim_alignment_table_md=`if [[ -f {trim_metrics_file} ]] ; then cut -f1-13 {report_metrics_file} | LC_NUMERIC=en_CA awk -F "\t" '{{OFS="|"; if (NR == 1) {{$1 = $1; print $0; print "-----|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:"}} else {{print $1, sprintf("%\\47d", $2), sprintf("%\\47d", $3), sprintf("%.1f", $4), sprintf("%\\47d", $5), sprintf("%.1f", $6), sprintf("%\\47d", $7), sprintf("%.1f", $8), sprintf("%\\47d", $9), sprintf("%.1f", $10), sprintf("%.2f", $11), sprintf("%.2f", $12), sprintf("%\\47d", $13)}}}}' ; else cat {report_metrics_file} | LC_NUMERIC=en_CA awk -F "\t" '{{OFS="|"; if (NR == 1) {{$1 = $1; print $0; print "-----|-----:|-----:|-----:|-----:|-----:|-----:|-----:"}} else {{print $1, sprintf("%\\47d", $2), sprintf("%\\47d", $3), sprintf("%.1f", $4), sprintf("%\\47d", $5), sprintf("%.2f", $6), sprintf("%.2f", $7), $8}}}}' ; fi`
pandoc \\
  {report_template_dir}/{basename_report_file} \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable trim_alignment_table="$trim_alignment_table_md" \\
  --to markdown \\
  > {report_file}""".format(
                    output_directory=output_directory,
                    report_template_dir=self.report_template_dir,
                    trim_metrics_file=trim_metrics_file,
                    metrics_file=metrics_file,
                    basename_report_file=os.path.basename(report_file),
                    report_metrics_file=report_metrics_file,
                    report_file=report_file
                ),
                report_files=[report_file],
                name="rnaseqc_report",
                samples=self.samples
            )
        )

        return jobs

    def picard_rna_metrics(self):
        """
        Computes a series of quality control metrics per sample with
        [Picard](http://broadinstitute.github.io/picard/), using both its multiple-metrics
        collection and its RNA-Seq metrics collection on the duplicate-marked BAM.
        """

        metrics_jobs = []
        reference_file = global_conf.get('picard_rna_metrics', 'genome_fasta', param_type='filepath')

        for sample in self.samples:
            alignment_file = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.bam")
            output_directory = os.path.join("metrics", sample.name)
            output_prefix = os.path.join(output_directory, sample.name)

            steps = [
                Job(command="mkdir -p " + output_directory, removable_files=[output_directory], samples=[sample]),
                # The library type is taken from the first readset of the sample
                picard.collect_multiple_metrics(alignment_file, output_prefix, reference_file, library_type=sample.readsets[0].run_type),
                picard.collect_rna_metrics(alignment_file, os.path.join(output_directory, sample.name + ".picard_rna_metrics")),
            ]
            metrics_jobs.append(concat_jobs(steps, name="picard_rna_metrics." + sample.name))

        return metrics_jobs

    def estimate_ribosomal_rna(self):
        """
        Use bwa mem to align reads on the rRNA reference fasta and count the number of reads mapped.
        The reads of each readset BAM are converted back to FASTQ, aligned to a reference fasta file of
        ribosomal sequences with [BWA](http://bio-bwa.sourceforge.net/) (algorithm: bwa mem), and the
        output is sorted by coordinate using [Picard](http://broadinstitute.github.io/picard/).
        The rRNA alignments are then counted against the configured GTF.

        This step takes as input files:

        readset Bam files
        """

        rrna_jobs = []
        for readset in self.readsets:
            sample_name = readset.sample.name
            readset_bam = os.path.join("alignment", sample_name, readset.name, "Aligned.sortedByCoord.out.bam")
            output_folder = os.path.join("metrics", sample_name, readset.name)
            readset_metrics_bam = os.path.join(output_folder, readset.name + "rRNA.bam")
            rrna_stats_file = os.path.join(output_folder, readset.name + "rRNA.stats.tsv")

            mkdir_job = Job(command="mkdir -p " + os.path.dirname(readset_bam) + " " + output_folder)
            bam_to_fastq_job = bvatools.bam2fq(
                readset_bam
            )

            # Assemble the quoted @RG header line; optional tags are only added when a value is available
            read_group_fields = ["'@RG", "\tID:" + readset.name, "\tSM:" + sample_name]
            if readset.library:
                read_group_fields.append("\tLB:" + readset.library)
            if readset.run and readset.lane:
                read_group_fields.append("\tPU:run" + readset.run + "_" + readset.lane)
            if global_conf.get('bwa_mem_rRNA', 'sequencing_center', required=False):
                read_group_fields.append("\tCN:" + global_conf.get('bwa_mem_rRNA', 'sequencing_center'))
            read_group_fields.extend(["\tPL:Illumina", "'"])

            bwa_job = bwa.mem(
                "/dev/stdin",
                None,
                read_group="".join(read_group_fields),
                ref=global_conf.get('bwa_mem_rRNA', 'ribosomal_fasta'),
                ini_section='bwa_mem_rRNA'
            )
            sort_job = picard.sort_sam(
                "/dev/stdin",
                readset_metrics_bam,
                "coordinate",
                ini_section='picard_sort_sam_rrna'
            )
            count_job = tools.py_rrnaBAMcount(
                bam=readset_metrics_bam,
                gtf=global_conf.get('bwa_mem_rRNA', 'gtf'),
                output=rrna_stats_file,
                typ="transcript"
            )

            # FASTQ extraction, alignment and sorting are piped; counting runs afterwards
            rrna_job = concat_jobs(
                [
                    mkdir_job,
                    pipe_jobs([bam_to_fastq_job, bwa_job, sort_job]),
                    count_job
                ],
                name="bwa_mem_rRNA." + readset.name
            )

            rrna_job.removable_files = [readset_metrics_bam]
            rrna_job.samples = [readset.sample]
            rrna_jobs.append(rrna_job)

        return rrna_jobs


    def wiggle(self):
        """
        Generate wiggle tracks suitable for multiple browsers.

        For every sample a bedGraph file is produced from the duplicate-marked BAM and converted
        to bigWig. When the configured `strand_info` is not `fr-unstranded` and the sample is
        paired-end, the BAM is first split into forward- and reverse-strand BAMs and one
        bedGraph/bigWig pair is produced per strand.
        """

        jobs = []

        # Library status per sample: a sample is considered SINGLE_END as soon as
        # one of its readsets is single-end, PAIRED_END otherwise.
        library = {}
        for readset in self.readsets:
            if readset.sample not in library:
                library[readset.sample] = "PAIRED_END"
            if readset.run_type == "SINGLE_END":
                library[readset.sample] = "SINGLE_END"

        for sample in self.samples:
            bam_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.")
            input_bam = bam_file_prefix + "bam"
            bed_graph_prefix = os.path.join("tracks", sample.name, sample.name)
            big_wig_prefix = os.path.join("tracks", "bigWig", sample.name)

            if (global_conf.get('DEFAULT', 'strand_info') != 'fr-unstranded') and library[sample] == "PAIRED_END":
                input_bam_f1 = bam_file_prefix + "tmp1.forward.bam"
                input_bam_f2 = bam_file_prefix + "tmp2.forward.bam"
                input_bam_r1 = bam_file_prefix + "tmp1.reverse.bam"
                input_bam_r2 = bam_file_prefix + "tmp2.reverse.bam"
                output_bam_f = bam_file_prefix + "forward.bam"
                output_bam_r = bam_file_prefix + "reverse.bam"

                bam_f_job = concat_jobs([
                    samtools.view(input_bam, input_bam_f1, "-bh -F 256 -f 81"),
                    samtools.view(input_bam, input_bam_f2, "-bh -F 256 -f 161"),
                    picard.merge_sam_files([input_bam_f1, input_bam_f2], output_bam_f),
                    Job(command="rm " + input_bam_f1 + " " + input_bam_f2)
                ], name="wiggle." + sample.name + ".forward_strandspec")
                bam_f_job.samples = [sample]
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_f_job.output_files.remove(input_bam_f1)
                bam_f_job.output_files.remove(input_bam_f2)

                bam_r_job = concat_jobs([
                    Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig")),
                    samtools.view(input_bam, input_bam_r1, "-bh -F 256 -f 97"),
                    samtools.view(input_bam, input_bam_r2, "-bh -F 256 -f 145"),
                    picard.merge_sam_files([input_bam_r1, input_bam_r2], output_bam_r),
                    Job(command="rm " + input_bam_r1 + " " + input_bam_r2)
                ], name="wiggle." + sample.name + ".reverse_strandspec")
                bam_r_job.samples = [sample]
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_r_job.output_files.remove(input_bam_r1)
                bam_r_job.output_files.remove(input_bam_r2)

                jobs.extend([bam_f_job, bam_r_job])

                # Each track carries its own source BAM so the choice never depends on
                # the words "forward"/"reverse" happening to appear in a sample name.
                tracks = [
                    (output_bam_f, bed_graph_prefix + ".forward.bedGraph", big_wig_prefix + ".forward.bw"),
                    (output_bam_r, bed_graph_prefix + ".reverse.bedGraph", big_wig_prefix + ".reverse.bw"),
                ]
            else:
                tracks = [(input_bam, bed_graph_prefix + ".bedGraph", big_wig_prefix + ".bw")]

            for in_bam, bed_graph_output, big_wig_output in tracks:
                # Job names are the output basenames without their extension. The patterns are
                # escaped and anchored so only the trailing extension is stripped.
                bed_graph_job_name = "bed_graph." + re.sub(r"\.bedGraph$", "", os.path.basename(bed_graph_output))
                big_wig_job_name = "wiggle." + re.sub(r"\.bw$", "", os.path.basename(big_wig_output))

                jobs.append(
                    concat_jobs([
                        Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " ", removable_files=["tracks"], samples=[sample]),
                        bedtools.graph(in_bam, bed_graph_output, library[sample])
                    ], name=bed_graph_job_name)
                )
                jobs.append(
                    concat_jobs([
                        Job(command="mkdir -p " + os.path.join("tracks", "bigWig"), samples=[sample]),
                        ucsc.bedGraphToBigWig(bed_graph_output, big_wig_output, False)
                    ], name=big_wig_job_name)
                )

        return jobs

    def raw_counts(self):
        """
        Count reads in features using [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html).
        One job is created per sample, reading the QueryName-sorted BAM of that sample.
        """

        count_jobs = []

        for sample in self.samples:
            input_bam = os.path.join("alignment", sample.name, sample.name) + ".QueryNameSorted.bam"
            output_count = os.path.join("raw_counts", sample.name + ".readcounts.csv")

            # Strandedness passed to htseq-count: "no" for unstranded libraries, "reverse" otherwise
            if global_conf.get('DEFAULT', 'strand_info') == "fr-unstranded":
                stranded = "no"
            else:
                stranded = "reverse"

            mkdir_job = Job(command="mkdir -p raw_counts")
            # samtools streams the reads selected by "-F 4" into htseq-count, which reads them from "-"
            view_job = samtools.view(
                input_bam,
                options="-F 4"
            )
            htseq_job = htseq.htseq_count(
                "-",
                global_conf.get('htseq_count', 'gtf', param_type='filepath'),
                output_count,
                global_conf.get('htseq_count', 'options'),
                stranded
            )

            count_job = concat_jobs(
                [mkdir_job, pipe_jobs([view_job, htseq_job])],
                name="htseq_count." + sample.name
            )
            count_job.samples = [sample]
            count_jobs.append(count_job)

        return count_jobs

    def raw_counts_metrics(self):
        """
        Create rawcount matrix, zip the wiggle tracks and create the saturation plots based on standardized read counts.
        """

        jobs = []

        # Create raw count matrix
        output_directory = "DGE"
        read_count_files = [os.path.join("raw_counts", sample.name + ".readcounts.csv") for sample in self.samples]
        output_matrix = os.path.join(output_directory, "rawCountMatrix.csv")

        job = Job(read_count_files, [output_matrix], [['raw_counts_metrics', 'module_mugqic_tools']], name="metrics.matrix")

        job.command = """\
mkdir -p {output_directory} && \\
gtf2tmpMatrix.awk \\
  {reference_gtf} \\
  {output_directory}/tmpMatrix.txt && \\
HEAD='Gene\\tSymbol' && \\
for read_count_file in \\
  {read_count_files}
do
  sort -k1,1 $read_count_file > {output_directory}/tmpSort.txt && \\
  join -1 1 -2 1 <(sort -k1,1 {output_directory}/tmpMatrix.txt) {output_directory}/tmpSort.txt > {output_directory}/tmpMatrix.2.txt && \\
  mv {output_directory}/tmpMatrix.2.txt {output_directory}/tmpMatrix.txt && \\
  na=$(basename $read_count_file | rev | cut -d. -f3- | rev) && \\
  HEAD="$HEAD\\t$na"
done && \\
echo -e $HEAD | cat - {output_directory}/tmpMatrix.txt | tr ' ' '\\t' > {output_matrix} && \\
rm {output_directory}/tmpSort.txt {output_directory}/tmpMatrix.txt""".format(
            reference_gtf=global_conf.get('raw_counts_metrics', 'gtf', param_type='filepath'),
            output_directory=output_directory,
            read_count_files=" \\\n  ".join(read_count_files),
            output_matrix=output_matrix
        )
        job.samples = self.samples
        jobs.append(job)

        # Create Wiggle tracks archive
        library = {}
        for readset in self.readsets:
            if not readset.sample in library:
                library[readset.sample]="PAIRED_END"
            if readset.run_type == "SINGLE_END" :
                library[readset.sample]="SINGLE_END"

        wiggle_directory = os.path.join("tracks", "bigWig")
        wiggle_archive = "tracks.zip"
        if global_conf.get('DEFAULT', 'strand_info') != 'fr-unstranded':
            wiggle_files = []
            for sample in self.samples:
                if library[sample] == "PAIRED_END":
                    wiggle_files.extend([os.path.join(wiggle_directory, sample.name) + ".forward.bw", os.path.join(wiggle_directory, sample.name) + ".reverse.bw"])
        else:
            wiggle_files = [os.path.join(wiggle_directory, sample.name + ".bw") for sample in self.samples]
        jobs.append(Job(wiggle_files, [wiggle_archive], name="metrics.wigzip", command="zip -r " + wiggle_archive + " " + wiggle_directory, samples=self.samples))

        # RPKM and Saturation
        count_file = os.path.join("DGE", "rawCountMatrix.csv")
        gene_size_file = global_conf.get('rpkm_saturation', 'gene_size', param_type='filepath')
        rpkm_directory = "raw_counts"
        saturation_directory = os.path.join("metrics", "saturation")

        job = concat_jobs([
            Job(command="mkdir -p " + saturation_directory),
            metrics.rpkm_saturation(count_file, gene_size_file, rpkm_directory, saturation_directory)
        ], name="rpkm_saturation")
        job.samples = self.samples
        jobs.append(job)

        report_file = os.path.join("report", "RnaSeq.raw_counts_metrics.md")
        jobs.append(
            Job(
                [wiggle_archive, saturation_directory + ".zip","metrics/rnaseqRep/corrMatrixSpearman.txt"],
                [report_file],
                [['raw_counts_metrics', 'module_pandoc']],
                command="""\
mkdir -p report && \\
cp metrics/rnaseqRep/corrMatrixSpearman.txt report/corrMatrixSpearman.tsv && \\
cp {wiggle_archive} report/ && \\
cp {saturation_archive} report/ && \\
pandoc --to=markdown \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable corr_matrix_spearman_table="`head -16 report/corrMatrixSpearman.tsv | cut -f-16| awk -F"\t" '{{OFS="\t"; if (NR==1) {{$0="Vs"$0; print; gsub(/[^\t]/, "-"); print}} else {{printf $1; for (i=2; i<=NF; i++) {{printf "\t"sprintf("%.2f", $i)}}; print ""}}}}' | sed 's/\t/|/g'`" \\
  {report_template_dir}/{basename_report_file} \\
  > {report_file}""".format(
                    wiggle_archive=wiggle_archive,
                    saturation_archive=saturation_directory + ".zip",
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="raw_count_metrics_report",
                samples = self.samples
            )
        )

        return jobs

    def stringtie(self):
        """
        Assemble transcriptome using [stringtie](https://ccb.jhu.edu/software/stringtie/index.shtml).
        Warning: Still in testing.

        One job named `stringtie.<sample>` is created per sample, reading the hard clipped BAM and
        writing into `stringtie/<sample>`.
        """
        reference_gtf = global_conf.get('stringtie', 'gtf', param_type='filepath')

        def assemble(sample):
            # Build the assembly job of a single sample
            hard_clip_bam = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
            sample_directory = os.path.join("stringtie", sample.name)

            sample_job = stringtie.stringtie(hard_clip_bam, sample_directory, reference_gtf)
            sample_job.name = "stringtie." + sample.name
            sample_job.samples = [sample]
            return sample_job

        return [assemble(sample) for sample in self.samples]

    def stringtie_merge(self):
        """
        Merge assemblies into a master teranscriptome reference using [stringtie](https://ccb.jhu.edu/software/stringtie/index.shtml).
        Warning: still in testing
        """

        output_directory = os.path.join("stringtie", "AllSamples")
        sample_file = os.path.join("stringtie", "stringtie-merge.samples.txt")
        input_gtfs = [os.path.join("stringtie", sample.name, "transcripts.gtf") for sample in self.samples]
        gtf = global_conf.get('stringtie', 'gtf', param_type='filepath')


        job = concat_jobs([
            Job(command="mkdir -p " + output_directory, samples=self.samples),
            Job(input_gtfs, [sample_file], command="""\
`cat > {sample_file} << END
{sample_rows}
END

`""".format(sample_rows="\n".join(input_gtfs), sample_file=sample_file)),
            stringtie.stringtie_merge(sample_file, output_directory, gtf)],
            name="stringtie-merge")

        return [job]

    def stringtie_abund(self):
        """
        Assemble transcriptome and compute RNA-seq expression using [stringtie](https://ccb.jhu.edu/software/stringtie/index.shtml).
        Warning: Still in testing.

        One job named `stringtie_abund.<sample>` is created per sample; stringtie is called with
        `abund=True` and `stringtie/AllSamples/merged.gtf` as annotation.
        """
        merged_gtf = os.path.join("stringtie", "AllSamples", "merged.gtf")

        def estimate_abundance(sample):
            # Build the abundance job of a single sample
            hard_clip_bam = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
            sample_directory = os.path.join("stringtie", sample.name)

            sample_job = stringtie.stringtie(hard_clip_bam, sample_directory, merged_gtf, abund=True)
            sample_job.name = "stringtie_abund." + sample.name
            sample_job.samples = [sample]
            return sample_job

        return [estimate_abundance(sample) for sample in self.samples]

    def ballgown(self):
        """
        [Ballgown](https://bioconductor.org/packages/release/bioc/html/ballgown.html) is used to calculate differential transcript and gene expression levels and test them for significant differences.

        A single job named `ballgown` is returned. It reads every `stringtie/<sample>/abundance.tab`
        together with the design file and writes into the `ballgown` directory.

        Warning: still in testing
        """

        # Perform ballgown on each design contrast
        # If --design <design_file> option is missing, self.contrasts call will raise an Exception
        if not self.contrasts:
            # `design_file` used to be left unbound in this case, which crashed further down with
            # an UnboundLocalError instead of a readable message.
            raise SanitycheckError("Error: no contrast found in the design file; the ballgown step needs at least one!")
        design_file = os.path.relpath(self.design_file.name, self.output_dir)
        output_directory = "ballgown"
        input_abund = [os.path.join("stringtie", sample.name, "abundance.tab") for sample in self.samples]

        ballgown_job = ballgown.ballgown(input_abund, design_file, output_directory)
        ballgown_job.name = "ballgown"
        ballgown_job.samples = self.samples

        return [ballgown_job]

    def cufflinks(self):
        """
        Compute RNA-Seq data expression using [cufflinks](http://cole-trapnell-lab.github.io/cufflinks/cufflinks/).
        Warning: It needs to use a hard clipped bam file while Tuxedo tools do not support official soft clip SAM format

        One job named `cufflinks.<sample>` is created per sample, writing into `cufflinks/<sample>`.
        """
        reference_gtf = global_conf.get('cufflinks', 'gtf', param_type='filepath')

        def estimate_fpkm(sample):
            # De Novo FPKM for a single sample
            hard_clip_bam = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
            sample_directory = os.path.join("cufflinks", sample.name)

            sample_job = cufflinks.cufflinks(hard_clip_bam, sample_directory, reference_gtf)
            sample_job.removable_files = ["cufflinks"]
            sample_job.name = "cufflinks." + sample.name
            sample_job.samples = [sample]
            return sample_job

        return [estimate_fpkm(sample) for sample in self.samples]

    def cuffmerge(self):
        """
        Merge assemblies into a master transcriptome reference using [cuffmerge](http://cole-trapnell-lab.github.io/cufflinks/cuffmerge/).
        """

        output_directory = os.path.join("cufflinks", "AllSamples")
        sample_file = os.path.join("cufflinks", "cuffmerge.samples.txt")
        input_gtfs = [os.path.join("cufflinks", sample.name, "transcripts.gtf") for sample in self.samples]
        gtf = global_conf.get('cuffmerge', 'gtf', param_type='filepath')


        job = concat_jobs([
            Job(command="mkdir -p " + output_directory, samples=self.samples),
            Job(input_gtfs, [sample_file], command="""\
`cat > {sample_file} << END
{sample_rows}
END

`""".format(sample_rows="\n".join(input_gtfs), sample_file=sample_file)),
            cufflinks.cuffmerge(sample_file, output_directory, gtf_file=gtf)],
            name="cuffmerge")

        return [job]

    def cuffquant(self):
        """
        Compute expression profiles (abundances.cxb) using [cuffquant](http://cole-trapnell-lab.github.io/cufflinks/cuffquant/).
        Warning: It needs to use a hard clipped bam file while Tuxedo tools do not support official soft clip SAM format

        One job named `cuffquant.<sample>` is created per sample, using
        `cufflinks/AllSamples/merged.gtf` as annotation and writing into `cufflinks/<sample>`.
        """
        merged_gtf = os.path.join("cufflinks", "AllSamples", "merged.gtf")

        def quantify(sample):
            # Quantification of a single sample
            hard_clip_bam = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
            sample_directory = os.path.join("cufflinks", sample.name)

            sample_job = cufflinks.cuffquant(hard_clip_bam, sample_directory, merged_gtf)
            sample_job.name = "cuffquant." + sample.name
            sample_job.samples = [sample]
            return sample_job

        return [quantify(sample) for sample in self.samples]

    def cuffdiff(self):
        """
        [Cuffdiff](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/) is used to calculate differential transcript expression levels and test them for significant differences.

        One job named `cuffdiff.<contrast>` is created per design contrast, writing into
        `cuffdiff/<contrast>`. The job is attached to the control samples followed by the
        treatment samples of its contrast.
        """

        jobs = []

        fpkm_directory = "cufflinks"
        gtf = os.path.join(fpkm_directory, "AllSamples","merged.gtf")

        # Perform cuffdiff on each design contrast
        for contrast in self.contrasts:
            groups = [contrast.controls, contrast.treatments]
            job = cufflinks.cuffdiff(
                # Cuffdiff input is a list of lists of replicate bams per control and per treatment
                [[os.path.join(fpkm_directory, sample.name, "abundances.cxb") for sample in group] for group in groups],
                gtf,
                os.path.join("cuffdiff", contrast.name)
            )
            # Flatten both groups: assigning inside a loop over the groups kept only the
            # treatments, because each pass overwrote the previous assignment.
            job.samples = [sample for group in groups for sample in group]
            job.removable_files = ["cuffdiff"]
            job.name = "cuffdiff." + contrast.name
            jobs.append(job)

        return jobs

    def cuffnorm(self):
        """
        Global normalization of RNA-Seq expression levels using [Cuffnorm](http://cole-trapnell-lab.github.io/cufflinks/cuffnorm/).

        A single job named `cuffnorm` is returned. It takes every `cufflinks/<sample>/abundances.cxb`
        and the merged GTF, labels the samples by name and writes into the `cuffnorm` directory.
        """
        fpkm_directory = "cufflinks"
        merged_gtf = os.path.join(fpkm_directory, "AllSamples", "merged.gtf")

        # Labels and abundance files are listed in the same sample order
        labels = ",".join(sample.name for sample in self.samples)
        abundance_files = [
            os.path.join(fpkm_directory, sample.name, "abundances.cxb")
            for sample in self.samples
        ]

        # Perform cuffnorm using every samples
        norm_job = cufflinks.cuffnorm(abundance_files, merged_gtf, "cuffnorm", labels)
        norm_job.removable_files = ["cuffnorm"]
        norm_job.name = "cuffnorm"
        norm_job.samples = self.samples

        return [norm_job]

    def fpkm_correlation_matrix(self):
        """
        Compute the pearson correlation matrix of gene and transcripts FPKM. FPKM data are those estimated by cuffnorm.

        Two jobs are returned: `fpkm_correlation_matrix_transcript` (which also creates the `metrics`
        directory) and `fpkm_correlation_matrix_gene`.
        """
        metrics_directory = "metrics"

        # Transcript level: cuffnorm isoform table -> correlation matrix
        transcript_job = concat_jobs([
            Job(command="mkdir -p " + metrics_directory),
            utils.fpkm_correlation_matrix(
                os.path.join("cuffnorm", "isoforms.fpkm_table"),
                os.path.join(metrics_directory, "transcripts_fpkm_correlation_matrix.tsv")
            )
        ])
        transcript_job.name = "fpkm_correlation_matrix_transcript"
        transcript_job.samples = self.samples

        # Gene level: cuffnorm gene table -> correlation matrix
        gene_job = utils.fpkm_correlation_matrix(
            os.path.join("cuffnorm", "genes.fpkm_table"),
            os.path.join(metrics_directory, "gene_fpkm_correlation_matrix.tsv")
        )
        gene_job.name = "fpkm_correlation_matrix_gene"
        gene_job.samples = self.samples

        return [transcript_job, gene_job]

    def gq_seq_utils_exploratory_analysis_rnaseq(self):
        """
        Exploratory analysis using the gqSeqUtils R package.
        """

        jobs = []

        # gqSeqUtils function call
        sample_fpkm_readcounts = [[
            sample.name,
            os.path.join("cufflinks", sample.name, "isoforms.fpkm_tracking"),
            os.path.join("raw_counts", sample.name + ".readcounts.csv")
        ] for sample in self.samples]
        jobs.append(concat_jobs([
            Job(command="mkdir -p exploratory", samples=self.samples),
            gq_seq_utils.exploratory_analysis_rnaseq(
                os.path.join("DGE", "rawCountMatrix.csv"),
                "cuffnorm",
                global_conf.get('gq_seq_utils_exploratory_analysis_rnaseq', 'genes', param_type='filepath'),
                "exploratory"
            )
        ], name="gq_seq_utils_exploratory_analysis_rnaseq"))

        # Render Rmarkdown Report
        jobs.append(
            rmarkdown.render(
                job_input            = os.path.join("exploratory", "index.tsv"),
                job_name             = "gq_seq_utils_exploratory_analysis_rnaseq_report",
                input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeq.gq_seq_utils_exploratory_analysis_rnaseq.Rmd"),
                samples              = self.samples,
                render_output_dir    = 'report',
                module_section       = 'report', # TODO: this or exploratory?
                prerun_r             = 'report_dir="report";' # TODO: really necessary or should be hard-coded in exploratory.Rmd?
            )
        )



        report_file = os.path.join("report", "RnaSeq.cuffnorm.md")
        jobs.append(
            Job(
                [os.path.join("cufflinks", "AllSamples","merged.gtf")],
                [report_file],
                command="""\
mkdir -p report && \\
zip -r report/cuffAnalysis.zip cufflinks/ cuffdiff/ cuffnorm/ && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="cuffnorm_report",
                samples=self.samples
            )
        )

        return jobs


    def differential_expression(self):
        """
        Performs differential gene expression analysis using [DESEQ](http://bioconductor.org/packages/release/bioc/html/DESeq.html) and [EDGER](http://www.bioconductor.org/packages/release/bioc/html/edgeR.html).
        Merge the results of the analysis in a single csv file.

        A single job named `differential_expression` is returned. It creates the `DGE` directory,
        then runs edgeR followed by DESeq2 on `DGE/rawCountMatrix.csv`. The declared outputs are
        `DGE/<contrast>/edger_results.csv` and `DGE/<contrast>/dge_results.csv` for every contrast.
        """

        # If --design <design_file> option is missing, self.contrasts call will raise an Exception
        contrasts = self.contrasts
        if not contrasts:
            # `design_file` used to be left unbound in this case, which crashed further down with
            # an UnboundLocalError instead of a readable message.
            raise SanitycheckError("Error: no contrast found in the design file; the differential_expression step needs at least one!")
        design_file = os.path.relpath(self.design_file.name, self.output_dir)
        output_directory = "DGE"
        count_matrix = os.path.join(output_directory, "rawCountMatrix.csv")

        edger_job = differential_expression.edger(design_file, count_matrix, output_directory)
        edger_job.output_files = [os.path.join(output_directory, contrast.name, "edger_results.csv") for contrast in contrasts]
        edger_job.samples = self.samples

        deseq_job = differential_expression.deseq2(design_file, count_matrix, output_directory)
        deseq_job.output_files = [os.path.join(output_directory, contrast.name, "dge_results.csv") for contrast in contrasts]
        deseq_job.samples = self.samples

        return [concat_jobs([
            Job(command="mkdir -p " + output_directory),
            edger_job,
            deseq_job
        ], name="differential_expression")]

    def differential_expression_goseq(self):
        """
        Gene Ontology analysis for RNA-Seq using the Bioconductor's R package [goseq](http://www.bioconductor.org/packages/release/bioc/html/goseq.html).
        Generates GO annotations for differential gene expression analysis.
        """

        jobs = []

        for contrast in self.contrasts:
            # goseq for differential gene expression results
            job = differential_expression.goseq(
                os.path.join("DGE", contrast.name, "dge_results.csv"),
                global_conf.get("differential_expression_goseq", "dge_input_columns"),
                os.path.join("DGE", contrast.name, "gene_ontology_results.csv")
            )
            job.name = "differential_expression_goseq.dge." + contrast.name
            for group in contrast.controls, contrast.treatments:
                job.samples = [sample for sample in group]
            jobs.append(job)


###################
        report_file = os.path.join("report", "RnaSeq.differential_expression.md")
        jobs.append(
            Job(
                [os.path.join("DGE", "rawCountMatrix.csv")] +
                [os.path.join("DGE", contrast.name, "dge_results.csv") for contrast in self.contrasts] +
                [os.path.join("cuffdiff", contrast.name, "isoforms.fpkm_tracking") for contrast in self.contrasts] +
                [os.path.join("cuffdiff", contrast.name, "isoform_exp.diff") for contrast in self.contrasts] +
                [os.path.join("DGE", contrast.name, "gene_ontology_results.csv") for contrast in self.contrasts],
                [report_file],
                [['rnaseqc', 'module_python'], ['rnaseqc', 'module_pandoc']],
                # Ugly awk to format differential expression results into markdown for genes, transcripts and GO if any; knitr may do this better
                # Ugly awk and python to merge cuffdiff fpkm and isoforms into transcript expression results
                command="""\
set -eu -o pipefail && \\
mkdir -p report && \\
cp {design_file} report/design.tsv && \\
cp DGE/rawCountMatrix.csv report/ && \\
pandoc \\
  {report_template_dir}/{basename_report_file} \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable design_table="`head -7 report/design.tsv | cut -f-8 | awk -F"\t" '{{OFS="\t"; if (NR==1) {{print; gsub(/[^\t]/, "-")}} print}}' | sed 's/\t/|/g'`" \\
  --variable raw_count_matrix_table="`head -7 report/rawCountMatrix.csv | cut -f-8 | awk -F"\t" '{{OFS="\t"; if (NR==1) {{print; gsub(/[^\t]/, "-")}} print}}' | sed 's/\t/|/g'`" \\
  --variable adj_pvalue_threshold={adj_pvalue_threshold} \\
  --to markdown \\
  > {report_file} && \\
for contrast in {contrasts}
do
  mkdir -p report/DiffExp/$contrast/
  echo -e "\\n#### $contrast Results\\n" >> {report_file}
  cp DGE/$contrast/dge_results.csv report/DiffExp/$contrast/${{contrast}}_Genes_DE_results.tsv
  echo -e "\\nTable: Differential Gene Expression Results (**partial table**; [download full table](DiffExp/$contrast/${{contrast}}_Genes_DE_results.tsv))\\n" >> {report_file}
  head -7 report/DiffExp/$contrast/${{contrast}}_Genes_DE_results.tsv | cut -f-8 | sed '2i ---\t---\t---\t---\t---\t---\t---\t---' | sed 's/\t/|/g' >> {report_file}
  sed '1s/^tracking_id/test_id/' cuffdiff/$contrast/isoforms.fpkm_tracking | awk -F"\t" 'FNR==NR{{line[$1]=$0; next}}{{OFS="\t"; print line[$1], $0}}' - cuffdiff/$contrast/isoform_exp.diff | python -c 'import csv,sys; rows_in = csv.DictReader(sys.stdin, delimiter="\t"); rows_out = csv.DictWriter(sys.stdout, fieldnames=["test_id", "gene_id", "tss_id","nearest_ref_id","class_code","gene","locus","length","log2(fold_change)","test_stat","p_value","q_value"], delimiter="\t", extrasaction="ignore"); rows_out.writeheader(); rows_out.writerows(rows_in)' > report/DiffExp/$contrast/${{contrast}}_Transcripts_DE_results.tsv
  echo -e "\\n---\\n\\nTable: Differential Transcript Expression Results (**partial table**; [download full table](DiffExp/$contrast/${{contrast}}_Transcripts_DE_results.tsv))\\n" >> {report_file}
  head -7 report/DiffExp/$contrast/${{contrast}}_Transcripts_DE_results.tsv | cut -f-8 | sed '2i ---\t---\t---\t---\t---\t---\t---\t---' | sed 's/\t/|/g' >> {report_file}
  if [ `wc -l DGE/$contrast/gene_ontology_results.csv | cut -f1 -d\ ` -gt 1 ]
  then
    cp DGE/$contrast/gene_ontology_results.csv report/DiffExp/$contrast/${{contrast}}_Genes_GO_results.tsv
    echo -e "\\n---\\n\\nTable: GO Results of the Differentially Expressed Genes (**partial table**; [download full table](DiffExp/${{contrast}}/${{contrast}}_Genes_GO_results.tsv))\\n" >> {report_file}
    head -7 report/DiffExp/${{contrast}}/${{contrast}}_Genes_GO_results.tsv | cut -f-8 | sed '2i ---\t---\t---\t---\t---\t---\t---\t---' | sed 's/\t/|/g' >> {report_file}
  else
    echo -e "\\nNo FDR adjusted GO enrichment was significant (p-value too high) based on the differentially expressed gene results for this design.\\n" >> {report_file}
  fi
done""".format(
                    design_file=os.path.abspath(self.design_file.name),
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    adj_pvalue_threshold=global_conf.get('differential_expression_goseq', 'other_options').split(" ")[1],
                    report_file=report_file,
                    contrasts=" ".join([contrast.name for contrast in self.contrasts])
                ),
                report_files=[report_file],
                name="differential_expression_goseq_report",
                samples=self.samples
            )
        )
        return jobs

    def ihec_metrics(self):
        """
        Generate IHEC's standard metrics.

        Returns a single job built for the assembly configured in the `ihec_metrics` section.
        """
        assembly = global_conf.get('ihec_metrics', 'assembly')
        ihec_job = metrics.ihec_metrics_rnaseq(assembly)

        return [ihec_job]


class RnaSeq(RnaSeqRaw):
    # The user-facing documentation is the one written on RnaSeqRaw
    __doc__ = RnaSeqRaw.__doc__

    def __init__(self, *args, protocol=None, **kwargs):
        """Store the selected protocol, then defer to the parent constructor."""
        # The protocol is stored before the parent constructor runs
        self._protocol = protocol
        # Add pipeline specific arguments
        super().__init__(*args, **kwargs)

    @classmethod
    def argparser(cls, argparser):
        """Extend the parent argument parser with the `-t/--type` protocol option."""
        super().argparser(argparser)
        cls._argparser.add_argument(
            "-t", "--type",
            help="RNAseq analysis type",
            dest='protocol',
            choices=["stringtie", "cufflinks"],
            default="stringtie"
        )
        return cls._argparser

    @property
    def step_list(self):
        """Steps of the protocol selected at construction time."""
        available = self.protocols()
        return available[self._protocol]

    def protocols(self):
        """Map each protocol name ("stringtie", "cufflinks") to its ordered list of steps."""
        # Both protocols start with the same steps, up to the raw count metrics
        shared_steps = [
            self.picard_sam_to_fastq,
            self.trimmomatic,
            self.merge_trimmomatic_stats,
            self.star,
            self.picard_merge_sam_files,
            self.picard_sort_sam,
            self.picard_mark_duplicates,
            self.picard_rna_metrics,
            self.estimate_ribosomal_rna,
            self.bam_hard_clip,
            self.rnaseqc,
            self.wiggle,
            self.raw_counts,
            self.raw_counts_metrics,
        ]

        stringtie_steps = shared_steps + [
            self.stringtie,
            self.stringtie_merge,
            self.stringtie_abund,
            self.ballgown,
            self.differential_expression,
            self.cram_output,
        ]

        cufflinks_steps = shared_steps + [
            self.cufflinks,
            self.cuffmerge,
            self.cuffquant,
            self.cuffdiff,
            self.cuffnorm,
            self.fpkm_correlation_matrix,
            self.gq_seq_utils_exploratory_analysis_rnaseq,
            self.differential_expression,
            self.differential_expression_goseq,
            self.ihec_metrics,
            self.cram_output,
        ]

        return {"stringtie": stringtie_steps, "cufflinks": cufflinks_steps}
