#!/usr/bin/env bash
# Stop at the first failing command, unset variable, or failing pipeline stage
# so that a broken step (e.g. bedtools missing) cannot silently produce an
# empty result file with a zero exit status.
set -euo pipefail

# Exactly three positional arguments are required:
#   $1  in.exon.gtf   GTF passed to `bedtools intersect -b`
#   $2  in.gtf        GTF whose "gene" records are converted to BED
#   $3  out.name_id   output table
if [[ $# -ne 3 ]]; then
    # A wrong invocation is an error: report it on stderr and exit non-zero.
    echo "Usage:" >&2
    echo "$0 in.exon.gtf in.gtf out.name_id" >&2
    exit 1
fi

# Positional arguments.
exon_gtf=$1
in_gtf=$2
out_name_id=$3

# Intermediate files are created next to the first input file:
#   gene_bed  BED built from the "gene" records of in_gtf
#   tmp       raw output of bedtools intersect
gene_bed=${exon_gtf}.tmp.bed
tmp=${exon_gtf}.tmp

# Remove the intermediates on every exit path (normal end, error, Ctrl-C),
# not only when the script reaches its last line.
trap 'rm -f -- "$gene_bed" "$tmp"' EXIT

#

# 0. gene_gtf => gene.bed
#######################################
# Convert the "gene" records of a GTF file into six-column BED lines.
# Each output line is:
#   chrom, start-1, end, "gene_id,gene_name,strand", ".", strand
# A missing gene_id falls back to gene_name and vice versa; when both are
# missing, both are written as "NA".
# Arguments: $1 - path to the GTF file
# Outputs:   BED lines on stdout
#######################################
gtf_genes_to_bed() {
    awk -v OFS="\t" '
        # Value of attribute `key` (written as: key "value";) on the current
        # record, or "NA" when absent. Uses only two-argument match() so the
        # program runs under any POSIX awk, not just gawk.
        function attr(key,    pat) {
            pat = key " \"[^\"]+\";"
            if (match($0, pat)) {
                # Skip over: key, one space, opening quote; drop closing quote and semicolon.
                return substr($0, RSTART + length(key) + 2, RLENGTH - length(key) - 4)
            }
            return "NA"
        }

        ($3 == "gene") {
            gene_id = attr("gene_id")
            gene_name = attr("gene_name")

            # Fill whichever of the two is missing from the other one.
            if (gene_id == "NA" && gene_name != "NA") {
                gene_id = gene_name
            } else if (gene_name == "NA" && gene_id != "NA") {
                gene_name = gene_id
            }

            # GTF start is 1-based, BED start is 0-based: hence $4-1.
            print $1, $4-1, $5, gene_id "," gene_name "," $7, ".", $7
        }
    ' "$1"
}

gtf_genes_to_bed "$in_gtf" > "$gene_bed"

# 1. intersect gene_bed with exon_gtf
# Each output line is the gene BED columns followed by the matching
# exon_gtf record (-wb); only same-strand overlaps are reported (-s).
# Paths are quoted so file names containing spaces or glob characters work.
if ! bedtools intersect -s -a "$gene_bed" -b "$exon_gtf" -split -wb > "$tmp"; then
    # Do not carry on with a missing or partial intersect result.
    echo "bedtools intersect failed" >&2
    rm -f -- "$gene_bed" "$tmp"
    exit 1
fi

# 2. extract gene id and gene name
#######################################
# Turn intersect records into tab-separated rows:
#   read_id, gene_id, gene_name, gene_strand
# read_id is the value of the first gene_id "..." attribute on the line;
# the three gene columns come from splitting column 4 on ",".
# Lines without a gene_id attribute are dropped.
# NOTE(review): a gene id or name that itself contains "," would shift the
# columns taken from column 4.
# Inputs:  intersect records on stdin
# Outputs: rows on stdout, in input order (not de-duplicated)
#######################################
extract_read_gene_pairs() {
    awk -v OFS="\t" '
        # Two-argument match() only, so any POSIX awk can run this.
        match($0, /gene_id "[^"]+";/) {
            # 9 = length of the leading: gene_id "   11 = that plus the closing: ";
            read_id = substr($0, RSTART + 9, RLENGTH - 11)
            split($4, gene, ",")
            print read_id, gene[1], gene[2], gene[3]
        }
    '
}

extract_read_gene_pairs < "$tmp" | sort -u > "$out_name_id"

# Intermediates are no longer needed; -f keeps cleanup quiet if one is absent.
rm -f -- "$gene_bed" "$tmp"
