#!/usr/bin/python

__author__="BU"

from optparse import OptionParser
import gzip
from os import sys
import getopt

# ===============
## OPTION PARSING
# ===============

parser = OptionParser()
parser.add_option("-i", 
	help="GTF/GTF. No stdin")
parser.add_option('-o', help='Output GFF file with trans coord')

options, args = parser.parse_args()


# Store input and output file

infile=''
outfile=''

# Read command line args
myopts, args = getopt.getopt(sys.argv[1:],"i:o:")

# Loop over arguments and assign input to infile and output to outfile 
# o == option
# a == argument passed to the o

if myopts == []:
    print("Usage: %s -i input -o output -g outgtf" % sys.argv[0])

for o, a in myopts:
    if o == '-i':
        infile=a
    if o == '-o':
        outfile=a
        
print ("Input file : %s \nOutput file: %s" % (infile,outfile))

def extract_trans_coord():

    tx_start={}
    tx_stop={}
    tx_line={}
    tx_gff={}

    gtf_file=open(infile, 'r')
    for line in gtf_file:
        line_split=line.split("\n")[0].split("\t")
        chrom=line_split[0]
        item=line_split[1]
        typ=line_split[2]

        if typ == "exon":

            start=int(line_split[3])
            stop=int(line_split[4])
            strand=line_split[6]
            txID=line_split[8].split(';')[1].split('transcript_id')[1]
            geneID=line_split[8].split(';')[0].split('gene_id')[1]
            #print 'TX', txID
            #print 'GENE', geneID
            record=chrom+"@"+item+"@"+strand+"@"+geneID
            #print record

            # hash with start 

            if txID not in tx_start:
                tx_start[txID]=[start]
            else:
                tx_start[txID].append(start)
           
            # hash with stop  
          
            if txID not in tx_stop:
                tx_stop[txID]=[stop]
            else:
                tx_stop[txID].append(stop)
           
            # txID and the line

            tx_line[txID]=record
            #print 'TX_LINE',txID, tx_line[txID]

    for key in tx_start:
        #print 'TX_START',tx_start[key]

        tx_start[key].sort()
        tx_stop[key].sort()  

        new_start=tx_start[key][0]    
        #print 'NEW_START', new_start
        #print 'STOP', stop
        #print 'tx_stop[key]', tx_stop[key]
        new_stop=tx_stop[key][len(tx_stop[key])-1]
        #print 'NEW_STOP', new_stop
        tx_items=tx_line[key].split("@")
        tx_chrom=tx_items[0]
        tx_add=tx_items[1]
        tx_str=tx_items[2]
        tx_geneID=tx_items[3]
        merged=tx_chrom+"\t"+tx_add+"\t"+"exon"+"\t"+str(new_start)+"\t"+str(new_stop)+"\t"+"."+"\t"+tx_str+"\t"+"."+"\t"+"gene_id"+tx_geneID+";"+" "+"transcript_id"+key+";" 
              
        tx_gff[key]=merged

    f=open(outfile, 'w')
    for kyz in tx_gff:
        f.write('%s\n' % tx_gff[kyz])
    f.close()
                    
extract_trans_coord()