#!/usr/bin/python

__author__="BU"


"""This script converts gff to gtf"""

# Usage python gff3Togtf.py -i annotation.gff3 -o output.gtf 

from os import sys
import getopt
import heapq
from optparse import OptionParser

# ----------------
## OPTION PARSING
# ----------------

parser = OptionParser()
parser.add_option("-i",  help="Annotation in gff")
parser.add_option("-o",  help='Output gtf file')

options, args = parser.parse_args()


# Store input and output file

infile=''
outfile=''

# Read command line args
myopts, args = getopt.getopt(sys.argv[1:],"i:o:")

# Loop over arguments and assign input to infile and output to outfile 
# o == option
# a == argument passed to the o

#if myopts == []:
#    print("Usage: %s -i input -o output" % sys.argv[0])

for o, a in myopts:
    if o == '-i':
        infile=a
    if o == '-o':
        outfile=a

print ("Input file: %s \nOutput file: %s" % (infile,outfile))

#-------------------------------------------------------------------------------------------------
# Defining Functions
#-------------------------------------------------------------------------------------------------

def gffTogtf():
   
    gtf={}
    tx2gene={}
    reg2chr={}

    gff=open(infile, "r")
    for line in gff:
        if not line.startswith('#'):
            split=line.split("\t")
            #print split
            if split[2]=="region":
                reg=split[0]
                #print reg
                spliter=split[8].split(";")
                #print spliter
                for item in spliter:           
                    if item.startswith('chromosome'): 
                        chrT=item.split('=')[1].split('\n')[0]
                        chrN='chr'+chrT
                        #print chrN
                          
                        reg2chr[reg]=chrN
             
            if split[2]=="gene":
                #print 'Gene', split
                regs=split[0]
                if regs in reg2chr:
                    chrN=reg2chr[regs]   
                add=split[1]
                typ=split[2]
                start=split[3]
                stop=split[4]
                dot1=split[5]
                strand=split[6]
                dot2=split[7]
                 
                spliter=split[8].split(";")
                #print spliter
                for item in spliter:           
                    if item.startswith('ID'): 
                        ids=item.split('=')[1].split('\n')[0]
                        
                    if item.startswith('Dbxref'): 
                        core=item.split('=')[1].split(',')[0].split(':')[1]
                        #print core
                        gene_id='GeneID:'+core+"_"+ids
                        #print gene_id
  
                    if item.startswith('gene_biotype'): 
                        gene_biotype=item.split('=')[1].split('\n')[0]
                
                        record=(chrN+"\t"+add+"\t"+typ+"\t"+start+"\t"+stop+"\t"+dot1+"\t"+strand+"\t"+dot2+"\t"+"gene_id"+" "+'"'+gene_id+'"'"; "+"transcript_id"+" "+'"'+gene_id+'"'"; "+"gene_biotype"+" "+'"'+gene_biotype+'"'";")
           
                        if gene_id not in gtf:
                            gtf[gene_id]=[record]
                        else:
                            gtf[gene_id].append(record)  


            if split[2]=="transcript":
                #print 'TX', split
                regs=split[0]
                if regs in reg2chr:
                    chrN=reg2chr[regs]
                add=split[1]
                typ=split[2]
                start=split[3]
                stop=split[4]
                dot1=split[5]
                strand=split[6]
                dot2=split[7]
                transcript_biotype='NA'
                
                spliter=split[8].split(";")
                for item in spliter:
                    if item.startswith('ID'): 
                        txids=item.split('=')[1].split('\n')[0]
                        #print transcript_id
                    if item.startswith('Parent'): 
                        ids=item.split('=')[1].split(',')[0]
                        #print gene_id
                    if item.startswith('Dbxref'): 
                        core=item.split('=')[1].split(',')[0].split(':')[1]
                        #print core
                        gene_id='GeneID:'+core+"_"+ids
                        #print gene_id
                    if item.startswith('transcript_id'): 
                        txcore=item.split('=')[1].split('\n')[0]
                        #print core
                        transcript_id=txcore+"_"+txids
                        #print transcript_id

               
                        record=(chrN+"\t"+add+"\t"+typ+"\t"+start+"\t"+stop+"\t"+dot1+"\t"+strand+"\t"+dot2+"\t"+"gene_id"+" "+'"'+gene_id+'"'"; "+"transcript_id"+" "+'"'+transcript_id+'"'"; "+"transcript_biotype"+" "+'"'+transcript_biotype+'"'";")
           
                        if gene_id not in gtf:
                            gtf[gene_id]=[record]
                        else:
                            gtf[gene_id].append(record)  

                        tx2gene[transcript_id]=gene_id

            if split[2]=="lnc_RNA":
                #print 'TX', split
                regs=split[0]
                if regs in reg2chr:
                    chrN=reg2chr[regs]
                add=split[1]
                typ=split[2]
                start=split[3]
                stop=split[4]
                dot1=split[5]
                strand=split[6]
                dot2=split[7]
                transcript_biotype='lncRNA'
                
                spliter=split[8].split(";")
                for item in spliter:
                    if item.startswith('ID'): 
                        txids=item.split('=')[1].split('\n')[0]
                        #print transcript_id
                    if item.startswith('Parent'): 
                        ids=item.split('=')[1].split(',')[0]
                        #print gene_id
                    if item.startswith('Dbxref'): 
                        core=item.split('=')[1].split(',')[0].split(':')[1]
                        #print core
                        gene_id='GeneID:'+core+"_"+ids
                        #print gene_id
                    if item.startswith('transcript_id'): 
                        txcore=item.split('=')[1].split('\n')[0]
                        #print core
                        transcript_id=txcore+"_"+txids
                        #print transcript_id
               
                        record=(chrN+"\t"+add+"\t"+typ+"\t"+start+"\t"+stop+"\t"+dot1+"\t"+strand+"\t"+dot2+"\t"+"gene_id"+" "+'"'+gene_id+'"'"; "+"transcript_id"+" "+'"'+transcript_id+'"'"; "+"transcript_biotype"+" "+'"'+transcript_biotype+'"'";")
           
                        if gene_id not in gtf:
                            gtf[gene_id]=[record]
                        else:
                            gtf[gene_id].append(record)

                        tx2gene[transcript_id]=gene_id     

            if split[2]=="exon":
                regs=split[0]
                if regs in reg2chr:
                    chrN=reg2chr[regs]   
                add=split[1]
                typ=split[2]
                start=split[3]
                stop=split[4]
                dot1=split[5]
                strand=split[6]
                dot2=split[7] 

                spliter=split[8].split(";")
                for item in spliter:
                    if item.startswith('ID'): 
                        exon_id=item.split('=')[1]
                        #print exon_id
                    if item.startswith('Parent'): 
                        txids=item.split('=')[1].split('\n')[0]
                    if item.startswith('transcript_id'): 
                        txcore=item.split('=')[1].split('\n')[0]
                        #print core
                        transcript_id=txcore+"_"+txids
                        if transcript_id in tx2gene:
                            gene_id=tx2gene[transcript_id]
        
               
                            record=(chrN+"\t"+add+"\t"+typ+"\t"+start+"\t"+stop+"\t"+dot1+"\t"+strand+"\t"+dot2+"\t"+"gene_id"+" "+'"'+gene_id+'"'"; "+"transcript_id"+" "+'"'+transcript_id+'"'"; "+"exon_id"+" "+'"'+exon_id+'"'";")
           
                            if gene_id not in gtf:
                                gtf[gene_id]=[record]
                            else:
                                gtf[gene_id].append(record)  
    
    f=open(outfile, "w")

    for key in gtf:
        for v in gtf[key]:
            f.write('%s\n' % v)
    f.close() 

gffTogtf()