#!/usr/bin/env python

import fuma,sys

from fuma.CLI import CLI_ensmble_gtf_to_bed_converter

# Options object returned by fuma's CLI helper (presumably the parsed
# command-line arguments -- confirm in fuma.CLI). The parsing loop further
# down reads the GTF path from args.genecode_gtf_file[0].
args = CLI_ensmble_gtf_to_bed_converter()

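## GTF
##  - start: 1-based, inclusive
##  - end:   1-based, inclusive
##
## BED
##  - start: 0-based, inclusive
##  - end:   0-based, exclusive (numerically equal to the 1-based inclusive
##           end, so only the start has to be shifted by one on export)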

#print args
#print args.genecode_gtf_file

def _gtf_attribute(attributes, key):
	"""Return the value stored under `key` in a GTF attribute column.
	
	The value is the text between the first occurrence of `key` and the next
	';', with surrounding spaces and double quotes removed.
	
	Raises IndexError when `key` does not occur in `attributes`.
	"""
	return attributes.split(key, 1)[1].split(";", 1)[0].strip(" ").strip('"')


# transcript_id -> {chromosome: [min_pos, max_pos, inversed, strand, gene_id, chromosome]}
idx = {}

# Parse the GTF file
with open(args.genecode_gtf_file[0], "r") as fh:
	for line in fh:
		line = line.strip()
		
		# Skip blank lines and '#' comment lines
		if not line or line[0] == "#":
			continue
		
		params = line.split("\t")
		
		# NOTE(review): a record whose attribute column lacks 'gene_id' or
		# 'transcript_id' raises IndexError here; nothing filters such records.
		gene_id = _gtf_attribute(params[8], "gene_id")
		transcript_id = _gtf_attribute(params[8], "transcript_id")
		
		start = int(params[3])
		end = int(params[4])
		inversed = (end < start)
		
		min_pos = min(start, end)
		max_pos = max(start, end)
		
		chrom = params[0]
		strand = params[6]
		
		# The first record of a transcript on a chromosome defines its initial
		# extent, orientation and strand.
		per_chrom = idx.setdefault(transcript_id, {})
		if chrom not in per_chrom:
			per_chrom[chrom] = [min_pos, max_pos, inversed, strand, gene_id, chrom]
		
		entry = per_chrom[chrom]
		extends_left = min_pos < entry[0]
		extends_right = max_pos > entry[1]
		
		# Orientation and strand are only verified for records that widen the
		# stored extent; records lying inside it are accepted without a check.
		if extends_left or extends_right:
			if inversed != entry[2] or strand != entry[3]:
				raise Exception("Error: transcript annotated in different directions:\n"+line)
			
			if extends_left:
				entry[0] = min_pos
			if extends_right:
				entry[1] = max_pos

# Export to BED: one "chrom<TAB>start<TAB>end<TAB>gene_id" line per transcript
# extent. Identical lines collapse because they are collected in a set.
lines = set()
for t in sorted(idx):# Sorted keeps the order in which errors are reported deterministic
	chromosomes = idx[t]
	if len(chromosomes) != 1:
		raise Exception("Error: " + t + " has either no annotated chromosomes or multiple")
	
	# Exactly one chromosome entry is present at this point; list() makes the
	# lookup work for both Python 2 lists and Python 3 dict views.
	data = list(chromosomes.values())[0]
	
	lines.add("\t".join([
		data[5],# chr
		str(data[0]-1),# start: shifted from 1-based to 0-based
		str(data[1]),# end: value is written unchanged
		data[4],# gene_id, written as found in the GTF (any suffix is kept)
	]))

# Lines are unique by using sets; sorting gives reproducible output
for line in sorted(lines):
	# Parenthesised single argument prints identically on Python 2 and 3
	print(line)
