#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Calculate count, FPKM, and FPKM-UQ values.
FPKM and FPKM-UQ were defined here:
https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#upper-quartile-fpkm

Note:
This program generates *exactly* the same FPKM and FPKM-UQ values as TCGA, provided that:
1) you use the TCGA BAM file, or follow the TCGA RNA-seq workflow to do the alignment (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow).
2) you use the GENCODE v22 GTF file (https://api.gdc.cancer.gov/data/25aa497c-e615-4cb7-8751-71f744f9691f).
3) you use the GENCODE v22 information file (https://api.gdc.cancer.gov/data/b011ee3e-14d8-4a97-aed4-e0b10f6bbe82).
4) HTSeq (https://pypi.org/project/HTSeq/) is installed and the 'htseq-count' command is callable.

'''

import sys
import os
import shutil
import subprocess
import numpy as np
from optparse import OptionParser
from time import strftime

# Module metadata. __version__ is also passed to OptionParser in main(), which
# uses it for the version string of the command line interface.
__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__version__="4.0.0"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Production"

def printlog(mesg):
	'''
	Write a timestamped progress message to stderr.

	parameters
	----------
	mesg : str
		Text of the message. It is emitted as "@ <YYYY-MM-DD HH:MM:SS>: <mesg>",
		using the local time at the moment of the call.
	'''
	timestamp = strftime("%Y-%m-%d %H:%M:%S")
	line = "".join(["@ ", timestamp, ": ", mesg])
	print(line, file=sys.stderr)

def run_HTseq(bam_file, gtf_file, out_file, print_cmd = False):
	'''
	Count reads per gene with "htseq-count", using the options of the TCGA workflow.

	parameters
	----------
	bam_file : str
		BAM format file.
	gtf_file : str
		GTF format file.
	out_file : str
		output file name. The standard output of "htseq-count" is written to it.
	print_cmd : bool
		if set to True, return the "htseq-count" command line without running it.
		If set to False, run the "htseq-count" command line.

	returns
	-------
	str or None
		The (shell-quoted) command line when print_cmd is True, otherwise None.

	exits
	-----
	Terminates the program with a non-zero exit status if an input file does not
	exist, if "htseq-count" is not found on PATH, or if "htseq-count" itself
	returns a non-zero exit status.
	'''
	# local import: shlex is only needed here to render a copy-paste safe command line
	import shlex

	# processing and checking (BAM first, then GTF)
	for path in (bam_file, gtf_file):
		if not os.path.exists(path):
			print ("%s does not exist!" % path, file=sys.stderr)
			sys.exit(1)

	# find htseq-count command
	htseq_cmd = shutil.which("htseq-count")
	if htseq_cmd is None:
		print ("Cannot find \"htseq-count\" command!", file=sys.stderr)
		sys.exit(1)

	# Set parameters used by TCGA workflow
	# https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/
	paras = {
		'-f' : 'bam',
		'-r' : 'pos',
		'-s' : 'no',
		'-a' : 10,
		'-t' : 'exon',
		'-i' : 'gene_id',
		'-m' : 'intersection-nonempty',
		}

	# Build the command as an argument list so that paths containing spaces or
	# shell metacharacters are passed to htseq-count unchanged.
	args = [htseq_cmd]
	for flag, value in paras.items():
		args.extend([flag, str(value)])
	args.extend([os.path.abspath(bam_file), os.path.abspath(gtf_file)])

	# Human-readable equivalent of what is executed below (used for logging and print_cmd).
	cmd = ' '.join(shlex.quote(a) for a in args) + ' > ' + shlex.quote(out_file)

	if print_cmd:
		return cmd

	# run HTSeq, redirecting its standard output into out_file
	printlog ("Running : %s" % cmd)
	with open(out_file, 'w') as out:
		status = subprocess.call(args, stdout=out)
	if status != 0:
		# do not let the caller continue with a truncated/empty count file
		print ("\"htseq-count\" failed with exit status %d!" % status, file=sys.stderr)
		sys.exit(1)

def _iter_htseq_counts(count_file):
	'''Yield (gene_id, read_count) for each gene row of an htseq-count output file.'''
	with open(count_file) as handle:
		for line in handle:
			line = line.strip()
			# skip blank lines and htseq-count's special counter rows (prefixed with "__")
			if not line or line.startswith('__'):
				continue
			fields = line.split()
			yield fields[0], int(fields[1])

def _normalize_count(count, gene_size, denominator, log2_flag):
	'''Return count*10^9/(gene_size*denominator), optionally as log2(x+1); 'NA' if undefined.'''
	# gene_size is None for genes absent from the information file; a zero size or
	# zero denominator would otherwise raise ZeroDivisionError or yield inf/nan.
	if not gene_size or not denominator:
		return 'NA'
	value = (count*1000000000)/(gene_size*denominator)
	if log2_flag:
		value = np.log2(value + 1)
	return value

def cal_fpkm(count_file, infor_file, out_file,log2_flag=False):
	'''
	Calculate FPKM and FPKM-UQ for every gene in an htseq-count file and write
	them, together with the raw count and gene annotation, to a tab-separated file.

	FPKM    = count * 10^9 / (exon_length * total count of protein-coding genes)
	FPKM-UQ = count * 10^9 / (exon_length * 75th percentile count of protein-coding genes)

	parameters
	----------
	count_file : str
		count file generated by HT-Seq. The first column is gene ID, the second
		column is read count. Rows whose first column starts with "__" are ignored.
	infor_file : str
		Information file (whitespace separated, header line starts with "gene_id"):
			gene_id gene_name seqname start end strand gene_type gene_status havana_gene full_length exon_length exon_num
		Column 1 is the gene ID, columns 2-6 are copied to the output, column 7
		(gene_type) selects the protein-coding genes and column 11 (exon_length)
		is used as the gene size.
	out_file : str
		output file name. Columns: gene_ID, symbol, chrom, start, end, strand,
		raw_count, FPKM, FPKM-UQ.
	log2_flag : bool
		if set to True, report log2(FPKM + 1) and log2(FPKM-UQ + 1).

	A value is written as 'NA' when it cannot be computed: the gene is missing from
	the information file, its exon length is 0, or the normalisation factor (total
	count or 75th percentile count) is 0. Genes missing from the information file
	also get 'NA' in the five annotation columns.
	'''

	printlog ('Read gene information file: %s' % infor_file)
	gene_sizes = {} # mRNA size for all genes
	gene_infor = {}
	protein_coding = set()	#set of protein coding genes
	with open(infor_file) as handle:
		for l in handle:
			l = l.strip()
			if not l or l.startswith('gene_id'):
				continue
			f = l.split()
			gene_sizes[f[0]] = int(f[10])
			gene_infor[f[0]] = '\t'.join(f[1:6])
			if f[6] == 'protein_coding':
				protein_coding.add(f[0])

	print ('\tTotal genes: %d' % len(gene_sizes), file=sys.stderr)
	print ('\tTotal protein-coding genes: %d' % len(protein_coding), file=sys.stderr)


	printlog('Read gene count file to calculate 75 percentile count and total count: %s' % count_file)
	gene_counts = [count for gene, count in _iter_htseq_counts(count_file) if gene in protein_coding]

	# np.percentile is undefined for an empty input; 0 makes FPKM-UQ come out as 'NA'
	if gene_counts:
		uq_count = np.percentile(gene_counts, 75)
	else:
		uq_count = 0.0
	total_count = sum(gene_counts)
	print ('\tTotal protein-coding genes: %d' % len(gene_counts), file=sys.stderr)
	print ('\tThe 75 perentile count of protein-coding genes: %f' % (uq_count), file=sys.stderr)
	print ('\tThe total count of protein-coding genes: %f' % (total_count), file=sys.stderr)

	if log2_flag:
		header = ['gene_ID','symbol','chrom','start','end','strand', 'raw_count', 'FPKM(log2(x+1))', 'FPKM-UQ(log2(x+1))']
	else:
		header = ['gene_ID', 'symbol','chrom','start','end','strand', 'raw_count', 'FPKM', 'FPKM-UQ']
	# placeholder for the five annotation columns of genes missing from the information file
	missing_infor = '\t'.join(['NA'] * 5)

	printlog ('Read gene count file to calculate FPKM and FPKM-UQ: %s' % count_file)
	with open(out_file, 'w') as FPKM_OUT:
		print ('\t'.join(header), file=FPKM_OUT)
		for gene, count in _iter_htseq_counts(count_file):
			size = gene_sizes.get(gene)
			fpkm_uq = _normalize_count(count, size, uq_count, log2_flag)
			fpkm = _normalize_count(count, size, total_count, log2_flag)
			print (gene + '\t' + gene_infor.get(gene, missing_infor) + '\t' + '\t'.join([str(i) for i in (count, fpkm, fpkm_uq)]), file=FPKM_OUT)

def main():
	'''
	Command line entry point.

	Parses the options, runs "htseq-count" on the BAM file (written to
	"<output prefix>.htseq.counts.txt"), then calculates FPKM and FPKM-UQ
	(written to "<output prefix>.FPKM-UQ.txt").

	If a required option is missing, the module documentation and the option
	help are printed and the program exits with status 101 (--bam), 102 (--gtf),
	103 (--info) or 104 (--output).
	'''

	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("--bam",action="store",type="string",dest="bam_file",help="Alignment file in BAM format. BAM file shoul be sorted and indexed. Ideally, the BAM file should generaet from the TCGA RNA-seq analysis workflow described here https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/.")
	parser.add_option("--gtf",action="store",type="string",dest="GTF_file",help="Gene model in GTF format.")
	# no default: a missing --info must be detected below rather than passed on as a bogus value
	parser.add_option("--info",action="store",type='string', dest="infor_file", help="Gene model information file.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
	parser.add_option("--log2",action="store_true",dest="log_scale",default=False, help="Convert FPKM and FPKM-UQ values into log2 (x+1) scale. A pseudo count 1 will be added to each gene/transcript.")
	(options,args)=parser.parse_args()

	# (option destination, exit status used when the option is missing)
	required = (
		('bam_file', 101),
		('GTF_file', 102),
		('infor_file', 103),
		('out_file', 104),
		)
	for dest, exit_code in required:
		if not getattr(options, dest):
			print (__doc__)
			parser.print_help()
			sys.exit(exit_code)

	count_file = options.out_file + '.htseq.counts.txt'
	printlog("Running htseq-count ...")
	run_HTseq(bam_file = options.bam_file, gtf_file = options.GTF_file, out_file = count_file)

	if options.log_scale:
		printlog("Calculate log2(FPKM + 1) and log2(FPKM-UQ + 1) ...")
	else:
		printlog("Calculate FPKM and FPKM-UQ ...")
	cal_fpkm(count_file = count_file, infor_file = options.infor_file, out_file = options.out_file + '.FPKM-UQ.txt', log2_flag = options.log_scale)

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__=='__main__':
	main()
