#!/usr/bin/env python
import argparse
import logging
import re

import fetch_db
from gene_pred_ext import GenePredExt


class PTM(object):
    '''
    Compute the density of post-translational modification (PTM) sites
    inside a protein region of a transcript.
    '''

    def __init__(self, db):
        '''
        :db: object exposing ``query_ptm(transcript_id)``; backed by an sqlite
             database that contains PTM information generated by SpineD.
        '''
        self.db = db
        self.logger = logging.getLogger(__name__)

    def query_ptm_info(self, transcript_id, pstart, pend, ptm_info):
        '''
        Select the PTM records whose position falls inside a region.

        :transcript_id: not used for filtering; kept so the signature stays
                        unchanged for existing callers.
        :pstart: first position of the region (inclusive).
        :pend: last position of the region (inclusive).
        :ptm_info: iterable of 4-item records
                   (trans_id, uniprot_id, position, modification).
        :returns: list of the matching records, in their original order.
        '''
        query_info = []
        for record in ptm_info:
            # Unpacking doubles as a shape check: a record that does not have
            # exactly four fields raises instead of being silently accepted.
            _trans_id, _uniprot_id, position, _modification = record
            if pstart <= position <= pend:
                query_info.append(record)
        return query_info

    def cal_ptm(self, transcript_id, pstart, pend):
        '''
        Return the percentage of positions in [pstart, pend] that carry a PTM.

        :transcript_id: identifier passed to ``db.query_ptm``.
        :pstart: first position of the region (inclusive).
        :pend: last position of the region (inclusive).
        :returns: 'NA' when either coordinate is falsy, 0.0 when the
                  transcript has no PTM records or none inside the region,
                  otherwise ``hits / region_length * 100`` as a float.
        '''
        # Check the coordinates first so that no database query is issued when
        # the answer is already known to be 'NA'.
        # NOTE(review): this is a truthiness test, so a coordinate of 0 is
        # treated as missing -- confirm coordinates are 1-based.
        if not (pstart and pend):
            return 'NA'

        ptm_info = self.db.query_ptm(transcript_id)
        if not ptm_info:
            return 0.0

        query_result = self.query_ptm_info(transcript_id, pstart, pend, ptm_info)
        if not query_result:
            self.logger.debug('region does not contain PTM. %s %s-%s\n',
                              transcript_id, pstart, pend)
            return 0.0

        # The region is inclusive on both ends, hence the "+ 1".
        return float(len(query_result)) / (pend - pstart + 1) * 100


def main():
    '''
    Command-line entry point: append a PTM value to every row of a
    tab-separated bedtools "closest" output file and print the rows to stdout.

    For each row, fields 8 and 9 (0-based) are read as integer start/end
    coordinates and field 10 must contain ``<transcript_id>_exon``. When the
    transcript is present in the GenePred table, the coordinates are mapped
    to protein coordinates and passed to ``PTM.cal_ptm``; otherwise the
    appended value is 'NA'.
    '''
    parser = argparse.ArgumentParser(description='''
            Parse GenePred table (Extended) and extract features.''')
    parser.add_argument('ensembldb',
            help='ensembl sqlite db file, containing protein PTM info.')
    parser.add_argument('gfname',
            help='GenePred table (Extended) file name, from UCSC table browser.')
    parser.add_argument('bfname',
            help='bedtools closest region distances output')
    args = parser.parse_args()

    gene_pred = GenePredExt(args.gfname)

    db = fetch_db.DB(args.ensembldb)
    ptm = PTM(db)
    logger = logging.getLogger(__name__)
    # Compiled once; the same pattern is applied to every row.
    exon_name = re.compile(r'(\w+)_exon')
    with open(args.bfname) as f:
        for line in f:
            cols = line.rstrip().split('\t')
            estart = int(cols[8])
            eend = int(cols[9])
            # Reset per row. Without this, a row whose transcript is unknown
            # would either raise NameError (first row) or silently reuse the
            # value computed for the previous row.
            ptm_value = 'NA'
            match = exon_name.search(cols[10])
            if match is None:
                logger.warning('cannot extract transcript id from %r', cols[10])
            else:
                transcript_id = match.group(1)
                if transcript_id in gene_pred.transcripts:
                    pstart, pend = gene_pred.get_protein_coord(
                        transcript_id, estart, eend)
                    ptm_value = ptm.cal_ptm(transcript_id, pstart, pend)
            print('\t'.join(map(str, cols + [ptm_value])))

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
