#!/usr/bin/env python

import argparse
import logging
import os
import sys

from caspredict.controller import Controller
from caspredict.prodigal import Prodigal
from caspredict.hmmer import HMMER
from caspredict.castyping import Typer
from caspredict.minced import Minced
from caspredict.xgb import XGB
from caspredict.crisprcas import CRISPRCas

########## Arguments ##########
# Command-line interface of the script. Arguments are registered in the order
# they are meant to appear in the generated help text; the finished parser
# `ap` is parsed once when the workflow is initialised.
ap = argparse.ArgumentParser()

# Required positionals
ap.add_argument('input', help='Input fasta file')
ap.add_argument('output', help='Prefix for output directory')

# Optional: general run settings
ap.add_argument(
    '-t', '--threads',
    type=int,
    default=4,
    help='Number of parallel processes [%(default)s].',
)
ap.add_argument(
    '--prodigal',
    type=str,
    default='single',
    choices=['single', 'meta'],
    help='Which mode to run prodigal in [%(default)s].',
)
# Plain on/off switches; each one is False unless given on the command line.
for _flag, _help in (
    ('--aa', 'Input is a protein fasta. Has to be in prodigal format.'),
    ('--skip_check', 'Skip check of input.'),
    ('--keep_tmp', 'Keep temporary files (prodigal, hmmer, minced).'),
):
    ap.add_argument(_flag, action='store_true', help=_help)
ap.add_argument(
    '--log_lvl',
    type=str,
    default='INFO',
    choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
    help='Logging level [%(default)s].',
)
ap.add_argument(
    '--redo_typing',
    action='store_true',
    help='Redo the typing. Skip prodigal and HMMER and load the hmmer.tab from the output dir.',
)

# Data
apd = ap.add_argument_group('data arguments')
apd.add_argument('--db', type=str, default='', help='Path to database.')

# Thresholds: one row per option -> (flag, value type, default, help text).
# NOTE(review): '--single_cov_hmm' lacks the '_gene' infix of its siblings;
# it is kept as-is because the flag name fixes the parsed attribute name.
apt = ap.add_argument_group('cas threshold arguments')
for _flag, _kind, _default, _help in (
    ('--dist', int, 3, 'Max allowed distance between genes in operon [%(default)s].'),
    ('--overall_eval', float, 0.001, 'Overall E-value threshold [%(default)s].'),
    ('--overall_cov_seq', float, 0.5, 'Overall sequence coverage threshold [%(default)s].'),
    ('--overall_cov_hmm', float, 0.5, 'Overall HMM coverage threshold [%(default)s].'),
    ('--two_gene_eval', float, 1e-5, 'Two-gene operon E-value threshold [%(default)s].'),
    ('--two_gene_cov_seq', float, 0.8, 'Two-gene operon sequence coverage threshold [%(default)s].'),
    ('--two_gene_cov_hmm', float, 0.8, 'Two-gene operon HMM coverage threshold [%(default)s].'),
    ('--single_gene_eval', float, 1e-10, 'Lonely gene E-value threshold [%(default)s].'),
    ('--single_gene_cov_seq', float, 0.9, 'Lonely gene sequence coverage threshold [%(default)s].'),
    ('--single_cov_hmm', float, 0.9, 'Lonely gene HMM coverage threshold [%(default)s].'),
    ('--vf_eval', float, 1e-75, 'V-F Cas12 specific E-value threshold [%(default)s].'),
    ('--vf_cov_hmm', float, 0.97, 'V-F Cas12 specific HMM coverage threshold [%(default)s].'),
):
    apt.add_argument(_flag, type=_kind, default=_default, help=_help)

# CRISPRs: same row layout as the threshold table above.
apc = ap.add_argument_group('crispr threshold arguments')
for _flag, _kind, _default, _help in (
    ('--ccd', int, 10000, 'Distance (bp) threshold to connect Cas operons and CRISPR arrays [%(default)s].'),
    ('--pred_prob', float, 0.75, 'Prediction probability cut-off for assigning subtype to CRISPR repeats [%(default)s].'),
    ('--kmer', int, 4, 'kmer size. Has to match training kmer size! [%(default)s].'),
):
    apc.add_argument(_flag, type=_kind, default=_default, help=_help)

# Drop the table-loop temporaries so they do not linger at module level.
del _flag, _kind, _default, _help

# Workflow starts here
# Linear pipeline: each stage object is constructed from the stage before it,
# so the statements below must run in exactly this order.


########## Initialize ##########
# Parse the command line and hand the resulting namespace to the controller.
master = Controller(ap.parse_args())

########## Prodigal ##########
# When master.aa is set, the prodigal step is skipped and the controller
# itself is passed on to HMMER in place of a Prodigal object.
if master.aa:
    proteins = master
else:
    proteins = Prodigal(master)
    proteins.run_prod()

########## Hmmer ##########
hmmeri = HMMER(proteins)
hmmeri.main_hmm()    # run the search
hmmeri.check_hmm()   # check if any cas genes were found
hmmeri.parse_hmm()   # parse the results

########## Typing of operons ##########
castyper = Typer(hmmeri)
castyper.typing()       # assign types
castyper.check_type()   # check the typing result
castyper.write_type()   # output Cas operons

########## CRISPRs / CRISPR-Cas ##########
# Both remaining analysis stages are skipped when master.aa is set.
if not master.aa:
    # CRISPR arrays: run minced, then write arrays and spacers
    crispr = Minced(castyper)
    crispr.run_minced()
    crispr.write_crisprs()
    crispr.write_spacers()

    # Xgboost prediction on top of the minced results
    repeatPred = XGB(crispr)
    repeatPred.xgb_run()

    # Find CRISPR-Cas
    criscas = CRISPRCas(repeatPred)
    criscas.crisprcas()

######### Clean ###########
master.clean()
