#! /usr/bin/env python

import pandas as pd
import numpy as np
import os
import shutil
import argparse
from argparse import RawTextHelpFormatter


def checkm_analysis(file_, fasta):
    """Sort bin FASTA files into quality folders based on a CheckM table.

    The tab-separated table ``file_`` must provide the columns ``Bin Id``,
    ``Completeness``, ``Contamination`` and ``Strain heterogeneity``. Every
    bin is assigned to exactly one category, tested in this order:

    1. ``high_completion``: completeness >= 95 and contamination <= 10, or
       completeness >= 80 and contamination <= 5, or
       completeness >= 50 and contamination <= 2.
    2. ``low_completion``: completeness <= 50 and contamination <= 2.
    3. ``strain_redundancy``: completeness >= 50, contamination >= 10 and
       strain heterogeneity >= 90.
    4. ``high_redundancy``: everything else.

    The four folders are created in the current working directory if they
    are missing, and the file ``<Bin Id><fasta>`` (looked up relative to the
    current working directory) is moved into the folder of its category.

    Args:
        file_: Path to the tab-separated CheckM table.
        fasta: Suffix appended to each bin id to form the FASTA file name
            (for example ``".fna"``).

    Raises:
        KeyError: If one of the required columns is missing.
        OSError: If a bin file cannot be moved, e.g. because it does not
            exist (propagated from ``shutil.move``).
    """
    df = pd.read_csv(file_, sep="\t")
    completeness = df['Completeness']
    contamination = df['Contamination']
    heterogeneity = df['Strain heterogeneity']

    high = (
        ((completeness >= 95) & (contamination <= 10))
        | ((completeness >= 80) & (contamination <= 5))
        | ((completeness >= 50) & (contamination <= 2))
    )
    # The raw thresholds overlap (e.g. completeness == 50 satisfies both the
    # high and the low rule). Excluding the earlier categories keeps every
    # bin in a single folder; otherwise the same file would be moved twice
    # and the second move would fail because the file is already gone.
    low = ~high & (completeness <= 50) & (contamination <= 2)
    strain = (
        ~high & ~low
        & (completeness >= 50)
        & (contamination >= 10)
        & (heterogeneity >= 90)
    )
    redundant = ~(high | low | strain)

    categories = (
        ("high_completion", high),
        ("low_completion", low),
        ("strain_redundancy", strain),
        ("high_redundancy", redundant),
    )

    # Create every destination folder up front, before any file is moved.
    for folder, _ in categories:
        if not os.path.isdir(folder):
            os.makedirs(folder)

    # A bin id listed on several rows is moved only once, into the first
    # (highest-priority) category any of its rows matched.
    assigned = set()
    for folder, mask in categories:
        for name in df.loc[mask, 'Bin Id']:
            bin_name = str(name)
            if bin_name in assigned:
                continue
            assigned.add(bin_name)
            shutil.move(bin_name + fasta, folder)


if __name__ == "__main__":
    # Command-line entry point: parse the CheckM table path and the FASTA
    # suffix, then sort the bins found in the current working directory.
    parser = argparse.ArgumentParser(prog='checkm_analysis', usage='%(prog)s -checkM checkm_qa -f fasta suffix [.fa,.fasta,.fna]', description="""
    *****************************************************************************
    *********************************BinSanity***********************************
    **   The script `checkm_analysis` is a simple parser that extracts the     **
    **   completion, contamination, and strain heterogeneity values from the   **
    **   output of `checkm qa`. Then it splits the corresponding genomes into  **
    **   categories of high completion, low completion, and high redundancy    **
    **   prior to moving the bins into appropriate subfolders.                 **
    *****************************************************************************""", formatter_class=RawTextHelpFormatter)
    parser.add_argument("-checkM", dest="inputqa",
                        help="Specify the CHeckM output File (should be the one created using --tab_table")
    parser.add_argument("-f", dest="inputfa", type=str, default=".fna",
                        help="Identify what your suffix for fasta file is [default: .fna]")
    args = parser.parse_args()
    if args.inputqa is None:
        # print_help() writes the help text itself and returns None, so
        # wrapping it in print() would emit an extra "None" line.
        parser.print_help()
    else:
        checkm_analysis(args.inputqa, args.inputfa)
