#! /usr/bin/env python
# -*- coding: utf-8 -*-
from Bio import SeqIO
import argparse
import os
from sklearn import metrics
from collections import OrderedDict
from argparse import RawTextHelpFormatter


def bin_output(b, l):
    """Return the bin file name of every contig, ordered by contig id.

    Every file in directory ``b`` whose name ends with ``l`` is parsed as
    FASTA, and each record id is mapped to the file it was read from.

    Args:
        b: Path of the directory holding the bin FASTA files.
        l: File-name suffix selecting the bins (e.g. ``.fa``).

    Returns:
        A list with one file name per distinct record id, ordered by sorted
        record id. The list is empty when no file matches the suffix.
    """
    # os.listdir gives no ordering guarantee; sorting keeps the result
    # reproducible when the same record id shows up in more than one file
    # (the last file in sorted order wins).
    file_names = sorted(
        str(filename) for filename in os.listdir(b) if filename.endswith(l)
    )
    if not file_names:
        # Warn only when nothing matched, instead of once per parsed file.
        print("No file's with this Suffix exist in this directory: ", b)

    # Label every contig with the file it actually came from. Zipping the
    # contig list against the (much shorter) file list would truncate the
    # mapping and pair contigs with unrelated files.
    contig_to_file = {}
    for filename in file_names:
        for record in SeqIO.parse(os.path.join(b, filename), "fasta"):
            contig_to_file[record.id] = filename

    return [contig_to_file[contig] for contig in sorted(contig_to_file)]


def ari(a, b):
    """Print clustering-agreement scores for two label sequences.

    Reports the adjusted Rand index followed by the homogeneity,
    completeness and V-measure tuple, framed by two separator lines.

    Args:
        a: First label sequence, passed as the first argument to the
            sklearn metric functions.
        b: Second label sequence, passed as the second argument.
    """
    separator = "_______________________________________________________________________________________"

    print(separator)
    rand_index = metrics.adjusted_rand_score(a, b)
    print("Adjusted-Rand-Index: ", rand_index)
    hcv_scores = metrics.homogeneity_completeness_v_measure(a, b)
    print("Homogeneity, completeness, V-measure: ", hcv_scores)
    print(separator)


if __name__ == '__main__':
    # Command-line entry point: read the putative/reference directories and
    # the bin suffix, then print the clustering scores for the two label sets.
    banner = """
    *****************************************************************************
    *********************************BinSanity***********************************
    **   The script `bin_evaluation` uses sklearn metrics                      **  
    **   (http://scikit-learn.org/stable/modules/classes.html) to calculate    **
    **   the adjusted rand index, homogeneity, completeness, and v-measure to  **
    **   evaluate clustering results compared to a of known clusters. See the  **
    **   BinSanity paper ( https://doi.org/10.7717/peerj.3035) for a full      **
    **   description of how these are used.                                    ** 
    **                                                                         **
    **   The `bin_evaluation` script can be used to compare the statistical    **
    **   accuracy of multiple clustering methods on a set of contigs with      **
    **   known identity. To use it you must have two directories. One          **
    **   containing genome with the expected cluster outcomes (identified with **
    **   `-r`), and the other containing genomes generated with clustering     **
    **   method you wish to evaluate (identified with `-b`).                   ** 
    *****************************************************************************"""
    parser = argparse.ArgumentParser(
        prog='bin_evaluation',
        usage='%(prog)s -b Putative Genomes -r reference genomes -l suffix of fasta files',
        description=banner,
        formatter_class=RawTextHelpFormatter,
    )

    # (flag, destination attribute, help text) for every supported option.
    option_specs = (
        ("-b", "inputPutative",
         "Specify the directory containing Putative genomes"),
        ("-r", "inputreference",
         "Specify directory containing reference genomes"),
        ("-l", "inputSuffix",
         "specify suffix of bins e.g .fa, .fna, .fasta, etc."),
    )
    for flag, dest_name, help_text in option_specs:
        parser.add_argument(flag, dest=dest_name, metavar="", help=help_text)

    args = parser.parse_args()

    # Checked in this order; only the first missing option is reported.
    missing_checks = (
        (args.inputPutative,
         "Need to specify directory containing putative genomes"),
        (args.inputreference,
         "Need to specify directory containing reference genomes"),
        (args.inputSuffix,
         "Need to specify Suffix linking putative genomes/reference genomes"),
    )
    complaint = next(
        (message for value, message in missing_checks if value is None),
        None,
    )

    if complaint is not None:
        print(complaint)
        parser.print_help()
    else:
        putative_labels = bin_output(args.inputPutative, args.inputSuffix)
        reference_labels = bin_output(args.inputreference, args.inputSuffix)
        # Reference labels go first, putative labels second.
        ari(reference_labels, putative_labels)
