#! /usr/bin/env python
from argparse import RawTextHelpFormatter
from Bio import SeqIO
import numpy as np
import sys
import os
import argparse


def concat_alignments(directory, extension, prefix, output, num):
    """Concatenate FASTA alignments into one supermatrix-style FASTA file.

    Every file in *directory* whose name ends with *extension* and contains
    *prefix* is read as a FASTA alignment.  For each organism (sequence id)
    that occurs in at least *num* alignments, the sequences from all
    alignments are joined in sorted file-name order; an alignment that lacks
    the organism contributes a run of ``-`` as long as the first sequence of
    that alignment.  All sequences within one alignment are assumed to have
    equal length; this is not verified.

    Args:
        directory: Directory that holds the alignment files.
        extension: File-name suffix that selects alignment files.
        prefix: Text that must occur in the file name (passed through
            ``str``).
        output: Path of the FASTA file to write.  It is opened in append
            mode, so existing content is kept.
        num: Minimum number of alignments an organism must occur in to be
            written.  Anything ``int`` accepts; ``None`` is treated as 1,
            i.e. every organism is written.

    Raises:
        ValueError: If an alignment contains the same sequence id twice, if
            an alignment contains no sequences, or if *num* cannot be
            converted to an integer.
    """
    # Sorted so the marker order inside the concatenation is reproducible
    # instead of depending on the order os.listdir happens to return.
    file_names = sorted(
        name for name in os.listdir(directory)
        if name.endswith(extension) and str(prefix) in name
    )
    # Convert up front so a bad threshold fails before any file is read.
    min_markers = 1 if num is None else int(num)

    alignments = []     # one (id -> sequence string, gap filler) per file
    marker_number = {}  # id -> number of alignments containing that id
    for file_name in file_names:
        # os.listdir returns bare names; join them with the directory so the
        # function also works when it is not the current working directory.
        path = os.path.join(directory, file_name)
        sequences = {}
        for record in SeqIO.parse(path, "fasta"):
            if record.id in sequences:
                raise ValueError(
                    "Duplicate sequence id '%s' in %s" % (record.id, path))
            sequences[record.id] = str(record.seq)
            marker_number[record.id] = marker_number.get(record.id, 0) + 1
        if not sequences:
            raise ValueError("No sequences found in alignment %s" % path)
        # The alignment width is taken from the first record of the file.
        width = len(next(iter(sequences.values())))
        alignments.append((sequences, "-" * width))

    with open(output, "a") as out_handle:
        for org in sorted(marker_number):
            if marker_number[org] < min_markers:
                continue
            row = "".join(
                sequences.get(org, gap) for sequences, gap in alignments)
            out_handle.write(">%s\n%s\n" % (org, row))


if __name__ == '__main__':
    # Command-line entry point: parse the options, refuse to touch an
    # existing output file, then run the concatenation.
    parser = argparse.ArgumentParser(prog='concat', usage='%(prog)s -f directory -e Alignment Extension --Prefix file linker -o output', description="""
    *****************************************************************************
    *********************************BinSanity***********************************
    **     The `concat` script is used to concatenate multiple sequence        **
    **     alignments for conducting a phylogenomic analysis. Note that you    **
    **     receive an error if there are any duplicate sequence ids in an      **
    **     alignment. 
    *****************************************************************************""", formatter_class=RawTextHelpFormatter)
    # The four options below are mandatory; marking them required makes
    # argparse report a missing one instead of a later TypeError traceback.
    parser.add_argument("-f", metavar="", dest="inputDir", required=True,
                        help="Specify directory where alignments are")
    parser.add_argument("-e", metavar="", dest="inputExtension", required=True,
                        help="Specify the extension for your alignments (must be in Fasta format)")
    parser.add_argument("--Prefix", metavar="", dest="inputPrefix", required=True,
                        help="Specify the prefix that links your alignments (ex: if you have two alignments TOBG_RpL10, TOBG_RpL24, the --Prefix would be TOBG")
    parser.add_argument("-o", dest="inputOutFile", required=True,
                        metavar="", help="Specify output file")
    # Defaults to 1 (every organism is kept) so that omitting -N no longer
    # crashes in int(None); type=int rejects non-numeric values at parse time.
    parser.add_argument("-N", dest="number", metavar="", type=int, default=1,
                        help="Specify the minimum number of sequences needed to be included in concatenation")
    # Show the help before parsing: with required options, parse_args() would
    # otherwise exit with an error when the script is run without arguments.
    if len(sys.argv) < 2:
        # print_help() writes the text itself and returns None, so it must
        # not be wrapped in print().
        parser.print_help()
        sys.exit(0)
    args = parser.parse_args()
    if os.path.isfile(args.inputOutFile):
        print("Your Output File Already Exists and We don't want to overwrite it")
    else:
        concat_alignments(args.inputDir, args.inputExtension,
                          args.inputPrefix, args.inputOutFile, args.number)
