#!/usr/bin/python
import md5
import os
import Queue as queue
import subprocess
import sys
import threading

import biotools.IO               as io
import biotools.sequence         as sequ
import biotools.translate        as tran
import biotools.analysis.options as options

def run(direc, inputs):
	'''run(direc, inputs)
Takes a collection of files generated by gene prediction, creates clusters based off of the genes that have homology to those predicted genes, and creates new fasta files in the nt and aa sub directories of the given directory, separated according to whether they are nucleotide or amino acid sequences. These new fasta files are then used to create clustalw alignments of the genes if more than 1 sequence exists in the fasta file.

direc is used as a plain string prefix for every output path, so it must already end with the path separator. inputs is an iterable of paths to fastc files; the id of each sequence is taken from the second '|'-separated field of its name. The work is shared between options.NUM_PROCESSES-1 helper threads and the calling thread, and run() returns once every queued set of ids has been marked as done.'''

	sep = os.sep
	clusters = {}
	all_ids = set()
	ids = {}

	# Best effort: the directories normally exist already on a re-run. Only
	# OSError is ignored so that unrelated failures are not hidden.
	for path in (direc, direc+'nt'+sep, direc+'aa'+sep):
		try: os.mkdir(path)
		except OSError: pass

	# clusters[ipt] is a list of (ids in the cluster, sequence) pairs and
	# ids[ipt] is the union of all ids seen in that input file.
	for ipt in inputs:
		clusters[ipt] = []
		ids[ipt]      = set()
		for cluster in io.open(ipt, 'r').format('fastc'):
			cids        = set(c.name.split('|')[1] for c in cluster)
			# cluster.pop() supplies the single sequence stored for the cluster
			clusters[ipt].append((cids, cluster.pop().seq))
			ids[ipt]   |= cids
			all_ids    |= cids

	# Partition all_ids into disjoint sets of ids that belong together.
	sub_ids = []
	while all_ids:
		cid = all_ids.pop()

		# Every id that shares a cluster with cid in any input file. (This used
		# to collect cid itself instead of its neighbours, which reduced every
		# subcluster to a single id.)
		related = set()
		for ipt in clusters:
			for cluster in clusters[ipt]:
				if cid in cluster[0]:
					related |= cluster[0]
		# Only ids that are still unassigned; cid was just popped, so add it back.
		subcluster = related & (all_ids | set([cid]))

		# For each cluster containing cid, drop the ids that this input file
		# knows about but did not place in that cluster.
		for ipt in clusters:
			for cluster in clusters[ipt]:
				if cid in cluster[0]:
					subcluster = (subcluster & cluster[0]) | (subcluster - ids[ipt])
		sub_ids.append(subcluster)
		all_ids -= subcluster

	q = queue.Queue()
	for cid in sub_ids:
		q.put(cid)

	# The calling thread is one of the workers, hence NUM_PROCESSES-1 helpers.
	for i in xrange(options.NUM_PROCESSES-1):
		threading.Thread(target=_run_clustal,args=(q,clusters,direc)).start()
	_run_clustal(q,clusters,direc)
	q.join()

def _align_ids(cid, clusters, direc, cmd, ignore):
	'''_align_ids(cid, clusters, direc, cmd, ignore)
Writes every cluster in clusters that shares an id with the set cid to <direc>nt/<digest>.fasta and its translation to <direc>aa/<digest>.fasta, where <digest> is the md5 hex digest of the space-joined ids. If more than one sequence was written, cmd (the clustalw executable) is run on both files with its standard output sent to ignore, and the .dnd files next to the inputs are removed; otherwise the two fasta files are removed again.'''

	sep = os.sep
	dig = md5.new(' '.join(cid)).hexdigest()
	fpre = direc + 'nt' + sep + dig
	apre = direc + 'aa' + sep + dig
	fname = fpre + ".fasta"
	aname = apre + ".fasta"

	n = 0
	fh = io.open(fname, 'w')
	try:
		ah = io.open(aname, 'w')
		try:
			for ipt in clusters:
				# file name without directory or extension, whitespace -> '_'
				name = '_'.join(ipt.split('.')[0].split(sep)[-1].split())
				for cluster in clusters[ipt]:
					if cid & cluster[0]:
						curr = sequ.Sequence(name, cluster[1], defline=', '.join(cid))
						fh.write(curr)
						ah.write(tran.translate(curr))
						n += 1
		finally:
			ah.close()
	finally:
		fh.close()

	if n > 1:
		subprocess.call([
			cmd,"-INFILE="+fname,"-ALIGN","-TYPE=DNA",
			"-OUTFILE="+fpre+".clustalw","-OUTORDER=ALIGNED",
			"-DNAMATRIX=IUB","-GAPOPEN=10","-GAPEXT=0.1"],stdout=ignore)
		subprocess.call([
			cmd,"-INFILE="+aname,"-ALIGN","-TYPE=PROTEIN",
			"-OUTFILE="+apre+".clustalw","-OUTORDER=ALIGNED",
			"-MATRIX=BLOSUM","-GAPOPEN=10","-GAPEXT=0.1"],stdout=ignore)
		leftovers = (fpre + '.dnd', apre + '.dnd')
	else:
		# a single sequence cannot be aligned, so the fasta files are dropped
		leftovers = (fname, aname)

	# Best effort: a file that is already missing (e.g. clustalw failed before
	# writing it) must not take the whole worker down.
	for path in leftovers:
		try: os.remove(path)
		except OSError: pass

def _run_clustal(q, clusters, direc):
	'''_run_clustal(q, clusters, direc)
Worker loop used by run(), both in the calling thread and in the helper threads. Takes sets of ids off the queue q until it is empty and hands each one to _align_ids() together with clusters (input file name -> list of (ids, sequence) pairs) and direc (output path prefix, ending with the path separator). Every item taken from q is marked as done, even if processing it raises; the exception is then propagated to the caller.'''

	cmd = "clustalw"
	if sys.platform in ('cygwin', 'win32'): cmd += '.exe'

	# One sink for the output of clustalw per worker, closed when the worker ends.
	ignore = open(os.devnull, 'w')
	try:
		while True:
			# Not "while not q.empty(): q.get()": another worker may take the
			# last item between the two calls and get() would block forever.
			try: cid = q.get_nowait()
			except queue.Empty: break

			# task_done() has to run on failure too or q.join() never returns.
			try:
				_align_ids(cid, clusters, direc, cmd, ignore)
			finally:
				q.task_done()
	finally:
		ignore.close()
