/* 
 * Validate a fasta file
 *
 * Rob Edwards January 2019.
 *
 * We exit with:
 *	0: this is a valid fasta file
 *	1: the first line does not start with a >
 *	2: the ids are not unique
 *	4: lines in the sequence (that do not start with >) contain characters other than the letters A-Z and a-z, i.e. they do not match the perl regexp /^[A-Za-z]*$/
 *
 *	Over 200:
 *	internal errors, eg. unable to allocate memory, etc.
 */
#include <Python.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <search.h>
#include "zlib.h"

#include "fasta_validate.h"


/*
 * Check whether a sequence line contains anything other than ASCII letters.
 *
 * Allowed bytes are A-Z (65-90), a-z (97-122) and the line terminators
 * LF (10) and CR (13).
 *
 * seq:     NUL-terminated line to inspect (it is not modified)
 * verbose: currently unused
 *
 * Returns:
 *	0: every byte is allowed
 *	1: found a byte below 'A' that is not CR/LF (digit, space, punctuation, ...)
 *	2: found a byte between 'Z' and 'a' (one of [ \ ] ^ _ `)
 *	3: found a byte above 'z' (this includes every non-ASCII byte >= 128)
 *	8: seq is NULL
 */
int contains_non_word_characters(char *seq, int verbose) {
	(void) verbose;

	if (seq == NULL) {
		fprintf(stderr, "Empty line receieved. Empty string?\n");
		return 8;
	}

	/*
	 * Walk the string once, up to the terminating NUL, instead of calling
	 * strlen() on every iteration. The bytes are read as unsigned char so
	 * that values >= 128 compare as "above z" whether or not plain char is
	 * signed on this platform.
	 */
	for (const unsigned char *p = (const unsigned char *) seq; *p != '\0'; p++) {
		const unsigned char c = *p;

		if (c < 'A') {
			if (c != '\n' && c != '\r')
				return 1;
		} else if (c > 'Z' && c < 'a') {
			return 2;
		} else if (c > 'z') {
			return 3;
		}
	}
	return 0;
}

/*
 * Validate the fasta file `filename`, which is read through zlib's gz* API.
 *
 * filename: path of the file to validate
 * verbose:  when non-zero, explain validation failures on stderr
 *
 * Returns:
 *	 0: this is a valid fasta file
 *	 1: the first line does not start with a >, or the file could not be
 *	    opened (this path used to call exit(1); the value is kept so that
 *	    the command line exit status is unchanged)
 *	 2: the ids are not unique
 *	 4: a sequence line contains something other than letters
 *	-1: internal error (hash table could not be created or is full, or
 *	    memory could not be allocated)
 *
 * The id of a record is the header line, including the leading >, up to the
 * first space or line terminator.
 */
int run(char *filename, int verbose) {
	// create a hash with NUMSEQS possible entries
	if (hcreate(NUMSEQS) == 0) {
		fprintf(stderr, "Unable to create the hash table\n");
		return -1;
	}

	gzFile fp = (filename == NULL) ? NULL : gzopen(filename, "r");
	if (fp == NULL) {
		if (verbose)
			fprintf(stderr, "Can't open file %s\n", filename ? filename : "(null)");
		// return rather than exit() so that a caller embedding us (e.g. the
		// python wrapper) is not terminated, and release the hash table
		hdestroy();
		return 1;
	}

	char line[MAXLINELEN];
	int status = 0;
	int firstline = 1;
	int continuation = 0; // the previous chunk filled the buffer without reaching a newline
	int in_header = 0;    // the line currently being read is a header line

	// NOTE(review): gzgets also returns NULL on a read/decompression error,
	// which is not distinguished from end of file here - consider gzerror().
	while (gzgets(fp, line, MAXLINELEN) != NULL) {
		const size_t len = strlen(line);
		const int starts_line = !continuation;

		// gzgets stops after MAXLINELEN - 1 characters, so a full buffer with
		// no trailing newline means the rest of this line arrives next time
		continuation = (len + 1 == (size_t) MAXLINELEN) && (line[len - 1] != '\n');

		if (starts_line && line[0] == '>') {
			in_header = 1;
			firstline = 0;

			// keep only the id: cut at the first space, and also at the line
			// terminator so that ">id", ">id description" and a last line
			// without a newline all give the same key
			line[strcspn(line, " \r\n")] = '\0';

			// in case you need this!
			// fprintf(stderr, "Parsing %s\n", line);

			// check to see if we have seen this id; the lookup does not keep
			// the key, so no copy is needed yet
			ENTRY item = { .key = line, .data = NULL };
			if (hsearch(item, FIND) != NULL) {
				if (verbose)
					fprintf(stderr, "ERROR: Found a duplicate id: %s\n", line);
				status = 2;
				break;
			}

			// the table stores the key pointer, so it needs its own copy.
			// The copies are deliberately not freed here: whether hdestroy()
			// releases the keys differs between C libraries.
			item.key = strdup(line);
			if (item.key == NULL) {
				fprintf(stderr, "Unable to allocate memory for the id %s\n", line);
				status = -1;
				break;
			}
			if (hsearch(item, ENTER) == NULL) {
				fprintf(stderr, "Unable to add the id %s to the hash table\n", line);
				free(item.key);
				status = -1;
				break;
			}
		} else if (starts_line || !in_header) {
			// a sequence line, or a further chunk of an over-long sequence line
			in_header = 0;
			if (firstline > 0) {
				if (verbose)
					fprintf(stderr, "ERROR: The first line should start with a >\n");
				status = 1; // the first line should start with a >
				break;
			}
			if (contains_non_word_characters(line, verbose) > 0) {
				if (verbose)
					fprintf(stderr, "ERROR: We have a non word character!\n");
				status = 4;
				break;
			}
		}
		// otherwise this is the tail of an over-long header line: it is part
		// of the description, not sequence, so there is nothing to check
	}

	gzclose(fp);
	hdestroy();
	return status;
}

/*
 * Command line entry point: fasta_validate [-v] <fasta file>
 *
 * -v must be the first argument and turns on the explanations printed by
 * run(). The exit status is the value returned by run(); when no file name
 * is given the usage line is printed and the exit status is 1.
 */
int main(int argc, char *argv[]) {
	if (argc < 2) {
		printf("%s [-v] [fasta file]\n", argv[0]);
		exit(1);
	}

	char *filename = argv[1];
	int verbose = 0;

	if (strcmp(filename, "-v") == 0) {
		// "-v" on its own leaves no file name: argv[2] would be NULL
		if (argc < 3) {
			printf("%s [-v] [fasta file]\n", argv[0]);
			exit(1);
		}
		verbose = 1;
		filename = argv[2];
	}
	return run(filename, verbose);
}




/*
 * Python-callable wrapper around run().
 *
 * args must hold a single string, the path of the fasta file. The file is
 * validated with verbose switched off and the integer returned by run() is
 * handed back as a Python int. If the arguments cannot be parsed a
 * RuntimeError is set and NULL is returned.
 */
PyObject * python_input(PyObject *self, PyObject *args) {
    char *filename;

    if (PyArg_ParseTuple(args, "s", &filename)) {
        // the python interface always runs quietly
        const int verbose = 0;
        const int result = run(filename, verbose);
        return PyLong_FromLong(result);
    }

    PyErr_SetString(PyExc_RuntimeError, "Could not parse the arguments to python_input");
    return NULL;
}

PyMODINIT_FUNC PyInit_FastaValidator(void) {
    return PyModule_Create(&FastaValidatorModule);
}
