#!/usr/bin/env python3

__author__      = "Joel Boyd"
__copyright__   = "Copyright 2018"
__credits__     = ["Joel Boyd"]
__license__     = "GPL3"
__version__     = "0.0.7"
__maintainer__  = "Joel Boyd"
__email__       = "joel.boyd near uq.net.au"
__status__      = "Development"

###############################################################################
# Imports
import argparse
import os
import sys
import textwrap
import time
sys.path = [os.path.join(os.path.dirname(os.path.realpath(__file__)),'..')]+sys.path
# Local
import enrichm
from enrichm.run import Run

###############################################################################

class CustomHelpFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the line breaks written in help strings.

    Help text is split only where the text itself contains line boundaries;
    no re-wrapping to the terminal width is performed.
    """

    def _split_lines(self, text, width):
        """Return ``text`` split at its own line boundaries.

        ``width`` is accepted for compatibility with the base-class hook but
        is not used.
        """
        del width  # intentionally unused: lines are never re-wrapped
        return text.splitlines(keepends=False)

def phelp():
    """Print the EnrichM banner, the sub-command overview and the version.

    The version shown is read from ``enrichm.__version__``.
    """
    # Raw string: the ASCII-art logo contains backslashes (``\/``, ``\|``,
    # ``\_``) that are invalid escape sequences in a normal string literal and
    # trigger a warning on recent Python versions. The printed text is
    # unchanged.
    print(r"""                         _____            _      _     __  __ 
                        | ____|_ __  _ __(_) ___| |__ |  \/  |
                        |  _| | '_ \| '__| |/ __| '_ \| |\/| |
                        | |___| | | | |  | | (__| | | | |  | |
                        |_____|_| |_|_|  |_|\___|_| |_|_|  |_|
  ------------------------------------------------------------------------------------

  Annotation
    annotate        -> Genome annotation pipeline.

  Enrichment analysis
    classify        -> Determine what pathways a genome encodes.
    enrichment      -> Calculate enrichment of functional genes between groups.

  Network analysis
    pathway         -> Generate a metabolic network from specific KEGG module or
                       compounds.
    explore         -> Take steps into metabolism using a KEGG compound ID as a 
                       starting point. Useful to see which pathways use a compound
                       of interest.

  Machine learning
    generate        -> Generate a random forest model.
    predict         -> Run random forest model on new data.

  Authors: Joel Boyd, Ben Woodcroft, Alex Baker
  Version: %s
""" % (enrichm.__version__))

if __name__ == "__main__":

    # Top-level parser. Every sub-command below is registered on `subparsers`;
    # the name of the chosen sub-command ends up in `args.subparser_name`.
    parser = argparse.ArgumentParser()
    parser.add_argument('--version', action='version', version='enrichm v%s' % enrichm.__version__)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # Parent parser holding the logging/output options inherited by every
    # sub-command (all of them list it in `parents`).
    base_all             = argparse.ArgumentParser(add_help=False)
    base_logging_options = base_all.add_argument_group('Logging options')

    base_logging_options.add_argument('--log',
                help='Output logging information to this file.')
    base_logging_options.add_argument('--verbosity', type = int, default = 4,
                help='Level of verbosity (1 - 5 - default = 4) 5 = Very verbose, 1 = Silent')

    base_output_options = base_all.add_argument_group('Output options')
    base_output_options.add_argument('--output',
                help='Output directory', default = None)
    base_output_options.add_argument('--force', action='store_true',
                help='Overwrite previous run')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # Parent parser holding the input options shared by the network
    # sub-commands (`pathway` and `explore`).
    base_network = argparse.ArgumentParser(add_help=False)

    base_input_options = base_network.add_argument_group('Input options')
    base_input_options.add_argument('--matrix', required=True,
                help='KO matrix. REQUIRED.')
    base_input_options.add_argument('--genome_metadata',
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    base_input_options.add_argument('--abundance',
                help='Abundance matrix.')
    base_input_options.add_argument('--abundance_metadata',
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    base_input_options.add_argument('--tpm_values',
                help='TPM values produced by DetectM.')
    base_input_options.add_argument('--tpm_metadata',
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    base_input_options.add_argument('--metabolome',
                help='Metabolome CID matrix.')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `data`: database management options.
    data = subparsers.add_parser('data', formatter_class=CustomHelpFormatter, parents=[base_all])
    data_input_options = data.add_argument_group('Uninstall')
    data_input_options.add_argument('--uninstall', action='store_true',
                help = 'Remove enrichm database')
    data_input_options.add_argument('--create',
                help = 'create enrichm database')
    data_input_options.add_argument('--dry', action='store_true',
                help = 'Download an empty database (debug)')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `annotate`: genome annotation pipeline options.
    annotate = subparsers.add_parser('annotate', formatter_class=CustomHelpFormatter, parents=[base_all])

    annotate_input_options = annotate.add_argument_group('Input options (Use one)')
    annotate_input_options.add_argument('--genome_files', default = None, nargs='+',
                help = 'Space separated list of genomes to annotate')
    annotate_input_options.add_argument('--genome_directory', default = None,
                help = 'Directory containing genomes to annotate')
    annotate_input_options.add_argument('--protein_files', default = None,  nargs='+',
                help = 'Space separated list of .faa files of genomes to annotate. Protein files need to be generated by prodigal.')
    annotate_input_options.add_argument('--protein_directory', default = None,
                help = 'Directory containing .faa files of genomes to annotate. Protein files need to be generated by prodigal.')

    annotate_annotation_options = annotate.add_argument_group('Annotations options')
    annotate_annotation_options.add_argument('--ko', action='store_true',
                help='Annotate with KO ids')
    # Help text disambiguated from --ko; the enrichment sub-command describes
    # the same flag as "KO ids (annotated with HMMs)".
    annotate_annotation_options.add_argument('--ko_hmm', action='store_true',
                help='Annotate with KO ids (using HMMs)')
    annotate_annotation_options.add_argument('--pfam', action='store_true',
                help='Annotate with Pfam HMMs')
    annotate_annotation_options.add_argument('--tigrfam', action='store_true',
                help='Annotate with TIGRFAM HMMs')
    annotate_annotation_options.add_argument('--clusters', action='store_true',
                help='Annotate with cluster ids')
    annotate_annotation_options.add_argument('--orthologs', action='store_true',
                help='Annotate with ortholog ids')
    annotate_annotation_options.add_argument('--orthogroup', action='store_true',
                help='Annotate with orthogroup ids with OrthoFinder')
    annotate_annotation_options.add_argument('--cazy', action='store_true',
                help='Annotate with dbCAN HMMs')
    annotate_annotation_options.add_argument('--ec', action='store_true',
                help='Annotate with EC ids')

    annotate_cutoff_options = annotate.add_argument_group('Cutoff options')
    annotate_cutoff_options.add_argument('--cut_ga_pfam', action='store_true',
                help='For PFAM searches: use profiles GA gathering cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_nc_pfam', action='store_true',
                help='For PFAM searches: use profiles NC noise cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_tc_pfam', action='store_true',
                help='For PFAM searches: use profiles TC trusted cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_ga_tigrfam', action='store_true',
                help='For TIGRfam searches: use profiles GA gathering cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_nc_tigrfam', action='store_true',
                help='For TIGRfam searches: use profiles NC noise cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_tc_tigrfam', action='store_true',
                help='For TIGRfam searches: use profiles TC trusted cutoffs to set all thresholding')
    annotate_cutoff_options.add_argument('--cut_ko', action='store_true',
                help='For KO HMM annotation searches: use cutoffs defined by KEGG to maximise F-score.')
    annotate_cutoff_options.add_argument('--evalue', type=float, default=1e-05,
                help='Use this evalue cutoff to filter false positives (default: 1e-05)')
    annotate_cutoff_options.add_argument('--bit', type = float, default = 0,
                help='Use this bit score cutoff to filter false positives (default: 0)')
    annotate_cutoff_options.add_argument('--id', type = float, default = 0.3,
                help='Use this percent identity cutoff to filter false positives (default: 0.3)')
    # The three help strings below previously advertised defaults (0.7, 0.7,
    # 0.8) that did not match the values actually applied; the help now
    # reports the effective defaults. The defaults themselves are unchanged.
    annotate_cutoff_options.add_argument('--aln_query', type = float, default = 0.8,
                help='This fraction of the query must align to the reference (default: 0.8)')
    annotate_cutoff_options.add_argument('--aln_reference', type = float, default = 0.8,
                help='This fraction of the reference must align to the query (default: 0.8)')
    annotate_cutoff_options.add_argument('--c', type = float, default = 0.5,
                help='When clustering, use matches above this fraction of aligned (covered) query and target residues (default: 0.5)')

    annotate_runtime_options = annotate.add_argument_group('Runtime options')
    # NOTE(review): --threads and --parallel have no `type`, so they are
    # delivered as strings; confirm downstream code expects that.
    annotate_runtime_options.add_argument('--threads', default = '1',
                help='Use this number of threads when annotating with BLAST and HMMsearch (default: 1)')
    annotate_runtime_options.add_argument('--parallel', default = '5',
                help='Run this number of jobs in parallel when annotating with HMMsearch (default: 5)')
    annotate_runtime_options.add_argument('--inflation', type = float, default = 1.5,
                help='Inflation factor to use when calling clusters in ortholog (default = 1.5)')
    annotate_runtime_options.add_argument('--suffix',
                help='Treat files ending with this suffix within the --genome_directory as genomes (default: .fna for --genome_directory and .faa for )')
    annotate_runtime_options.add_argument('--light', action='store_true',
                help="Don't store metadata for genome files (can't use enrichM compare downstream, default=False)")
    annotate_runtime_options.add_argument('--count_domains', action='store_true',
                help="Fill the frequency matrix with the total number of times an annotation was detected (for example, when one domain more than once within a protein), rather than the count of proteins with with that annotation")
    annotate_runtime_options.add_argument('--chunk_number', type = float, default = 4,
                help='Split loading of genomes into this many chunks (default = 4)')
    annotate_runtime_options.add_argument('--chunk_max', type = float, default = 2500,
                help='Maximum number of genomes to load per chunk (default = 2500)')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `classify`: pathway/module classification options.
    classify = subparsers.add_parser('classify', formatter_class=CustomHelpFormatter, parents=[base_all])

    classify_input_options = classify.add_argument_group('Input options')

    classify_input_options.add_argument('--genome_and_annotation_matrix',
                help='Path to file containing a genome annotation matrix')
    classify_input_options.add_argument('--custom_modules',
                help='Tab separated file containing module name, definition as the columns')
    classify_input_options.add_argument('--module_rules_json',
                help='json file specifying rules to interpret the annotation and guide module annotation')
    classify_input_options.add_argument('--gff_files',
                help='GFF files for the genomes being classified.')

    classify_cutoff_options = classify.add_argument_group('Cutoff options')
    classify_cutoff_options.add_argument('--cutoff', type=float, default=0.0,
                help='Output only modules with greater than this percent of the requied KO groups (default = print all modules)')

    classify_runtime_options = classify.add_argument_group('Runtime options')
    classify_runtime_options.add_argument('--aggregate', action='store_true',
                                         help='Calculate the abundance of each pathway within each genome/sample (column)')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `enrichment`: group-vs-group enrichment options.
    enrichment = subparsers.add_parser('enrichment', formatter_class=CustomHelpFormatter, parents=[base_all])

    enrichment_input_options = enrichment.add_argument_group('Input options')
    enrichment_input_options.add_argument('--annotate_output',
                help='Output directory provided by enrichm annotate')
    enrichment_input_options.add_argument('--metadata',
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    enrichment_input_options.add_argument('--annotation_matrix',
                help='Annotation matrix to compare.')
    enrichment_input_options.add_argument('--gff_files', nargs='+',
                help='Gff files for genomes to compare.')
    enrichment_input_options.add_argument('--abundance',
                help='Genome abundance matrix.')
    enrichment_input_options.add_argument('--abundance_metadata',
                help='Metadata grouping abundance samples.')
    enrichment_input_options.add_argument('--transcriptome',
                help='Genome abundance matrix.')
    enrichment_input_options.add_argument('--transcriptome_metadata',
                help='Metadata grouping abundance samples.')

    enrichment_gtdb_options = enrichment.add_argument_group('Genome Taxonomy DataBase (GTDB) options')
    enrichment_gtdb_options.add_argument('--batchfile',
                help='metadata file to compare with.')

    enrichment_runtime_options = enrichment.add_argument_group('Runtime options')
    enrichment_runtime_options.add_argument('--pval_cutoff', type=float, default=0.05,
                help='Only output results with a p-value below a this cutoff (default=0.05).')
    enrichment_runtime_options.add_argument('--proportions_cutoff', type=float, default=1,
                help='Proportion enrichment cutoff.')
    enrichment_runtime_options.add_argument('--threshold', type=float, default=0.1,
                help='The threshold to control for in false discovery rate of familywise error rate.')
    # This help string relies on CustomHelpFormatter keeping its embedded
    # newlines so the option list renders one entry per line.
    enrichment_runtime_options.add_argument('--multi_test_correction',
                help='The form of mutiple test correction to use. Uses the statsmodel module and consequently has all of its options.\nDefault: Benjamini-Hochberg FDR (fdr_bh) \nOptions: Bonferroni (b) \n\t Sidak (s) \n\t Holm (h) \n\t Holm-Sidak (hs) \n\t Simes-Hochberg (sh) \n\t Hommel (ho) \n\t FDR Benjamini-Yekutieli (fdr_by) \n\t FDR 2-stage Benjamini-Hochberg (fdr_tsbh) \n\t FDR 2-stage Benjamini-Krieger-Yekutieli (fdr_tsbky) \n\t FDR adaptive Gavrilov-Benjamini-Sarkar (fdr_gbs))', default='fdr_bh')
    enrichment_runtime_options.add_argument('--processes', type = int,
                help='Number of processes to use for enrichment.', default=1)
    enrichment_runtime_options.add_argument('--allow_negative_values',
                help='Allow negative values in input matrix.', action='store_true')
    enrichment_runtime_options.add_argument('--ko', action='store_true',
                help='Compare KO ids (annotated with DIAMOND)')
    enrichment_runtime_options.add_argument('--ko_hmm', action='store_true',
                help='Compare KO ids (annotated with HMMs)')
    enrichment_runtime_options.add_argument('--pfam', action='store_true',
                help='Compare Pfam ids')
    enrichment_runtime_options.add_argument('--tigrfam', action='store_true',
                help='Compare TIGRFAM ids')
    enrichment_runtime_options.add_argument('--cluster', action='store_true',
                help='Compare cluster ids')
    enrichment_runtime_options.add_argument('--ortholog', action='store_true',
                help='Compare ortholog ids')
    enrichment_runtime_options.add_argument('--cazy', action='store_true',
                help='Compare dbCAN ids')
    enrichment_runtime_options.add_argument('--ec', action='store_true',
                help='Compare EC ids')
    enrichment_runtime_options.add_argument('--range', default=2500, type = int,
                help='Base pair range to search for synteny within. Default = 2500.')
    enrichment_runtime_options.add_argument('--subblock_size', default=2, type = int,
                help='Number of genes clustered in a row to be reported. Default = 2.')
    enrichment_runtime_options.add_argument('--operon_mismatch_cutoff', default=2, type = int,
                help='Number of allowed mismatches when searching for operons across genomes. Default = 2.')
    enrichment_runtime_options.add_argument('--operon_match_score_cutoff', default=0.5, type = float,
                help='Score cutoff for operon matches')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `uses`: registered here but not listed in the phelp() overview.
    uses = subparsers.add_parser('uses', formatter_class=CustomHelpFormatter, parents=[base_all])

    uses_input_options = uses.add_argument_group('Input options')
    uses_input_options.add_argument('--annotation_matrix', required=True,
                help='Input annotate output')
    uses_input_options.add_argument('--metadata', required=True,
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    # NOTE(review): this help text is identical to --metadata's; confirm the
    # intended description of the compounds list.
    uses_input_options.add_argument('--compounds_list', required=True,
                help='Metadata file with two columns, the first with the genome name, the second with the groupings to compare.')
    # A boolean switch must be optional: combining required=True with
    # store_true forced the flag onto every command line, so it could never be
    # False. Command lines that already pass --count behave as before.
    uses_input_options.add_argument('--count', action='store_true',
                help='Count rather than tally.')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `pathway`: network generation options (plus the shared network inputs).
    pathway = subparsers.add_parser('pathway', formatter_class=CustomHelpFormatter, parents=[base_network, base_all])

    pathway_pathway_options = pathway.add_argument_group('Pathway options')
    pathway_pathway_options.add_argument('--limit', default=list(), nargs='+',
                help='USE ONLY these reactions, or reactions within this pathway or module (space separated list).')
    pathway_pathway_options.add_argument('--filter', default=list(), nargs='+',
                help='Do not use these reactions, or reactions within this pathway or module (space separated list).')
    pathway_pathway_options.add_argument('--enrichment_output',
                help='Supply an enrichment output to integrate the results into the output network.')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `explore`: network exploration options (plus the shared network inputs).
    explore = subparsers.add_parser('explore', formatter_class=CustomHelpFormatter, parents=[base_network, base_all])

    explore_query_options = explore.add_argument_group('Query options')
    explore_query_options.add_argument('--queries', required=True,
                help='A file containing the KEGG ids of the compounds from which to start in the metabolic network')
    explore_query_options.add_argument('--depth', type=int, default=2,
                help='Number of steps to take into the metabolic network')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `generate`: model training options.
    generate = subparsers.add_parser('generate', formatter_class=CustomHelpFormatter, parents=[base_all])

    generate_input_options = generate.add_argument_group('Generate options')
    generate_input_options.add_argument('--input_matrix', required = True, help = 'input matrix of results')
    generate_input_options.add_argument('--groups', required = True, help = 'defined outcomes to train the data to')
    generate_input_options.add_argument('--model_type', required = True, help = 'regressor or classifier', choices=["regressor","classifier"])
    generate_input_options.add_argument('--testing_portion', type = float, help = 'portion of the input data to use for testing (default = 0.2)', default = 0.2)

    generate_tuning_options = generate.add_argument_group('Tuning options')
    generate_tuning_options.add_argument('--grid_search', action='store_true', help = 'grid search')
    generate_tuning_options.add_argument('--threads', type=int, default = -1, help = 'number of threads to use for hyperparameterization (default = all available)')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # `predict`: model application options.
    predict = subparsers.add_parser('predict', formatter_class=CustomHelpFormatter, parents=[base_all])

    predict_input_options = predict.add_argument_group('Predict options')
    predict_input_options.add_argument('--forester_model_directory', required = True, help = 'Pickled model to use')
    predict_input_options.add_argument('--input_matrix', required = True, help = 'matrix of data to predict')

    #~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

    # No arguments, or a top-level help request: show the custom overview
    # instead of argparse's generated top-level help.
    if len(sys.argv) == 1 or sys.argv[1] in ('-h', '--help'):
        phelp()

    else:
        args = parser.parse_args()

        if args.subparser_name is None:
            # Sub-commands are optional for argparse, so input such as a bare
            # "--" parses without selecting one. Show the overview rather
            # than failing on the lookups below.
            phelp()
            sys.exit(1)

        if len(sys.argv) == 2:
            # A sub-command given with no options: show that sub-command's
            # help. print_help() writes the text itself and returns None, so
            # its result must not be passed to print().
            subparsers.choices[args.subparser_name].print_help()
            sys.exit()

        if not args.log:
            # Default log file is named after the sub-command and is a
            # relative path.
            args.log = args.subparser_name + '.log'

        run = Run()
        run.run_enrichm(args, sys.argv)

