#!/usr/bin/env python
import argparse
from pyphe.interpret import interpret 
import pandas as pd

def build_parser():
    """Build the argparse parser for the pyphe-interpret command line.

    Kept separate from ``main`` so the option set (names, defaults, types)
    can be inspected without parsing ``sys.argv`` or touching any files.

    Returns:
        argparse.ArgumentParser: parser with the required options ``--ld``,
        ``--grouping_column``, ``--axis_column`` and ``--control`` and the
        optional ``--out``, ``--values_column``, ``--ld_encoding``,
        ``--circularity`` and ``--set_missing_na``.
    """
    # NOTE(review): the final FDR sentence describes case (1) as corrected
    # "across each strain" although case (1) groups by condition; this looks
    # swapped relative to "each level set of the grouping_column". Confirm
    # against pyphe.interpret before rewording - left unchanged here.
    description = (
        'Welcome to pyphe-interpret, part of the pyphe toolbox. '
        'Written by stephan.kamrad@crick.ac.uk and maintained at '
        'https://github.com/Bahler-Lab/pyphe. '
        'Pyphe-interpret calculates summary statistics and p-values from the '
        'data reports generated by pyphe-analyse. '
        'For this, specifying your column names correctly is crucial. '
        'Let us assume you have measured many strains in many conditions. '
        'Now you would like to know for each strain in each condition '
        '(for each condition-strain pair) if it is "significant". '
        'There are essentially two ways of doing this, asking different '
        'biological questions. '
        '(1) Check for each condition separately '
        '(--grouping_column <condition_column>) if there is a significant '
        'difference in means between a mutant strain and a control strain '
        '(--axis_column <strain_id_column>). '
        'Or (2) Check for each strain separately '
        '(--grouping_column <strain_id_column>) if there is a significant '
        'difference in the means of the strain in the assay condition versus '
        'the control condition (--axis_column <condition_column>). '
        'The second option tests for condition-specific growth effects '
        '(i.e. it does not return significant results if a strain is always '
        'faster or always slower growing than the grid strain). '
        'In both cases you need to specify the control against which to test '
        'using --control and this has to be a value that appears in the axis '
        'column. '
        'You should define the dependent variable of the t-test using '
        '--values_column. '
        'FDR correction with the Benjamini-Hochberg method will be applied on '
        'each level set of the grouping_column separately, ie for case (1) '
        'p-values will be corrected across each strain separately, ie more '
        'conditions means more stringent correction, and for case (2) '
        'p-values will be corrected for each condition separately, ie more '
        'strains means more stringent correction.'
    )
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '--ld', type=str, required=True,
        help="Path to the Data Report Table produced by pyphe-analyse.")
    parser.add_argument(
        '--out', type=str, default='pyphe-interpret-report',
        help='Specifies the path where to save the output data result. By default, a table with all replicates will be saved as pyphe-interpret-report_reps.csv and the statistic table will be saved as pyphe-interpret-report_summaryStats.csv in the current working directory. Existing files will be overwritten.')
    parser.add_argument(
        '--grouping_column', type=str, required=True,
        help='Name of the column in the data report to use for forming groups on which to perform independent sets of t-tests.')
    parser.add_argument(
        '--axis_column', type=str, required=True,
        help='Name of the column in the data report to repeat t-tests along within each group. Levels in this column will be the explanatory/independent variable used for t-tests.')
    parser.add_argument(
        '--values_column', type=str, default='Colony_size_corr_checked',
        help='Name of the column in the data report to use as fitness values. This will be the dependent variable for t-tests. Defaults to "Colony_size_corr_checked".')
    parser.add_argument(
        '--control', type=str, required=True,
        help='Name of the control to compare against. This must be a value found in the axis column.')
    parser.add_argument(
        '--ld_encoding', default='utf-8', type=str,
        help='Encoding of the data report table to be passed to pandas.read_csv().')
    parser.add_argument(
        '--circularity', type=float, default=None,
        help='Exclude colonies from the analysis with a circularity below the one specified. A circularity of 1 corresponds to a perfect circle. We recommend a threshold around 0.85.')
    parser.add_argument(
        '--set_missing_na', action='store_true', default=False,
        help='Set 0-sized colonies to NA. This is recommended if you expect no missing colonies in your data, which means these are probably due to pinning errors.')
    return parser


def main(argv=None):
    """Run pyphe-interpret: parse options, load the report, call ``interpret``.

    Args:
        argv: Optional list of command-line tokens (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``, which is
            what the script entry point uses.
    """
    args = build_parser().parse_args(argv)

    # Echo the effective parameters so every run is self-describing.
    print('Interpretation is starting, with following parameters:')
    for name, value in vars(args).items():
        print('%s: %s' % (name, value))

    # The first CSV column is used as the DataFrame index.
    ld = pd.read_csv(args.ld, index_col=0, encoding=args.ld_encoding)

    # Positional order matters: the axis column is passed before the
    # grouping column, then values column, control and output prefix.
    interpret(
        ld,
        args.axis_column,
        args.grouping_column,
        args.values_column,
        args.control,
        args.out,
        circularity=args.circularity,
        set_missing_na=args.set_missing_na,
    )


if __name__ == '__main__':
    main()
