#!/usr/bin/env python3
import argparse
import os
import pandas as pd
import sys

from harmonizer.harmonizer import harmonize, identify_stop_words


def parse_args(argv):
    """Parse the command line for the ``stop_words`` and ``harmonize`` commands.

    Args:
        argv: Full argument vector. ``argv[0]`` is used as the program name
            shown in help output; ``argv[1:]`` is what gets parsed.

    Returns:
        An ``argparse.Namespace`` holding ``command`` (the chosen
        sub-command name) together with that sub-command's options.

    Raises:
        SystemExit: raised by argparse on invalid input, including when no
            sub-command is given.
    """
    parser = argparse.ArgumentParser(
        prog=argv[0], formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    sub = parser.add_subparsers(help='commands', dest='command')
    # Both sub-commands define `csv` and `column`; without a sub-command the
    # namespace has neither, so fail here with a usage message instead of
    # letting the caller hit an AttributeError.
    sub.required = True

    stop = sub.add_parser(
        'stop_words', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    stop.add_argument('csv', help='csv file to harmonize')
    stop.add_argument('column', help='column in csv to harmonize')
    # type=int so a user-supplied value has the same type as the default
    # (argparse would otherwise hand back the raw string).
    stop.add_argument('-n',
                      '--top_n',
                      help='select from top `n` by count',
                      type=int,
                      default=200)
    stop.add_argument('-o',
                      '--output_csv',
                      help='output file for stop words',
                      default='stop_words.csv')

    # Named *_parser so it does not shadow the imported `harmonize` function.
    harmonize_parser = sub.add_parser(
        'harmonize', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    harmonize_parser.add_argument('csv', help='csv file to harmonize')
    harmonize_parser.add_argument('column', help='column in csv to harmonize')
    harmonize_parser.add_argument(
        '-t',
        '--threshold',
        default=0.85,
        type=float_zero_one,
        help='threshold for matching, closer to 1 means stricter match')
    harmonize_parser.add_argument('-sw',
                                  '--stop_words',
                                  help='file to use for stop words',
                                  default='stop_words.csv')
    # No default here: the caller derives the output name from `csv` when
    # this option is omitted.
    harmonize_parser.add_argument(
        '-o',
        '--output_csv',
        help='output file for harmonized data,'
        ' defaults to original csv name + _harmonized.csv')
    return parser.parse_args(argv[1:])


def float_zero_one(arg):
    """Convert *arg* to a float in the closed interval [0, 1].

    Intended for use as an argparse ``type=`` callable.

    Args:
        arg: Value to convert, normally the raw command-line string.

    Returns:
        The parsed float.

    Raises:
        argparse.ArgumentTypeError: if *arg* is not a valid float literal,
            or the value is NaN or lies outside [0, 1].
    """
    try:
        f = float(arg)
    except ValueError:
        # `from None` keeps the internal ValueError out of the reported error.
        raise argparse.ArgumentTypeError(
            'Must be a floating point number') from None
    # A chained comparison is False for NaN, so 'nan' is rejected here;
    # separate `f < 0 or f > 1.0` checks would let it slip through.
    if not 0.0 <= f <= 1.0:
        raise argparse.ArgumentTypeError('Argument must be between 0 and 1')
    return f


if __name__ == '__main__':
    # Script entry point: load the csv, pick the requested column, then run
    # whichever sub-command was selected on the command line.
    cli = parse_args(sys.argv)
    frame = pd.read_csv(cli.csv)
    column_values = frame[cli.column]

    if cli.command == 'stop_words':
        # Write the identified stop words as a single-column csv.
        found = identify_stop_words(column_values, cli.top_n)
        table = pd.DataFrame(list(found), columns=['stop_words'])
        table.to_csv(cli.output_csv, index=False)
        print(f'\nsaved to {cli.output_csv}')
    elif cli.command == 'harmonize':
        # A missing stop-word file is not an error: harmonize with none.
        if os.path.exists(cli.stop_words):
            stop_table = pd.read_csv(cli.stop_words)
            ignored = set(stop_table['stop_words'].values)
        else:
            ignored = set()

        # Fall back to "<input name>_harmonized.csv" when no output is given.
        destination = cli.output_csv or (
            os.path.splitext(cli.csv)[0] + '_harmonized.csv')

        harmonized = harmonize(column_values, cli.threshold, ignored)
        # The result carries its own copy of the source column; drop it so
        # the join with the original frame does not duplicate it.
        extra_columns = harmonized.drop(cli.column, axis=1)
        combined = frame.join(extra_columns).sort_values(cli.column)
        combined.to_csv(destination, index=False)
        print(f'\nsaved to {destination}')
