#!/usr/bin/env python
"""
Creates a frequency distribution of values from columns of the input file and prints it out in
columns - the first being the unique key and the last being the count of occurrences.


Usage: gristle_freaker [options]


{see: helpdoc.HELP_SECTION}


Main Options:
    -i, --infiles [INFILES [INFILES ...]]
                         Input filenames or default to '-' for stdin.
    -o, --outfile OUTFILE
                         Output filename or '-' for stdout (the default).
    -c, --columns [COLUMNS [COLUMNS ...]]
                         Columns to collect distributions on.
                         Columns are identified based on a zero-offset, and multiple columns
                         can be provided.  Is not used with col-type of 'all' or 'each'.
    --col-type COL_TYPE  Process columns individually or together as a group.
                         Valid values are: 'specified', 'all', or 'each'. The default is
                         'specified', in which the cols are provided by the user.


Limiting Options:
    --max-freq MAX_FREQ  Identifies the max number of items in freq dict.
                         Default is 10m which would require approx 300MB of
                         mem to store 10 million 20 byte keys.
    --sampling-method SAMPLING_METHOD
                         Method to use for sampling: 'non' or 'interval'.
                         The default is 'non'.
    --sampling-rate SAMPLING_RATE
                         The number of records to skip if using 'interval' sampling-method.


Output Formatting Options:
    --max-key-len MAX_KEY_LEN
                         Max length of key written to output.
                         The default is 50.
    --write-limit WRITE_LIMIT
                         Max records to write.
                         Default of 0 indicates no limit.
    --sort-col SORT_COL  Column to sort by in the output, either 0 (the key) or 1 (count).
                         Default is 1.
    --sort-order SORT_ORDER
                         Specifies sort order of forward or reverse.
                         The default is reverse.
    --verbosity VERBOSITY
                         Stdout level of detail with valid values of quiet, normal, high, debug.


{see: helpdoc.CSV_SECTION}


{see: helpdoc.CONFIG_SECTION}


Examples:
    $ gristle_freaker -i sample.csv -d '|'  -c 0
        Creates two columns from the input - the first with unique keys from column 0, the second
        with a count of how many times each exists.
    $ gristle_freaker -i sample.csv -d '|'  -c 0 --sort-col 1 --sort-order forward
            --write-limit 25
        In addition to what was described in the first example, this example adds sorting of the
        output by count ascending and just prints the first 25 entries.
    $ gristle_freaker -i sample.csv -d '|'  -c 0 --sampling-rate 3
             --sampling-method interval
        In addition to what was described in the first example, this example adds a sampling in
        which it only references every third record.
    $ gristle_freaker -i sample.csv -d '|'  -c 0 1
        Creates three columns from the input - the first two with unique key combinations from
        columns 0 & 1, the third with the number of times each combination exists.
    $ gristle_freaker -i sample.csv -d '|'  -c -1
        Creates two columns from the input - the first with unique keys from the last column of
        the file (negative numbers wrap), then a second with the number of times each exists.
    $ gristle_freaker -i sample.csv -d '|'  --col-type all
        Creates two columns from the input - all columns combined into a key, then a second with
        the number of times each combination exists.
    $ gristle_freaker -i sample.csv -d '|'  --col-type each
        Unlike all other examples, this one performs a separate analysis for every single column
        of the file.  Each analysis produces three columns from the input - the first is a column
        number, second is a unique value from the column, and the third is the number of times
        that value appeared.  This output is repeated for each column.
    Many more examples can be found here:
        https://github.com/kenfar/DataGristle/tree/master/examples/gristle_freaker


Licensing and further Info:
    This source code is protected by the BSD license.  See the file "LICENSE"
    in the source code root directory for the full language or refer to it here:
        http://opensource.org/licenses/BSD-3-Clause
    Copyright 2011-2021 Ken Farmer
"""

import sys
from os.path import basename
import errno
from pprint import pprint as pp
from typing import Dict, List, Tuple, Optional, Any
from signal import signal, SIGPIPE, SIG_DFL

from datagristle.common import abort
import datagristle.common as comm
import datagristle.configulator as configulator
import datagristle.csvhelper as csvhelper
import datagristle.file_io as file_io
import datagristle.field_misc as field_misc
import datagristle.file_freaker as freaker
import datagristle.helpdoc as helpdoc

# Restore the default SIGPIPE disposition.  Python normally ignores SIGPIPE and
# raises BrokenPipeError on the write instead; with SIG_DFL the process simply
# terminates when the reader of its output pipe goes away (e.g. `| head`).
# (http://docs.python.org/library/signal.html)
signal(SIGPIPE, SIG_DFL)

# Program name and help texts; all three are handed to ConfigManager in main().
NAME = basename(__file__)
# The long help is this module's docstring run through helpdoc.expand_long_help
# (presumably filling in the "{see: helpdoc.*}" placeholders - confirm in helpdoc).
LONG_HELP = helpdoc.expand_long_help(__doc__)
# The short help is derived from the long help by helpdoc.
SHORT_HELP = helpdoc.get_short_help_from_long(LONG_HELP)

# pylint: disable=no-member



def main() -> int:
    """ Build the config, then write one or more frequency distributions.

    For a col_type of 'specified' or 'all' a single distribution is produced
    from nconfig.col_positions.  For a col_type of 'each' a separate
    distribution is produced for every field position in the header, all
    written to the same output handler.

    Returns:
        0 on success.

    Exits:
        With errno.ENODATA if building the config raises EOFError (empty file).
    """
    try:
        config_manager = ConfigManager(NAME, SHORT_HELP, LONG_HELP)
        nconfig, _ = config_manager.get_config()
    except EOFError:
        sys.exit(errno.ENODATA) # 61: empty file

    output_handler = file_io.OutputHandler(nconfig.outfile,
                                           dialect=None)

    # The finally-clause guarantees the output is closed even when a run
    # fails or aborts part-way through (e.g. too many unique values).
    try:
        if nconfig.col_type in ('specified', 'all'):
            run_one_freq(nconfig, nconfig.col_positions, output_handler)
        elif nconfig.col_type == 'each':
            for position in range(len(nconfig.header.field_names)):
                run_one_freq(nconfig, [position], output_handler)
    finally:
        output_handler.close()

    return 0



def run_one_freq(nconfig,
                 col_set,
                 output_handler):
    """ Create and write the frequency distribution for one set of columns.

    A new InputHandler is opened over nconfig.infiles for every call, so the
    input is read from the start each time this is invoked.

    Args:
        nconfig: the config object; supplies infiles, dialect, col_type,
            max_freq, sampling_method, sampling_rate, sort_order, sort_col,
            max_key_len and write_limit.
        col_set: the column positions that make up the key of this
            distribution.
        output_handler: the already-open handler the results are written to;
            it is not closed here.

    Returns:
        The input handler's rec_cnt after processing.

    Exits:
        Via abort() with errno.ENOMEM if the freaker reports that it was
        truncated.  The output is still written before aborting.
    """
    input_handler = file_io.InputHandler(nconfig.infiles,
                                         nconfig.dialect)

    # The finally-clause ensures the input is closed on every path, including
    # the abort below and any exception raised while reading or writing.
    try:
        colset_freaker = freaker.ColSetFreaker(input_handler,
                                               output_handler,
                                               nconfig.col_type,
                                               nconfig.max_freq,
                                               nconfig.sampling_method,
                                               nconfig.sampling_rate,
                                               nconfig.sort_order,
                                               nconfig.sort_col,
                                               nconfig.max_key_len)
        colset_freaker.create_freq_from_column_set(col_set)

        colset_freaker.write_output(colset_freaker.output_handler,
                                    nconfig.write_limit,
                                    col_set)

        if colset_freaker.truncated:
            abort(summary='Error: too many unique values',
                  details='gristle_freaker more unique values than its max_freq value.  Consider raising this to continue',
                  rc=errno.ENOMEM)
    finally:
        input_handler.close()

    return colset_freaker.input_handler.rec_cnt






class ConfigManager(configulator.Config):
    """ Config handling for gristle_freaker, built on configulator.Config.

    Declares the option metadata, applies the checks that span several
    options, and adds the derived col_positions entry to the config.
    """

    def define_obsolete_config(self) -> None:
        """ Register the retired -n/--number option and its replacement hint.
        """
        self.add_obsolete_metadata('number',
                                   short_name='n',
                                   msg='-n/--number have been replaced by --max-freq')


    def define_user_config(self) -> None:
        """ Declare the configuration elements accepted from arguments and config files.
        """
        for standard_name in ('infiles', 'outfile'):
            self.add_standard_metadata(standard_name)

        # One entry per custom option, registered in exactly this order.
        custom_options = (
            {'name': 'columns',
             'short_name': 'c',
             'default': [],
             'type': str,
             'nargs': '*'},
            {'name': 'col_type',
             'default': 'specified',
             'choices': ['specified', 'all', 'each'],
             'type': str},
            {'name': 'max_freq',
             'default': 10_000_000,
             'type': int},
            {'name': 'max_key_len',
             'type': int,
             'minimum': 1,
             'maximum': 500,
             'default': 50},
            {'name': 'write_limit',
             'type': int,
             'default': 0},
            {'name': 'sort_col',
             'type': int,
             'default': 1,
             'choices': [0, 1]},
            {'name': 'sort_order',
             'type': str,
             'choices': ['forward', 'reverse'],
             'default': 'reverse'},
            {'name': 'sampling_method',
             'type': str,
             'choices': ['non', 'interval'],
             'default': 'non'},
            {'name': 'sampling_rate',
             'type': int,
             'default': None},
        )
        for option in custom_options:
            self.add_custom_metadata(**option)

        self.add_standard_metadata('verbosity')
        self.add_all_config_configs()
        self.add_all_csv_configs()
        self.add_all_help_configs()


    def validate_custom_config(self,
                               config) -> None:
        """ Apply the checks that involve more than one option.

        Calls abort() when:
            - columns are missing for a col_type of 'specified', or present
              for any other col_type
            - the first infile is '-' (stdin) and col_type is 'each'
            - sampling_rate is given with a sampling_method of 'non', or is
              missing with a sampling_method of 'interval'
        """
        # columns vs col_type
        columns_given = config['columns'] != []
        col_type = config['col_type']
        if not columns_given and col_type == 'specified':
            abort("ERROR: Columns must be provided for col_type of 'specified'.")
        elif columns_given and col_type != 'specified':
            abort('ERROR: Columns cannot be provided for col_type of all or each')

        # stdin cannot be used with a col_type of 'each'
        if config['infiles'][0] == '-' and col_type == 'each':
            abort("ERROR: col_type of 'each' is incompatible with piping with stdin")

        # sampling_method vs sampling_rate
        sampling_method = config['sampling_method']
        if sampling_method == 'non':
            if config['sampling_rate']:
                abort('ERROR: Sampling_rate populated',
                      'sampling_rate must not be provided if sampling_method is "non"')
        elif sampling_method == 'interval':
            if not config['sampling_rate']:
                abort('ERROR: Integer sampling_rate missing',
                      'sampling_rate must be provided if sampling_method == "interval"')
            elif config['sampling_rate'] != int(config['sampling_rate']):
                abort('ERROR: Inverval sampling rate must be an int')


    def extend_config(self):
        """ Add derived or extended elements to the config.

        Generates the csv dialect and header config, then resolves the
        columns option into positions stored under 'col_positions'.
        Note: csvhelper.get_dialect could raise EOFError
        """
        self.generate_csv_dialect_config()

        # makes the header (and so the column names) available on nconfig
        self.generate_csv_header_config()

        # translate the user-supplied columns into positions
        try:
            header = self.nconfig.header
            requested_columns = self.nconfig.columns
            col_positions = header.get_field_positions_from_any(requested_columns)
        except KeyError as err:
            abort('ERROR: the columns option had invalid values within',
                  f'The following value was not in the header: {err},  '
                  'or the header was inaccessible (reading from stdin?)')
        self.update_config('col_positions', col_positions)



# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())
