#!/usr/bin/env python
"""
Gristle_validator is used to determine the validity of a file's structure
and semantics.  Its functionality includes:
   - output counts of invalid vs valid records
   - an exit code that indicates findings (0 if everything's good, else 74)
   - ability to split input file into separate valid & invalid files
   - ability to use a json-schema compliant yaml config file to define
     data validation requirements
   - ability to check every record for the proper number of fields based
     on an input argument

usage: gristle_validator [-h]
                         [-o OUTGOOD]
                         [-e OUTERR]
                         [-f FIELD_CNT]
                         [-d DELIMITER]
                         [--quoting QUOTING]
                         [--quotechar QUOTECHAR]
                         [--hasheader]
                         [--hasnoheader]
                         [--validschema VALID_SCHEMA] [-s]
                         [--silent]
                         [--randomout RANDOMOUT] [--errmsg]
                         [--long-help]
                         [files [files ...]]

positional arguments:
  files                 Specifies the input file.  The default is stdin.  Multiple filenames can be provided.

optional arguments:
  -h, --help            show this help message and exit.
  --long-help           Print more verbose help.
  -o OUTGOOD, --outgood OUTGOOD
                        Specifies the output file.  The default is stdout.
                        Note that if a filename is provided the program will
                        override any file of that name.
  -e OUTERR, --outerr OUTERR
                        Specifies the output file for invalid records.  The
                        default is stderr.  Note that if a filename is provided
                        the program will override any file of that name.
  -f FIELD_CNT, --fieldcnt FIELD_CNT
                        Specify the number of fields in the record.  If not
                        provided it will default to number of fields on first
                        record
  -d DELIMITER, --delimiter DELIMITER
                        Specify a quoted single-column field delimiter. This
                        may be determined automatically by the program.
  --quoting QUOTING     Specify field quoting - generally only used for stdin
                        data.  The default is to have the program determine
                        quoting.   Valid values include quote_none, quote_all,
                        quote_minimal and quote_nonnumeric.
  --quotechar QUOTECHAR
                        Specify field quoting character - generally only used
                        for stdin data.  Default is double-quote
  --hasheader           Indicates that there is a header in the file.
                        Validation will mostly ignore the header.
  --hasnoheader         Indicates that there is no header in the file.
                        Useful in overriding automatic csv-dialect guessing
                        behavior of program.
  --validschema VALID_SCHEMA
                        Name of validation schema file
  -s, --stats           Causes pgm to print counts.  Default is False.
  --silent              Causes pgm to print no stats, no valid or invalid recs.
                        Useful when you only want to use return code to check
                        for any invalid records.  Default is off.
  --randomout RANDOMOUT
                        Causes pgm to write only a percentage of records out.
                        Provide an argument from 0.0 to 1.0.
  --errmsg              Causes pgm to append error msg at end of each invalid
                        record.  Default is on.

Tips on usage:
   - return codes are based on c headers:
        0  = success
        1  = generic error
        61 = No data (ENODATA)
        74 = Invalid data found (EBADMSG)

To do:
   - allow detailed logs to be directed to a log directory
   - allow user extensible:
         - directory of programs to execute for validation
   - work with analyze_file to produce a special exception for empty files.
   - improve msg if user provides no args - and tell about -h
   - extended to write out records with incorrect types, ranges, case, etc.

This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full language or refer to it here:
  http://opensource.org/licenses/BSD-3-Clause

Copyright 2011,2012,2013,2014,2017 Ken Farmer
"""
import sys
import os
import argparse
import csv
import random
import errno
from typing import Dict, List, Any, Optional, Union

from signal import signal, SIGPIPE, SIG_DFL

import validictory as valid
import yaml

import datagristle.file_type as file_type
import datagristle.common as common
import datagristle.csvhelper as csvhelper
import datagristle.file_io as file_io

#--- Restore the default SIGPIPE action (SIG_DFL) so that a closed downstream
#--- pipe (e.g. piping into `head`) ends the program instead of surfacing as
#--- a Python exception on write.
#--- (http://docs.python.org/library/signal.html)
signal(SIGPIPE, SIG_DFL)



def main() -> int:
    """ Drives one complete validation run and returns the process exit code.

        Steps:
            - parses the command line
            - opens the input handler plus one output handler for valid
              records (stdout by default) and one for invalid records
              (stderr by default)
            - builds the record validator from the field count and the
              optional validation schema
            - runs every record through process_all_records
            - closes all handlers

        Returns:
            errno.EBADMSG if any invalid record was found,
            errno.ENODATA if there was no input at all,
            otherwise 0.
    """
    args = get_args()

    try:
        reader = file_io.InputHandler(args.files,
                                      args.delimiter,
                                      args.quoting,
                                      args.quotechar,
                                      args.hasheader)
    except EOFError:
        return errno.ENODATA  # is a 61 on linux

    good_writer = file_io.OutputHandler(args.outgood,
                                        reader.dialect,
                                        sys.stdout,
                                        args.silent,
                                        args.randomout)
    bad_writer = file_io.OutputHandler(args.outerr,
                                       reader.dialect,
                                       sys.stderr,
                                       args.silent,
                                       args.randomout)

    validator = RecValidator(valid_field_cnt=args.field_cnt,
                             rec_schema=load_schema(args.valid_schema))

    bad_cnt = process_all_records(reader,
                                  good_writer,
                                  bad_writer,
                                  validator,
                                  args.stats)

    for handler in (reader, good_writer, bad_writer):
        handler.close()

    if bad_cnt > 0:
        return errno.EBADMSG  # is a 74 on linux
    if reader.rec_cnt == 0:  # catches empty stdin
        return errno.ENODATA  # is a 61 on linux
    return 0



def process_all_records(input_handler: 'InputHandler',
                        outgood_handler: 'OutputHandler',
                        outerr_handler: 'OutputHandler',
                        rec_validator: 'RecValidator',
                        stats: bool) -> int:
    """ Validates every record from input_handler and routes it to the
        matching output.

        Each record gets the field-count check.  The schema check only runs
        when the record passed the field-count check, is not the header
        record, and the validator actually has a schema.  Valid records go to
        outgood_handler; invalid ones get their error message appended as an
        extra field and go to outerr_handler.

        Returns:
            The number of invalid records.
    """
    good_cnt = 0
    bad_cnt = 0

    for rec in input_handler:
        field_cnt_msg = None
        schema_msg = None

        count_ok = rec_validator.check_field_cnt(len(rec))
        if not count_ok:
            field_cnt_msg = rec_validator.error

        # The header only gets the field-count check, never the schema check.
        is_header = input_handler.dialect.has_header and input_handler.rec_cnt == 1

        schema_ok = True
        if not is_header and count_ok and rec_validator.rec_schema:
            if not rec_validator.check_schema(rec):
                schema_ok = False
                schema_msg = rec_validator.error

        if count_ok and schema_ok:
            good_cnt += 1
            outgood_handler.write_rec(rec)
            continue

        bad_cnt += 1
        rec.append(common.coalesce(field_cnt_msg, schema_msg))
        outerr_handler.write_rec(rec)

    write_stats(stats,
                input_handler.rec_cnt,
                good_cnt,
                bad_cnt)
    return bad_cnt


def write_stats(stats: bool, input_cnt: int, valid_cnt: int, invalid_cnt: int) -> None:
    """ Prints a blank line followed by the input, invalid and valid record
        counts to stdout - but only when stats is truthy.
    """
    if not stats:
        return

    print('')
    report_rows = (('input_cnt        | %d ', input_cnt),
                   ('invalid_cnt      | %d ', invalid_cnt),
                   ('valid_cnt        | %d ', valid_cnt))
    for template, count in report_rows:
        print(template % count)


def load_schema(schema_file: Any) -> Optional[Dict[Any, Any]]:
    """ Loads the validation schema.

        If the schema_file argument is None (or otherwise falsy), then it
        will not load; this is useful when the user wants to check field
        counts, but does not have or want to use a validation schema.

        Args:
            schema_file: the YAML source handed to yaml.safe_load.  main()
                passes the open file object that argparse.FileType created
                for --validschema.

        Returns:
            The parsed schema, or None when no schema_file was given.

        Side effects:
            A file-like schema_file is closed once it has been read (nothing
            else in the program closes it).  sys.stdin is left open.
    """
    if not schema_file:
        return None

    try:
        return yaml.safe_load(schema_file)
    finally:
        closer = getattr(schema_file, 'close', None)
        if callable(closer) and schema_file is not sys.stdin:
            closer()




class RecValidator(object):
    """ Validates single records: their field count, and optionally their
        values against a validation schema.

        After any failed check the reason is available, as a string, in
        self.error.  A successful check_field_cnt resets it to None.
    """
    # how to provide caller with actual vs valid values?

    # Returned when the schema describes a column the record does not have.
    _PARSE_ERROR_MSG = 'Failed to validate field due to parsing error.  Wrong delimiter?'

    def __init__(self,
                 valid_field_cnt: Optional[int] = None,
                 rec_schema=None) -> None:
        """ Args:
                valid_field_cnt: expected number of fields per record.  When
                    None, the count of the first record checked is adopted.
                rec_schema: validation schema dictionary, or None to skip
                    schema checks.  A schema that fails ValidateTheValidator
                    gets its error printed and ends the program with exit
                    code 1.
        """
        self.rec_schema = rec_schema
        self.valid_field_cnt = valid_field_cnt
        self.last_field_cnt = None  # type: Optional[int]
        self.error = None  # type: Optional[str]  # holds last error encountered
        if rec_schema:
            try:
                ValidateTheValidator(self.rec_schema)
            except ValueError as error:
                print(error)
                sys.exit(1)


    def check_field_cnt(self, actual_field_cnt: int) -> bool:
        """ Compares actual_field_cnt with the expected field count.

            If no valid_field_cnt was assigned originally, it is set to the
            first actual field cnt.

            Returns:
                True on a match (self.error is reset to None), otherwise
                False with the reason in self.error.
        """
        if self.valid_field_cnt is None:
            self.valid_field_cnt = actual_field_cnt
        self.last_field_cnt = actual_field_cnt

        if actual_field_cnt == self.valid_field_cnt:
            self.error = None
            return True

        self.error = 'bad field count - should be %d but is: %d'  % \
              (self.valid_field_cnt, self.last_field_cnt)
        return False


    def check_schema(self, record: List[str]) -> bool:
        """ Checks one record against the schema.

            First checks that the values match the dg_type custom value, then
            the dg_minimum/dg_maximum custom ranges.  The custom checks run
            first since they give the most detailed error message.
            If those pass, then run the validictory validation for more.

            Returns:
                True if the record passed every check, otherwise False with
                the reason (always a string) in self.error.
        """
        self.error = self._check_types_for_record(record)
        if self.error:
            return False

        self.error = self._check_ranges_for_record(record)
        if self.error:
            return False

        try:
            valid.validate(record, self.rec_schema)
        except ValueError as err:
            # Store text rather than the exception object so that self.error
            # has the same type no matter which check failed.
            self.error = str(err)
            return False
        return True


    def _check_types_for_record(self, record: List[str]) -> Optional[str]:
        """ Json schema and Validictory can't check types for us - since data
            returned from the csv module is a string.  So, this is the work around.
               - the json schema type should normally be: string
               - we're adding our own type called 'dg_type' - which allows values
                 of string, float or integer.  We will test for integers and floats.

            Returns:
                The message for the first column that fails, else None.
        """
        for index, col_info in enumerate(self.rec_schema['items']):
            dg_type = col_info.get('dg_type')
            if dg_type not in ('integer', 'float'):
                continue  # strings (or no dg_type) need no cast test
            caster = int if dg_type == 'integer' else float
            try:
                caster(record[index])
            except ValueError:
                return self._msg_formatter(index, 'dg_type:%s' % dg_type,
                                           record[index], col_info)
            except IndexError:
                # the schema has more columns than this record
                return self._PARSE_ERROR_MSG

        return None


    def _check_ranges_for_record(self, record: List[str]) -> Optional[str]:
        """ Json schema and Validictory can't check numeric min & max values -
            since the data returned from the csv module is a string.  So, this
            is the work around.
               - the dg_type constraint must be provided, and either float or
                 integer
               - the dg_minimum or dg_maximum value must be some appropriate
                 number

            Returns:
                The message (always a string) for the first column that
                fails, else None.
        """
        for index, col_info in enumerate(self.rec_schema['items']):
            if 'dg_type' not in col_info:
                continue
            caster = int if col_info['dg_type'] == 'integer' else float

            # dg_minimum is tested before dg_maximum for each column.
            for limit_key in ('dg_minimum', 'dg_maximum'):
                if limit_key not in col_info:
                    continue
                try:
                    value = caster(record[index])
                    limit = caster(col_info[limit_key])
                except ValueError as err:
                    return str(err)
                except IndexError:
                    # the schema has more columns than this record
                    return self._PARSE_ERROR_MSG

                if limit_key == 'dg_minimum':
                    out_of_range = value < limit
                else:
                    out_of_range = value > limit
                if out_of_range:
                    return self._msg_formatter(index, limit_key, record[index], col_info)

        return None


    @staticmethod
    def _msg_formatter(field_num: int,
                       field_check: str,
                       field_value: str,
                       col_info: Dict[str, Any],
                       hard_msg: Optional[str] = None) -> str:
        """ Builds the error message for a field that failed a check.

            Args:
                field_num: zero-based position of the field in the record.
                field_check: name of the failed check, e.g. 'dg_minimum'.
                field_value: the offending value.
                col_info: schema entry of the column; its optional 'title'
                    is included in the message.
                hard_msg: if provided, returned as-is instead of a built
                    message.
        """
        if hard_msg:
            return hard_msg

        if 'title' in col_info:
            field_title = ' (%s) ' % col_info['title']
        else:
            field_title = ' '

        msg = (f'Failed to validate field number {field_num} {field_title} - '
               f'failed {field_check} check with actual value of {field_value}')
        return msg




class ValidateTheValidator(object):
    """ Validates the validation schema - to ensure that it is correctly
        structured.   This includes checks on its structure, keywords, and
        values.

        The checks run on construction.  Any errors found will be handled by
        a ValueError exception - including a schema, 'items' entry or field
        entry of the wrong basic type.
    """

    def __init__(self, validation_schema: Dict[Any, Any]) -> None:
        self.schema = validation_schema
        # json-schema keys that are recognized but deliberately rejected
        self.unsupported_keys = ['minimum', 'maximum', 'format', 'divisibleBy']
        self.valid_keys = ['type', 'minLength', 'maxLength', 'title',
                           'description', 'enum', 'pattern', 'required',
                           'blank',
                           'dg_type', 'dg_minimum', 'dg_maximum']
        self._validate_schema()


    def _validate_schema(self) -> None:
        """ Validates entire schema - with a few high-level checks, and by
            running _validate_field for each field validation set.

            Raises:
                ValueError: if the schema is not a dictionary whose only key
                    is 'items', if 'items' is not a list, or if any field
                    validation set is invalid.
        """
        if not isinstance(self.schema, dict) or 'items' not in self.schema:
            raise ValueError("Error: invalid schema, missing 'items' key")
        if len(self.schema) != 1:
            raise ValueError("Error: invalid schema, incorrect number of 'items' keys")

        items = self.schema['items']
        if not isinstance(items, (list, tuple)):
            raise ValueError("Error: invalid schema, 'items' must be a list of field validators")
        for field_validators in items:
            self._validate_field(field_validators)


    def _validate_field(self, field_validators) -> None:
        """ Validates one field, by running checks on each field
            validator in the schema.

            Raises:
                ValueError: if the field entry is not a dictionary or any of
                    its validators is invalid.
        """
        if not isinstance(field_validators, dict):
            raise ValueError("Error: invalid schema, field validators must be a "
                             "dictionary but found: %s" % (field_validators,))

        for v_key, v_value in field_validators.items():
            self._validate_validator(v_key, v_value)

        self._validate_field_dg_range(field_validators)


    @staticmethod
    def _validate_field_dg_range(field_validators):
        """ Validates the custom dg_minimum and dg_maximum field validators.
            These fields and their values are dependent on dg_type being
            provided and having consistent types.

            Raises:
                ValueError: if a limit is given without dg_type, with a
                    non-numeric dg_type, or with a value that cannot be
                    converted to the dg_type.
        """
        for limit in ('dg_minimum', 'dg_maximum'):
            if limit not in field_validators:
                continue
            if 'dg_type' not in field_validators:
                raise ValueError("Error: invalid schema, %s but no dg_type" % limit)

            dg_type = field_validators['dg_type']
            # Checked outside of the try block below - otherwise this more
            # specific message gets replaced by the generic conversion one.
            if dg_type not in ('integer', 'float'):
                raise ValueError("Error: invalid schema, %s requires a numeric dg_type" % limit)

            caster = int if dg_type == 'integer' else float
            try:
                caster(field_validators[limit])
            except (ValueError, TypeError) as err:
                # TypeError covers values such as None or a list
                raise ValueError("Error: invalid schema, %s not of dg_type" % limit) from err


    def _validate_validator(self, v_key: str, v_value: Union[str, bool]) -> None:
        """ Validate individual field validators:
                - ensure all keys are valid & supported
                - ensure values are valid for the keys

            Raises:
                ValueError: on an unsupported or unknown key, or on an
                    invalid value for required, blank or dg_type.
        """
        if v_key not in self.valid_keys:
            if v_key in self.unsupported_keys:
                raise ValueError('Error:  invalid schema, unsupported key value: %s' % v_key)
            raise ValueError('Error:  invalid schema, unknown key value: %s' % v_key)

        if v_key in ('required', 'blank'):
            if v_value not in [True, False]:
                raise ValueError('Error:  invalid schema, unknown value for %s: %s'
                                 % (v_key, v_value))
        elif v_key == 'dg_type':
            if v_value not in ['integer', 'float', 'string']:
                raise ValueError('Error:  invalid schema, unknown value for dg_type: %s'
                                 % (v_value,))




def get_args() -> argparse.Namespace:
    """ Parses and checks the command line args & options.

        Returns:
            The argparse.Namespace of parsed args.  Of note:
                - files defaults to ['-'] (stdin)
                - quoting defaults to 'quote_minimal' when reading stdin
                - hasheader is True, False, or None if neither header
                  option was given
                - valid_schema is an open file (or None)

        Exits (via argparse / sys.exit):
            - with 0 after printing the module docstring for --long-help
            - with a usage error if the delimiter is missing while reading
              from stdin or from multiple files, or if randomout is outside
              of 0.0 to 1.0
    """
    # argparse only substitutes the %(prog)s form within the description.
    use = ("%(prog)s is used to validate the number of fields in a file. It "
           "writes records with a fieldcnt other than provided to output "
           "file or stdout \n"
           " \n"
           "   %(prog)s [file] [misc options]")
    parser = argparse.ArgumentParser(description=use,
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('files',
                        default=['-'],
                        nargs='*',
                        help='Specifies the input file.  The default is stdin.  Multiple '
                             'filenames can be provided.')
    parser.add_argument('-o', '--outgood',
                        default='-',
                        help='Specifies the output file.  The default is stdout.  Note that '
                             'if a filename is provided the program will override any '
                             'file of that name.')
    parser.add_argument('-e', '--outerr',
                        default='-',
                        help='Specifies the output file for invalid records.  The default '
                             'is stderr.  Note that if a filename is provided the program '
                             'will override any file of that name.')

    parser.add_argument('-f', '--fieldcnt',
                        type=int,
                        dest='field_cnt',
                        help=('Specify the number of fields in the record.  If not provided'
                              ' it will default to number of fields on first record'))

    parser.add_argument('-d', '--delimiter',
                        help=('Specify a quoted single-column field delimiter. This may be '
                              'determined automatically by the program.'))
    parser.add_argument('--quoting',
                        default=None,
                        choices=[None, 'quote_none', 'quote_all', 'quote_minimal', 'quote_nonnumeric'],
                        help='Specify field quoting - generally only used for stdin data.'
                             '  The default is to have the program determine quoting.')
    parser.add_argument('--quotechar',
                        default='"',
                        help='Specify field quoting character - generally only used for '
                             'stdin data.  Default is double-quote')
    # --hasheader and --hasnoheader share one dest: None means neither was given.
    parser.add_argument('--hasheader',
                        dest='hasheader',
                        default=None,
                        action='store_true',
                        help='Indicate that there is a header in the file.')
    parser.add_argument('--hasnoheader',
                        dest='hasheader',
                        default=None,
                        action='store_false',
                        help=('Indicate that there is no header in the file.  '
                              'Occasionally helpful in overriding automatic '
                              'csv dialect guessing.'))

    parser.add_argument('--validschema',
                        dest='valid_schema',
                        type=argparse.FileType('r'),
                        help=('Name of validation schema file'))

    parser.add_argument('-s', '--stats',
                        default=False,
                        action='store_true',
                        help='Causes pgm to print counts.  Default is False.')

    parser.add_argument('--silent',
                        default=False,
                        action='store_true',
                        help='Causes pgm to print no stats, no valid or invalid recs.  Default is off.')
    parser.add_argument('--randomout',
                        default=1.0,
                        type=float,
                        help='Causes pgm to write only a percentage of records out.  '
                             'Provide a value from 0.0 to 1.0.')
    # NOTE(review): errmsg is parsed here, but main() does not appear to read it.
    parser.add_argument('--errmsg',
                        default=False,
                        action='store_true',
                        help='Causes pgm to append error msg at end of each invalid record.  Default is on.')
    parser.add_argument('--long-help',
                        default=False,
                        action='store_true',
                        help='Print more verbose help')

    args = parser.parse_args()

    if args.long_help:
        print(__doc__)
        sys.exit(0)

    if args.files[0] == '-':  # stdin
        # the delimiter must be given explicitly when reading from stdin
        if not args.delimiter:
            parser.error('Please provide delimiter when piping data into program via stdin')
        if not args.quoting:
            args.quoting = 'quote_minimal'
    elif len(args.files) > 1 and not args.delimiter:
        parser.error('Please provide delimiter when reading multiple input files')

    if args.randomout > 1.0 or args.randomout < 0.0:
        parser.error('Valid values for randomout are from 0.0 to 1.0')

    return args



# Script entry point: the value returned by main() becomes the exit status
# (0, or one of the errno codes listed under "Tips on usage" above).
if __name__ == '__main__':
    sys.exit(main())
