#!/usr/bin/env python
"""
Command line metadata reporting tool - used to provide various reports, in
various formats of data within the metadata database.

Currently some of this functionality is redundant with the gristle
determinator:
   - determinator:  populates metadata and reports on current analysis job
   - md_reporter:   reports on metadata database, is intended for steady
                    enhancements - eventually adding PDF outputs.

usage: gristle_md_reporter [-h] [--long-help] [--report {dd,datadictionary}]
                           [--schema_id SCHEMA_ID] [--schema_name SCHEMA_NAME]
                           [--collection_id COLLECTION_ID]
                           [--collection_name COLLECTION_NAME]
                           [--field_id FIELD_ID] [--field_name FIELD_NAME]

required arguments:
  --report, -r {dd,datadictionary} - Report to be produced:
                - dd and datadictionary are synonyms
                - required options:  collection_id
                - output: a data dictionary for the collection_id

optional arguments (some may be required, depending on report):
  -h, --help            show this help message and exit
  --long-help           Print verbose help and exit.
  --schema_id SCHEMA_ID, --si SCHEMA_ID
                        id of schema
  --schema_name SCHEMA_NAME, --sn SCHEMA_NAME
  --collection_id COLLECTION_ID, --ci COLLECTION_ID
  --collection_name COLLECTION_NAME, --cn COLLECTION_NAME
  --field_id FIELD_ID, --fi FIELD_ID
  --field_name FIELD_NAME, --fn FIELD_NAME

This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full language or refer to it here:
   http://opensource.org/licenses/BSD-3-Clause
Copyright 2011,2012,2013,2014,2017 Ken Farmer
"""
import sys
import argparse
from signal import signal, SIGPIPE, SIG_DFL

from sqlalchemy import (text)

from datagristle.common import ifprint    # pylint: disable=E0611
import datagristle.metadata as metadata

# Restore the default SIGPIPE disposition so that a closed downstream pipe
# (e.g. output piped into `head`) ends the process quietly instead of raising
# a broken-pipe exception.  (http://docs.python.org/library/signal.html)
signal(SIGPIPE, SIG_DFL)


def main():
    """ runs all processes:
            - gets opts & args
            - runs the requested report

        Returns:
            - 0 if the report function reports success, otherwise 1
              (the value is used as the process exit status).
    """
    args = get_args()
    my_md = metadata.GristleMetaData()

    reports = {'datadictionary': rpt_datadict,
               'dd': rpt_datadict}

    # Eliminate any Nones so the report functions can test for an option
    # with a simple "key in kwargs".  A new dict is built because removing
    # keys from a dict while iterating over it raises RuntimeError on
    # Python 3.
    dargs = {key: val for key, val in vars(args).items() if val is not None}
    dargs['md'] = my_md

    # get a pointer to the function for the report the user asked for
    # (get_args restricts --report to the keys of this dict):
    function = reports[args.report]

    # run the function and translate its boolean result into an exit code:
    if function(**dargs):
        return 0
    return 1



def rpt_datadict(**kwargs):
    """ Prints a data dictionary report: the collection section first,
        then the field section.

        Uses kwargs 'md' and 'collection_id'; 'analysis_id' is looked up
        through get_analysis_id() when the caller did not supply one.

        Returns True when the report was printed, False when the collection
        section could not be produced (the field section is then skipped).
    """
    if 'analysis_id' not in kwargs:
        # No analysis requested explicitly - look one up for the collection.
        looked_up_id = get_analysis_id(kwargs['md'], kwargs['collection_id'])
        kwargs['analysis_id'] = looked_up_id

    collection_printed = rpt_collection_analysis(**kwargs)
    if collection_printed:
        rpt_field_analysis(**kwargs)
        return True
    return False



def rpt_collection_analysis(**kwargs):
    """Prints the schema & collection section of the report.

    Reads kwargs 'md' (its engine runs the query), 'collection_id' and
    'analysis_id', and selects the matching rows from
    rpt_collection_analysis_v.

    Returns True if at least one row was found and printed; otherwise prints
    an error line and returns False.
    """
    query = """ SELECT *                                       \
              FROM rpt_collection_analysis_v                 \
              WHERE collection_id  = :collection_id          \
                AND analysis_id    = :analysis_id            \
          """
    statement = text(query)
    cursor = kwargs['md'].engine.execute(statement,
                                         collection_id=kwargs['collection_id'],
                                         analysis_id=kwargs['analysis_id'])
    found = cursor.fetchall()

    # Guard clause: nothing to report on.
    if not found:
        print('Error: no collection found')
        return False

    # One (print template, row attribute) pair per output line, in print order.
    layout = (('schema_id:                %d', 'schema_id'),
              ('schema_name:              %s', 'schema_name'),
              ('collection_id:            %d', 'collection_id'),
              ('collection_name:          %s', 'collection_name'),
              ('instance_id:              %d', 'instance_id'),
              ('instance_name:            %s', 'instance_name'),
              ('analysis_id:              %d', 'analysis_id'),
              ('analysis_timestamp:       %s', 'analysis_timestamp'),
              ('ca_id:                    %d', 'ca_id'),
              ('ca_name:                  %s', 'ca_name'),
              ('ca_location:              %s', 'ca_location'),
              ('ca_row_cnt:               %d', 'ca_row_cnt'),
              ('ca_delimiter:             %s', 'ca_delimiter'),
              ('ca_hasheader:             %s', 'ca_hasheader'),
              ('ca_quoting:               %s', 'ca_quoting'),
              ('ca_quote_char:            %s', 'ca_quote_char'))

    print('')
    print('================= Schema and Collection Info =====================')
    for entry in found:
        for template, column in layout:
            print(template % getattr(entry, column))
    return True




def rpt_field_analysis(**kwargs):
    """Prints field-level information for one collection analysis.

    Keyword args read:
        - md:             object whose engine attribute executes the query
        - collection_id:  collection to report on
        - analysis_id:    analysis to report on
        - field_id:       optional - restricts the report to this field
        - field_name:     optional - restricts the report to this field name;
                          only used when field_id is absent

    For every field row found it prints the field attributes and then calls
    rpt_field_value() and rpt_field_freq() for that field.  Returns None.
    """

    #-----  format and run query ------------------
    # NOTE(review): field_analysis is joined on field_id only; if that table
    # holds rows for several analyses of the same field this may produce one
    # row per analysis - confirm against the schema.
    sql = """SELECT c.collection_id,      \
                     ca.analysis_id,       \
                     ca.ca_id,             \
                     f.field_id,           \
                     f.field_name,         \
                     f.field_type,         \
                     f.field_order,        \
                     f.field_len,          \
                     f.field_desc,         \
                     fa.fa_type,           \
                     fa.fa_unique_cnt,     \
                     fa.fa_known_cnt,      \
                     fa.fa_unknown_cnt,    \
                     fa.fa_min,            \
                     fa.fa_max,            \
                     fa.fa_mean,           \
                     fa.fa_median,         \
                     fa.fa_stddev,         \
                     fa.fa_variance,       \
                     fa.fa_min_len,        \
                     fa.fa_max_len,        \
                     fa.fa_mean_len,       \
                     fa.fa_case            \
              FROM collection c                                 \
                  LEFT OUTER JOIN collection_analysis ca        \
                     ON c.collection_id = ca.collection_id      \
                  INNER JOIN field f                            \
                     ON c.collection_id = f.collection_id       \
                  LEFT OUTER JOIN field_analysis fa             \
                     ON f.field_id = fa.field_id                \
               WHERE c.collection_id  = :collection_id          \
                 AND ca.analysis_id   = :analysis_id            \
        """

    params = {'collection_id': kwargs['collection_id'],
              'analysis_id': kwargs['analysis_id']}

    # Optional field filter.  It must be added to the WHERE clause *before*
    # ORDER BY, and the column must be qualified because field_id exists in
    # both field (f) and field_analysis (fa).
    if kwargs.get('field_id') is not None:
        sql += " AND f.field_id = :field_id "
        params['field_id'] = kwargs['field_id']
    elif kwargs.get('field_name') is not None:
        sql += " AND f.field_name = :field_name "
        params['field_name'] = kwargs['field_name']

    sql += " ORDER BY f.field_order "

    select_sql = text(sql)
    result = kwargs['md'].engine.execute(select_sql, **params)
    fa_rows = result.fetchall()

    print('')
    print('================= Field Info =====================')
    for row in fa_rows:
        print('')
        print('---------------------------------------------------------')
        print('field_name:           %s' % row.field_name)
        print('field_order:          %s' % row.field_order)
        print('field_id:             %s' % row.field_id)
        print('field_desc:           %s' % row.field_desc)
        print('field_type:           %s' % row.field_type)
        # ifprint comes from datagristle.common; presumably it prints the
        # formatted line only when its first argument is set - TODO confirm.
        ifprint(row.field_len,       'field_len:            %d', row.field_len)
        ifprint(row.fa_type,         'fa_type:              %s', row.fa_type)
        ifprint(row.fa_unique_cnt,   'fa_unique_cnt:        %d', row.fa_unique_cnt)
        ifprint(row.fa_known_cnt,    'fa_known_cnt:         %d', row.fa_known_cnt)
        ifprint(row.fa_unknown_cnt,  'fa_unknown_cnt:       %d', row.fa_unknown_cnt)
        ifprint(row.fa_min,          'fa_min:               %s', row.fa_min)
        ifprint(row.fa_max,          'fa_max:               %s', row.fa_max)
        ifprint(row.fa_mean,         'fa_mean:              %s', row.fa_mean)
        ifprint(row.fa_median,       'fa_median:            %s', row.fa_median)
        ifprint(row.fa_stddev,       'fa_stddev:            %s', row.fa_stddev)
        ifprint(row.fa_variance,     'fa_variance:          %s', row.fa_variance)
        ifprint(row.fa_min_len,      'fa_min_len:           %s', row.fa_min_len)
        ifprint(row.fa_max_len,      'fa_max_len:           %s', row.fa_max_len)
        ifprint(row.fa_mean_len,     'fa_mean_len:          %s', row.fa_mean_len)
        ifprint(row.fa_case,         'fa_case:              %s', row.fa_case)

        # The per-field sub-reports look the field up through kwargs.
        kwargs['field_id'] = row.field_id
        rpt_field_value(**kwargs)
        rpt_field_freq(**kwargs)



def rpt_field_value(**kwargs):
    """ Prints the known values recorded for a single field.

        Reads kwargs 'md' (its engine runs the query) and 'field_id'.
        Prints nothing when the field has no field_value rows; otherwise
        prints a heading and at most the first 40 rows.
    """
    max_shown = 40
    query = """ SELECT *                     \
                 FROM field_value fv          \
                 WHERE field_id = :field_id   \
             """
    statement = text(query)
    cursor = kwargs['md'].engine.execute(statement,
                                         field_id=kwargs['field_id'])
    known_values = cursor.fetchall()
    if not known_values:
        return

    print('')
    print('Field Known Values')
    for entry in known_values[:max_shown]:
        ifprint(entry.fv_value, '        %-40.40s             %s',
                entry.fv_value, entry.fv_desc)



def rpt_field_freq(**kwargs):
    """ Prints the most frequent values recorded for a single field.

        Reads kwargs 'md' (its engine runs the query) and 'field_id'.
        Rows come back ordered by fav_count descending; prints nothing when
        there are none, otherwise a heading and at most the first 20 rows.
    """
    max_shown = 20
    query = """ SELECT fav.fav_value,                          \
                        fav.fav_count                           \
                 FROM field_analysis fa                         \
                    INNER JOIN field_analysis_value fav         \
                       ON fa.fa_id = fav.fa_id                  \
                 WHERE fa.field_id    = :field_id               \
                 ORDER BY fav.fav_count DESC                    \
             """
    statement = text(query)
    cursor = kwargs['md'].engine.execute(statement,
                                         field_id=kwargs['field_id'])
    frequencies = cursor.fetchall()
    if not frequencies:
        return

    print('')
    print('Top Values')
    for entry in frequencies[:max_shown]:
        ifprint(entry.fav_value, '        %-40.40s             %d',
                entry.fav_value, entry.fav_count)




def get_analysis_id(md, collection_id):
    """ Looks up the analysis to report on for a collection.

        Currently this is the highest analysis_id reachable from the
        collection through its schema and that schema's instances.
        In the future it may select by analysis time or name, schema id
        or name, or instance id or name.

        Input:
            - md: metadata object whose engine runs the query
            - collection_id: id of the collection
        Output:
            - analysis_id of the first result row, or None if the query
              returned no rows
    """
    query = """ SELECT MAX(anal.analysis_id) as analysis_id  \
              FROM schema                 sch              \
                 INNER JOIN collection    coll             \
                    ON sch.schema_id = coll.schema_id      \
                 INNER JOIN instance      inst             \
                    ON sch.schema_id = inst.schema_id      \
                 INNER JOIN analysis      anal             \
                    ON inst.instance_id = anal.instance_id \
              WHERE collection_id  = :collection_id        \
          """
    statement = text(query)
    matches = md.engine.execute(statement, collection_id=collection_id).fetchall()
    if not matches:     # no rows found
        return None
    return matches[0].analysis_id



def get_args():
    """ gets opts & args and returns them
        Input:
            - command line args & options
        Output:
            - args namespace
        Exits:
            - with status 0 after printing the module docstring when
              --long-help is given
            - through parser.error() when --report is missing, or when a
              data dictionary report ('dd' or its synonym 'datadictionary')
              is requested without --collection_id
    """
    use = ('\n'
           '\ngristle_md_reporter is used to provide command-line reporting'
           ' with the gristle metadata database.'
           '\n'
           '\ngristle_md_reporter -r [report]'
           '\n')

    parser = argparse.ArgumentParser(description=use,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--long-help',
                        default=False,
                        action='store_true',
                        help='Print verbose help and exit.')
    parser.add_argument('--report', '-r',
                        choices=['dd', 'datadictionary'],
                        help='Report to be produced')
    parser.add_argument('--schema_id', '--si',
                        help='id of schema')
    parser.add_argument('--schema_name', '--sn')
    parser.add_argument('--collection_id', '--ci',
                        type=int)
    parser.add_argument('--collection_name', '--cn')
    parser.add_argument('--field_id', '--fi',
                        type=int)
    parser.add_argument('--field_name', '--fn')

    args = parser.parse_args()

    if args.long_help:
        print(__doc__)
        sys.exit(0)
    elif not args.report:
        parser.error('Report must be provided')
    elif (args.report in ('dd', 'datadictionary')
          and args.collection_id is None):
        # Both spellings select the same report, so both need collection_id.
        parser.error('collection_id must be provided for dd report')

    return args




# Run the report only when executed as a script; main()'s return value
# (0 on success, 1 on failure) becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())
