from demyst.analytics.report import report

import matplotlib.pyplot as plt
from io import BytesIO
import pdfkit
import pandas as pd
import numpy as np
import os
import shutil
from matplotlib import font_manager
from matplotlib import rcParams
import html
import platform
from yattag import Doc
import re

# This is a Windows workaround for a bug in wkhtmltopdf:
# https://github.com/wkhtmltopdf/wkhtmltopdf/issues/3081
# On Unix-like systems, use `ulimit -n 2048`.
def maximize_number_of_file_descriptors():
    """Raise the C runtime's open-file limit to 2048 when running on Windows.

    Does nothing on any other platform.
    """
    if platform.system() != "Windows":
        return
    # Imported lazily so the module is only needed when actually on Windows.
    import win32file
    win32file._setmaxstdio(2048)

def quality_report(analytics, df, rep=None):
    """Generate the data quality report as HTML and PDF inside ``./report``.

    Diagrams, ``index.html`` and the static assets are written to a
    ``report`` directory below the current working directory; afterwards
    ``report/index.html`` is converted to ``report/report.pdf`` with pdfkit
    using zero page margins.

    Args:
        analytics: Object providing ``report(df)`` and
            ``product_stats(names)``; it is also handed to the section
            renderers.
        df: Input data frame, used to compute the report when ``rep`` is not
            supplied.
        rep: Optional pre-computed report data frame. When ``None`` it is
            computed with ``analytics.report(df)``.
    """
    maximize_number_of_file_descriptors()
    # Need this so it finds custom fonts
    font_manager._rebuild()
    # Set default font and colors for every diagram rendered below
    rcParams.update({
        'font.family': 'Roboto Mono',
        'font.weight': 'bold',
        'text.color': '#DEE2EA',
        'axes.labelcolor': '#DEE2EA',
        'axes.edgecolor': '#222733',
        'xtick.color': '#DEE2EA',
        'ytick.color': '#DEE2EA',
    })

    # Compare against None explicitly: the truth value of a pandas DataFrame
    # is ambiguous, so `not rep` raises ValueError for a caller-supplied report.
    if rep is None:
        rep = analytics.report(df)

    here_dir = os.path.dirname(os.path.realpath(__file__))
    try:
        os.mkdir("report")
    except FileExistsError:
        # Re-using an existing output directory is fine; any other failure
        # (e.g. missing permissions) should surface here instead of later.
        pass

    all_stats = analytics.product_stats(provider_names_from_report(rep))
    data_stats_diagram(analytics, rep)
    distribution_diagrams(analytics, rep, all_stats)
    generate_html(analytics, df, rep, all_stats)
    copy_files(here_dir)
    pdfkit.from_file("report/index.html",
                     "report/report.pdf",
                     { 'margin-top': '0px', 'margin-left': '0px', 'margin-bottom': '0px', 'margin-right': '0px' })

def save_figure(filename):
    """Save the current matplotlib figure as ``report/<filename>.svg`` and close it.

    The SVG is written with a tight bounding box and a transparent background.
    """
    svg_path = "report/" + filename + ".svg"
    plt.savefig(svg_path, format='svg', bbox_inches='tight', transparent=True)
    plt.close()

def extract_data_for_match_rate_plot_from_report(rep):
    """Return one row per distinct product/match-rate/error-rate combination.

    The result keeps only the ``product_name``, ``product_match_rate`` and
    ``product_error_rate`` columns, is ordered by descending match rate and
    carries a fresh 0..n-1 index.
    """
    wanted_columns = ["product_name", "product_match_rate", "product_error_rate"]
    per_product = rep[wanted_columns].drop_duplicates()
    ranked = per_product.sort_values(by=['product_match_rate'], ascending=False)
    return ranked.reset_index(drop=True)

def data_stats(analytics, rep):
    """Count the report's attributes per data-type category.

    Attribute names from ``rep`` have bracketed indexes collapsed to ``[]``
    and are left-joined against ``analytics.product_outputs(...)`` on the
    ``attribute`` column to look up each attribute's ``type``.

    Returns:
        A ``(labels, counts)`` tuple of two parallel lists. "Descriptive
        Text" receives every attribute not counted elsewhere: String
        attributes whose name contains ``description`` as well as attributes
        with any other (or no) type.
    """
    outputs = analytics.product_outputs(provider_names_from_report(rep))
    attributes = pd.DataFrame({'attribute' : list(rep['attribute_name'])})
    attributes['attribute'] = attributes['attribute'].replace(r"\[(\w+)\]", '[]', regex=True)
    typed = pd.merge(attributes, outputs, on="attribute", how="left")

    def rows_of(type_name):
        # All merged rows whose looked-up type equals `type_name`.
        return typed[typed['type'] == type_name]

    numerical = len(rows_of('Number'))
    dates = len(rows_of('DateTime'))
    bools = len(rows_of('Boolean'))
    strings = rows_of('String')
    described = strings[strings['attribute'].str.contains('description')]
    categorical = len(strings) - len(described)
    # Whatever is left over is reported as descriptive text.
    desc = len(typed) - numerical - dates - bools - categorical
    labels = ['Numerical', 'Categorical Text', 'Descriptive Text', 'Date', 'Boolean']
    counts = [numerical, categorical, desc, dates, bools]
    return (labels, counts)

def data_stats_colors():
    """Return a new list with the five colors used for the data-type chart and legend."""
    palette = (
        "#b39ddb",
        "#1de9B6",
        "#2e3951",
        "#00acc1",
        "#7283a7",
    )
    return list(palette)
    
def data_stats_diagram(analytics, rep):
    """Draw the data-type pie chart and save it as ``report/data_stats.svg``.

    Only the counts from ``data_stats`` are plotted; the labels are rendered
    separately in the HTML legend.
    """
    _, counts = data_stats(analytics, rep)
    plt.pie(counts, colors=data_stats_colors())
    return save_figure("data_stats")

def generate_html(analytics, df, rep, all_stats):
    """Build the complete report document and write it to ``report/index.html``.

    The body consists of the cover followed by the product, data-type,
    attribute and per-product detail sections; each section starts on a new
    page with the shared page header.

    Args:
        analytics: Analytics object forwarded to the section renderers.
        df: Input data frame forwarded to ``details_section``.
        rep: Report data frame the sections are rendered from.
        all_stats: Overall product statistics forwarded to the sections.
    """
    doc = Doc()
    with doc.tag("html"):
        with doc.tag("head"):
            doc.stag("meta", charset="utf-8")
            doc.stag("link", rel="stylesheet", href="style.css")
        with doc.tag("body"):
            cover(doc)
            page_break(doc)
            header(doc)
            product_stats_section(analytics, rep, all_stats, doc)
            page_break(doc)
            header(doc)
            data_stats_section(analytics, rep, doc)
            page_break(doc)
            header(doc)
            attr_stats_section(analytics, rep, doc)
            page_break(doc)
            header(doc)
            details_section(analytics, df, rep, all_stats, doc)
    # The document declares charset utf-8 and contains non-ASCII characters
    # (non-breaking spaces, arrow glyphs), so the file must be written as
    # UTF-8 regardless of the platform's default encoding. The context
    # manager also guarantees the handle is closed if the write fails.
    with open("report/index.html", "w", encoding="utf-8") as html_file:
        html_file.write(doc.getvalue())

# Unicode NO-BREAK SPACE character
NBSP = "\u00a0"


def nbsp(string):
    """Return ``string`` with every ordinary space replaced by a non-breaking space."""
    return NBSP.join(string.split(" "))

def cover(doc):
    """Render the cover page: the logo followed by the three-line report title."""
    with doc.tag("div", klass="cover"):
        doc.stag("img", klass="cover_logo", src="demyst_logo_gray.svg")
        with doc.tag("h1"):
            for title_word in ("Data", "Quality"):
                doc.line("div", title_word)
            # Only the last title line carries the "dq" class.
            doc.line("div", "Report", klass="dq")

def legend_item(doc, color):
    """Emit a legend swatch: a ``span`` holding a non-breaking space on a ``color`` background."""
    swatch_style = "background-color: " + color
    doc.line("span", NBSP, klass="legend_item", style=swatch_style)

def product_stats_section(a, rep, all_stats, doc):
    """Render the "Data Products" section: a legend plus one entry per product.

    Every entry shows the product code and a match / no-match / error bar for
    this job. When the provider lookup succeeds, its name, description and
    tags are shown too; when ``all_stats`` has rows for the product, a second
    bar built from the first of those rows (sorted by
    ``stats_updated_on`` descending) is added.
    """
    doc.line("h2", "Data Products")
    # Legend
    legend_entries = (
        ("#1de9B6", " Match "),
        ("#29b6f6", " No Match "),
        ("#b39ddb", " Error "),
    )
    with doc.tag("div", klass="product_stats_legend"):
        for swatch_color, caption in legend_entries:
            legend_item(doc, swatch_color)
            doc.text(caption)
    with doc.tag("div", klass="product_stats"):
        # Diagram
        job_rates = extract_data_for_match_rate_plot_from_report(rep)
        for _, job_row in job_rates.iterrows():
            product_id = job_row["product_name"]
            with doc.tag("div", klass="product_stat"):
                provider = a._Analytics__config.lookup_provider(product_id)
                if provider:
                    display_name = provider["aegean_data_source"]["name"]
                    doc.line("h3", display_name, klass="product_name")
                doc.line("h3", product_id, klass="product_code")
                if provider:
                    tag_names = [t["name"] for t in provider["tags"]]
                    doc.line("div", provider["description"], klass="product_desc")
                    doc.line("div", " \u25aa ".join(tag_names), klass="product_tags")
                with doc.tag("table", style="width: 100%"):
                    with doc.tag("tr"):
                        doc.line("td", nbsp("\u25B6 Results for this Job: "), style="width: 10%")
                        with doc.tag("td"):
                            product_stat_bar(doc, job_row["product_match_rate"], job_row["product_error_rate"])
                    overall = product_stats_from_all_stats(all_stats, product_id)
                    if overall.empty:
                        # No overall statistics known for this product.
                        continue
                    with doc.tag("tr"):
                        doc.line("td", nbsp("\u25B6 Overall Results for this Product: "))
                        with doc.tag("td"):
                            product_stat_bar(doc, overall["product_match_rate"].iloc[0], overall["product_error_rate"].iloc[0])

def product_stats_from_all_stats(all_stats, pid):
    """Return the rows of ``all_stats`` for product ``pid``, newest ``stats_updated_on`` first."""
    belongs_to_product = all_stats["product"] == pid
    product_rows = all_stats.loc[belongs_to_product]
    return product_rows.sort_values("stats_updated_on", ascending=False)

def product_stat_bar(doc, match_rate, error_rate):
    """Render a one-row table visualising match, no-match and error percentages.

    ``match_rate`` and ``error_rate`` are percentages; the no-match share is
    the remainder up to 100. A segment is only emitted when its value is at
    least 0.01, and each cell's width equals its formatted percentage.
    """
    with doc.tag("table", klass="product_stats_diagram", cellpadding="0", cellspacing="0"):
        with doc.tag("tr"):
            no_match_rate = 100 - match_rate - error_rate
            segments = (
                (match_rate, "background-color: #1de9B6"),
                (no_match_rate, "background-color: #29b6f6"),
                (error_rate, "background-color: #b39ddb"),
            )
            for rate, cell_style in segments:
                if rate >= 0.01:
                    label = "%.2f%%" % rate
                    doc.line("td", label, width=label, style=cell_style)

def _save_fill_rate_histogram(fill_rates, figure_name):
    """Plot a histogram of fill rates (in percent) and save it under ``figure_name``."""
    plt.hist(fill_rates)
    plt.xlabel('Fill Rate (%)')
    # Rotation is passed as a number: matplotlib documents a float or
    # 'vertical'/'horizontal' here, and a numeric string such as '90' is not
    # among the accepted values.
    plt.ylabel('# of Attributes', rotation=90)
    plt.tight_layout()
    save_figure(figure_name)


def distribution_diagrams(a, rep, all_stats):
    """Save fill-rate histograms for every product that appears in ``rep``.

    For each product ``p`` a per-job histogram is saved as ``<p>_fill``. When
    ``all_stats`` has rows for the product, an overall histogram is saved as
    ``<p>_fill2``; for it, numeric list indexes in ``attribute_flattened_name``
    are collapsed to ``[*]`` and duplicate attributes are dropped first.

    Args:
        a: Unused; kept for signature compatibility with the other renderers.
        rep: Report data frame with ``product_name`` and
            ``attribute_fill_rate`` columns.
        all_stats: Overall statistics with ``product``,
            ``stats_updated_on``, ``attribute_flattened_name`` and
            ``attribute_fill_rate`` columns.
    """
    for p in provider_names_from_report(rep):
        # per-job hist (values are plotted as-is)
        job_fill_rates = rep.loc[rep['product_name'] == p]['attribute_fill_rate'].to_numpy()
        _save_fill_rate_histogram(job_fill_rates, p + "_fill")
        # overall hist
        pstats = product_stats_from_all_stats(all_stats, p)
        if pstats.empty:
            continue
        pstats['attribute_flattened_name'] = pstats['attribute_flattened_name'].str.replace(r"\[(\d+)\]", '[*]', regex=True)
        pstats = pstats.drop_duplicates(subset="attribute_flattened_name")
        # pstats already contains only rows of product p. The overall rates
        # are scaled by 100 (presumably stored as fractions - confirm against
        # the source of all_stats).
        overall_fill_rates = pstats['attribute_fill_rate'].to_numpy() * 100.0
        _save_fill_rate_histogram(overall_fill_rates, p + "_fill2")

def data_stats_section(a, rep, doc):
    """Render the "Data Types" section: the pie chart image beside a counted legend.

    The legend pairs each label and count from ``data_stats`` with the color
    at the same position in ``data_stats_colors``.
    """
    doc.line("h2", "Data Types")
    with doc.tag("table", klass="data_stats"), doc.tag("tr"):
        with doc.tag("td"):
            doc.stag("img", klass="data_stats_diagram", src="data_stats.svg")
        with doc.tag("td", klass="data_stats_legend"):
            labels, counts = data_stats(a, rep)
            palette = data_stats_colors()
            for label, count, swatch_color in zip(labels, counts, palette):
                with doc.tag("p"):
                    legend_item(doc, swatch_color)
                    doc.text(" " + label + ": " + str(count))

def attr_stats_section(a, rep, doc):
    """Render the "Attributes" section over two pages.

    The first page lists the ten rows at the top of the fill-rate ranking
    produced by ``filter_report``; after a page break the second page lists
    the ten rows at the bottom of that ranking, starting from the very last.
    """
    def render_all(rows):
        # One heading plus fill bar per ranked attribute.
        for _, attr_row in rows.iterrows():
            render_attr(doc, attr_row)

    doc.line("h2", "Attributes")
    ranked = filter_report(rep)
    most_filled = ranked.head(10)
    least_filled = ranked[::-1].head(10)

    doc.line("h3", "Most Filled Attributes")
    render_all(most_filled)

    page_break(doc)
    header(doc)
    doc.line("h2", "Attributes")
    doc.line("h3", "Least Filled Attributes")
    render_all(least_filled)

def render_attr(doc, row):
    """Render one attribute: a ``product.attribute`` heading followed by its fill-rate bar."""
    qualified_name = row["product_name"] + "." + row["attribute_name"]
    doc.line("h4", "\u25B6 " + qualified_name, klass="attr_name")
    render_attr_bar(doc, row["attribute_fill_rate"])

def render_attr_bar(doc, fill_rate):
    """Render a one-row table showing ``fill_rate`` (a percentage) as a bar.

    Rates of at least 0.01 get a filled cell whose width is the formatted
    percentage, plus a dark remainder cell unless the label is exactly
    ``100.00%``. Smaller rates get a single dark cell with the label.
    """
    with doc.tag("table", klass="attr_stats_diagram", cellpadding="0", cellspacing="0"), doc.tag("tr"):
        if not fill_rate >= 0.01:
            label = "%.2f%%" % fill_rate
            doc.line("td", label, style="background-color: #2e3951; color: #dee2ea; text-align: left;")
        else:
            label = "%.2f%%" % fill_rate
            doc.line("td", label, width=label, style="background-color: #1de9B6")
            bar_is_full = label == "100.00%"
            if not bar_is_full:
                # Pad the rest of the row with the dark background color.
                doc.line("td", NBSP, style="background-color: #2e3951")

def details_section(a, df, rep, all_stats, doc):
    """Render one detail section per product: fill-rate histograms and per-attribute bars.

    Products for which the provider lookup returns a falsy value are skipped
    entirely. Every rendered product is followed by a page break and a fresh
    page header. ``df`` is only referenced by the commented-out histogram
    code near the end of the loop.
    """
    for pid in provider_names_from_report(rep):
        # NOTE(review): reaches into what looks like the name-mangled private
        # `__config` attribute of an `Analytics` class - confirm.
        p = a._Analytics__config.lookup_provider(pid)
        if p:
            doc.line("h2", p["aegean_data_source"]["name"])
            doc.line("h3", pid, klass="product_code")
            doc.line("h3", "Fill Rate")
            doc.line("h4", "\u25B6 Results for this Job: ")
            # Image file names follow the `<pid>_fill` / `<pid>_fill2` names
            # used by distribution_diagrams when saving the histograms.
            doc.stag("img", src=pid + "_fill.svg", style="width: 50%")
            pstats = product_stats_from_all_stats(all_stats, pid)
            if not pstats.empty:
                doc.line("h4", "\u25B6 Overall Results for this Product: ")
                doc.stag("img", src=pid + "_fill2.svg", style="width: 50%")
            attrs = rep.loc[(rep['product_name'] == pid)]
            # Collapse numeric list indexes to `[*]` and keep one row per
            # attribute. Unlike in distribution_diagrams, this runs even when
            # pstats is empty.
            pstats['attribute_flattened_name'] = pstats['attribute_flattened_name'].str.replace(r"\[(\d+)\]", '[*]', regex=True)
            pstats = pstats.drop_duplicates(subset="attribute_flattened_name")
            doc.line("h3", "Attributes")
            for idx, row in attrs.iterrows():
                # Keep each attribute's heading and bars together on one page.
                with doc.tag("div", style="page-break-inside: avoid"):
                    doc.line("h4", row["product_name"] + "." + row["attribute_name"], klass="attr_name")
                    with doc.tag("table", style="width: 100%"):
                        with doc.tag("tr"):
                            doc.line("td", nbsp("\u25B6 Results for this Job: "), style="width: 10%")
                            with doc.tag("td"):
                                render_attr_bar(doc, row["attribute_fill_rate"])
                        if not pstats.empty:
                            # Match the overall statistics to this attribute by its normalized name.
                            astats = pstats.loc[(pstats['attribute_flattened_name'] == row["attribute_name"])]
                            if not astats.empty:
                                astat = astats.iloc[0]
                                with doc.tag("tr"):
                                    doc.line("td", nbsp("\u25B6 Overall Results for this Attribute: "), style="width: 10%")
                                    with doc.tag("td"):
                                        # Scaled by 100 before rendering - presumably the overall
                                        # fill rate is stored as a fraction; confirm.
                                        render_attr_bar(doc, astat["attribute_fill_rate"] * 100.0)
# NOTE: disabled per-attribute histogram and unique-value rows, kept for reference.
#                     if row["attribute_type"] == np.dtype("float64"):
#                         with doc.tag("tr"):
#                             with doc.tag("td"):
#                                 attr_hist_diagram(row, df, pid, idx)
#                                 doc.stag("img", src=pid + "_hist_" + str(idx) + ".svg")
#                    with doc.tag("tr"):
#                        doc.line("td", nbsp("Unique Values: "), style="width: 10%")
#                        doc.line("td", row["unique_values"])
            page_break(doc)
            header(doc)

def attr_hist_diagram(row, df, pid, idx):
    """Render a density histogram for one attribute and save it as an SVG.

    The histogram is annotated with vertical lines and labels for the mean
    and for two standard deviations on either side of it (NaN-aware).

    Args:
        row: Report row providing ``product_name`` and ``attribute_name``.
        df: Data frame holding the raw values in a column named
            ``<product_name>.<attribute_name>``.
        pid: Product identifier used in the output file name.
        idx: Row index used in the output file name.

    Returns:
        ``None`` after saving the figure as ``<pid>_hist_<idx>``. If the
        values cannot be processed as numbers, the column name is printed, a
        placeholder text is drawn on the cleared figure and the ``plt``
        module is returned instead; nothing is saved in that case.
    """
    # NOTE: these assignments change matplotlib's global rcParams.
    rcParams['font.family'] = 'Roboto Mono'
    rcParams['font.weight'] = '400'
    rcParams['text.color'] = '#a1adc4'
    rcParams['axes.labelcolor'] = '#a1adc4'
    rcParams['xtick.color'] = '#a1adc4'
    rcParams['ytick.color'] = '#a1adc4'
    # Note: uses only first element of list attr
    attr_name = row['attribute_name'].replace('[*]', '[0]')
    full_name = row['product_name'] + '.' + attr_name
    x = df[full_name].to_numpy()
    # Passed positionally: the keyword for this flag was renamed from `b` to
    # `visible` between matplotlib releases.
    plt.grid(False)
    plt.box(on=False)
    num_bins = 50
    plt.yticks(ticks=[], labels=[])
    plt.gcf().set_size_inches(5, 2)

    try:
        # np.std serves as a probe that the values are numeric; the histogram
        # is drawn exactly once, here.
        np.std(x)
        plt.hist(x, num_bins, density=True)
    except (TypeError, ValueError):
        print(full_name)
        plt.clf()
        plt.text(0.5, 0.5, 'Not Really a Number', horizontalalignment='center', verticalalignment='center')
        return plt
    sd = np.nanstd(x)
    m = np.nanmean(x)
    sdl = m - (2 * sd)
    sdh = m + (2 * sd)
    for position in (m, sdl, sdh):
        plt.axvline(position, color='b', linestyle='solid', linewidth=1)
    _, max_ = plt.ylim()
    # Labels are nudged right of their line by a tenth of the mean; the +/-2sd
    # labels sit one step lower than the mean label.
    plt.text(m + m / 10,
             max_ - max_ / 10,
             'Mean: {:.2f}'.format(m))
    plt.text(sdl + m / 10,
             max_ - (2 * max_ / 10),
             '-2sd: {:.2f}'.format(sdl))
    plt.text(sdh + m / 10,
             max_ - (2 * max_ / 10),
             '+2sd: {:.2f}'.format(sdh))
    plt.tight_layout()
    save_figure(pid + "_hist_" + str(idx))

def provider_names_from_report(rep):
    """Return the distinct ``product_name`` values of ``rep`` as a list, in first-seen order."""
    distinct_names = rep["product_name"].drop_duplicates()
    return distinct_names.tolist()

def header(doc):
    """Render the page header: the report title cell next to the logo image."""
    with doc.tag("table", klass="header"), doc.tag("tr"):
        doc.line("th", "Data Quality Report")
        with doc.tag("td"):
            doc.stag("img", klass="header_logo", src="demyst_logo_gray.svg")

def page_break(doc):
    """Emit the two placeholder divs (``page_break`` then ``page_break_spacer``) that separate pages."""
    for css_class in ("page_break", "page_break_spacer"):
        doc.line("div", NBSP, klass=css_class)

def copy_files(here_dir):
    """Copy the static report assets from ``<here_dir>/files`` into ``report/``.

    The ``report`` directory is resolved relative to the current working
    directory and must already exist.
    """
    asset_names = (
        "style.css",
        "header.html",
        "footer.html",
        "demyst_logo_gray.svg",
    )
    for asset in asset_names:
        shutil.copy(here_dir + "/files/" + asset, "report/" + asset)

def filter_report(rep):
    """Return the attribute fill-rate ranking used by the "Attributes" section.

    Rows for the bookkeeping attributes ``client_id``, ``row_id``, ``error``
    and ``is_hit`` are removed, only the product name, attribute name and
    fill rate columns are kept, and the result is sorted by descending fill
    rate with missing rates last. ``rep`` itself is not modified.
    """
    excluded_attributes = ['client_id', 'row_id', 'error', 'is_hit']
    is_reportable = ~rep['attribute_name'].isin(excluded_attributes)
    ranking_columns = ['product_name', 'attribute_name', 'attribute_fill_rate']
    reportable = rep.loc[is_reportable, ranking_columns]
    return reportable.sort_values(by='attribute_fill_rate', ascending=False, na_position='last')

