# %%
import pandas as pd

# %%
from IPython.display import display, Markdown

# %% [markdown]
# # **Intro & Helper**

# %%
import pandas as pd
from IPython.display import display, Markdown

def helper(value=None):
    """Render Pivot Pal help as Markdown in the notebook output.

    Without a keyword (``None`` or any other falsy value) a welcome message
    listing every documented function is displayed. With a keyword, each
    function whose signature *or* description contains the keyword
    (case-insensitive) is displayed together with its description.

    Args:
        value: Optional search keyword. Non-string values are converted with
            ``str`` before matching.

    Returns:
        None. All output goes through ``IPython.display.display``.
    """
    descriptions = {
        "pp.distribution(df, \"column_name\")": "Displays the distribution of values for a given column.",
        "pp.range(df)": "Shows the minimum and maximum values for each column in the dataset.",
        "pp.unique(df)": "Provides a count of unique values for each column.",
        "pp.summarise(df)": "Summarizes numeric columns with count, sum, mean, median, max, and min values.",
        "pp.missing(df)": "Provides a summary of missing values for each column in the dataset.",
        "pp.zeros(df)": "Summarizes columns with zero values and their respective counts.",
        "pp.datatypes(df)": "Summarizes the column data types with their counts and percentage distribution.",
    }

    if not value:
        functions_list = "\n".join(f"- {func}" for func in descriptions)
        standard_message = f"""
---
# Pivot Pal Helper:
---
## Welcome to 'Pivot Pal' Helper!
To get detailed descriptions of specific functions, provide a keyword inside the parentheses.
Example: `pivot_pal.help('value')` will show functions related to value statistics.
---
### Available Functions:
{functions_list}
---
### Try searching with keywords like 'missing', 'value', 'duplicate', etc.
"""
        display(Markdown(standard_message))
        return

    # Match against signature and description, ignoring case, so that the
    # keywords advertised in the welcome text (e.g. 'value') find results.
    needle = str(value).lower()
    filtered_descriptions = {
        func: desc
        for func, desc in descriptions.items()
        if needle in func.lower() or needle in desc.lower()
    }
    if not filtered_descriptions:
        display(Markdown(f"## No functions found for the keyword '{value}'.\n\nTry another keyword."))
        return

    entries = "".join(
        f"### **{func}**:\n\n    {desc}\n\n"
        for func, desc in filtered_descriptions.items()
    )
    display(Markdown(f"## Helper: '{value}'\n---\n\n" + entries))


# %%
# Demo cell: display the help entries matching the keyword "missing".
helper("missing")

# %% [markdown]
# ---
# # **Dataset**
# ---
# 

# %% [markdown]
# ### **Value Distribution Table:** `pp.distribution(df, "column_name")`
# - Column Name, Count and Distribution
# 

# %%
def distribution(df, column_name):
    """Tabulate how often each value of ``column_name`` occurs in ``df``.

    Args:
        df: DataFrame that contains the column.
        column_name: Label of the column to tabulate.

    Returns:
        DataFrame with the distinct values under ``column_name``, their
        number of occurrences under ``'count'`` and their share of
        ``len(df)`` in percent (2 decimals) under ``'%'``, ordered by
        descending count.
    """
    # Occurrences per distinct value
    tally = df[column_name].value_counts()

    # Share of each value relative to the total number of rows in df
    share = tally.div(len(df)).mul(100).round(2)

    table = pd.DataFrame({
        column_name: tally.index,
        'count': tally.values,
        '%': share.values,
    })
    return table.sort_values(by='count', ascending=False)

# %% [markdown]
# ### **Value Range Table:** `pp.range(df)`
# 

# %%
def range(df):
    """Report the smallest and largest value of every column in ``df``.

    Returns:
        DataFrame indexed by column label with the columns ``'Min Value'``
        and ``'Max Value'``.
    """
    # NOTE: this name shadows the builtin ``range`` inside this module.
    lowest = df.min()
    highest = df.max()
    return pd.DataFrame({'Min Value': lowest, 'Max Value': highest})

# %%
def unique(df):
    """Count the distinct values in every column of ``df``.

    Returns:
        DataFrame with the columns ``'Column Name'`` and ``'Unique Count'``,
        ordered from the most to the fewest distinct values.
    """
    # Number of distinct values per column
    per_column = df.nunique()

    table = pd.DataFrame({
        'Column Name': per_column.index,
        'Unique Count': per_column.values,
    })
    return table.sort_values(by='Unique Count', ascending=False)


# %%
def summarise(df):
    """Summarise every numeric column of ``df`` with basic statistics.

    Args:
        df: DataFrame to summarise. Non-numeric columns are ignored.

    Returns:
        DataFrame with one row per numeric column and the columns
        ``'Column Name'``, ``'Count'``, ``'Sum'``, ``'Mean'``, ``'Median'``,
        ``'Max'`` and ``'Min'``.
    """
    # 'number' selects every numeric dtype. The Python types ``float``/``int``
    # only match the 32/64-bit variants, which silently dropped columns
    # stored as e.g. int8, int16 or unsigned integers.
    numeric_df = df.select_dtypes(include='number')

    # Dict insertion order already yields the intended column order.
    return pd.DataFrame({
        'Column Name': numeric_df.columns,
        'Count': numeric_df.count().values,
        'Sum': numeric_df.sum().values,
        'Mean': numeric_df.mean().values,
        'Median': numeric_df.median().values,
        'Max': numeric_df.max().values,
        'Min': numeric_df.min().values,
    })


# %% [markdown]
# ---
# # **Missing**
# ---
# 
# 

# %% [markdown]
# ---
# **Missing Stats Table:** `pp.missing(df)`
# - Column Name, Count and Distribution
# ---

# %%
def missing(df):
    """List the columns of ``df`` that contain missing values.

    Returns:
        DataFrame with the columns ``'Column Name'``, ``'Missing Count'`` and
        ``'Missing %'`` (share of ``len(df)``, rounded to whole percent).
        Columns without missing values are left out; rows are ordered by
        descending ``'Missing %'``.
    """
    # Null cells per column and their share of all rows
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df) * 100).round(0)

    table = pd.DataFrame({
        'Column Name': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': null_share.values,
    })

    # Keep only the columns that actually have gaps
    has_gaps = table['Missing Count'] > 0
    return table[has_gaps].sort_values(by='Missing %', ascending=False)

# %% [markdown]
# ---
# # **Zeros**
# ---

# %% [markdown]
# **Zero Values Table:** `pp.zeros(df)`
# 

# %%
def zeros(df):
    """Count the cells equal to zero in every column of ``df``.

    Returns:
        DataFrame indexed by column label with ``'Zero Count'`` and
        ``'Zero %'`` (share of ``len(df)`` in percent, 2 decimals), ordered
        by descending ``'Zero %'``.
    """
    # Cells comparing equal to 0, summed per column
    hits = df.eq(0).sum()

    # Share of zero cells relative to the number of rows
    share = hits.div(len(df)).mul(100).round(2)

    table = pd.DataFrame({'Zero Count': hits, 'Zero %': share})
    return table.sort_values(by='Zero %', ascending=False)


# %% [markdown]
# # **Datatypes**

# %%
def datatypes(df):
    """Summarise which data types the columns of ``df`` use.

    Returns:
        DataFrame with one row per dtype and the columns ``'Data Type'``,
        ``'Column Count'`` and ``'% Distribution'`` (share of all columns,
        rounded to whole percent).
    """
    column_types = df.dtypes

    # How many columns use each dtype, and what share of all columns that is
    type_tally = column_types.value_counts()
    type_share = type_tally.div(len(column_types)).mul(100).round()

    return pd.DataFrame({
        'Data Type': type_tally.index,
        'Column Count': type_tally.values,
        '% Distribution': type_share.values,
    })



