import pandas as pd
import numpy as np
import io
import math
from typing import Dict, List, Optional, Tuple, Any
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import compute_class_weight
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt 
import seaborn as sns


# Apply seaborn's "whitegrid" style once, at import time, so that every plot
# produced by the helpers below shares the same look.
sns.set_style("whitegrid")


class Vizion:

    @staticmethod
    def quick_summary(df: pd.DataFrame, show: bool = True):
        """
        Simple clean summary.
        Prints:
            - Basic stats
            - Top missing columns
            - Top missing categorical columns
        Returns dict only when show=False.
        """

        n_rows, n_cols = df.shape

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
        datetime_cols = df.select_dtypes(include=['datetime', 'datetime64[ns]']).columns.tolist()

        # Missing %
        missing = df.isnull().sum()
        missing_pct = (missing / n_rows * 100).round(2)

        # Top missing (overall)
        top_missing = (
            missing_pct[missing_pct > 0]
            .sort_values(ascending=False)
            .head(10)
            .to_dict()
        )

        # Top missing in categorical only
        cat_missing = missing_pct[categorical_cols]
        top_cat_missing = (
            cat_missing[cat_missing > 0]
            .sort_values(ascending=False)
            .head(10)
            .to_dict()
        )

        summary = {
            "rows": n_rows,
            "columns": n_cols,
            "numeric_columns": len(numeric_cols),
            "categorical_columns": len(categorical_cols),
            "datetime_columns": len(datetime_cols),
            "missing_columns": int((missing > 0).sum()),
            "duplicated_rows": int(df.duplicated().sum()),
            "top_missing": top_missing,
            "top_cat_missing": top_cat_missing
        }

        if show:
            print("===== SIMPLE DATA SUMMARY =====")
            print(f"Rows       : {n_rows}")
            print(f"Columns    : {n_cols}")
            print(f"Numeric    : {len(numeric_cols)}")
            print(f"Categorical: {len(categorical_cols)}")
            print(f"Datetime   : {len(datetime_cols)}")
            print(f"Missing columns: {summary['missing_columns']}")
            print(f"Duplicated rows: {summary['duplicated_rows']}\n")

            print("Top Missing Columns (%):")
            for col, pct in top_missing.items():
                print(f"  - {col}: {pct}%")

            print("\nTop Missing Categorical Columns (%):")
            if len(top_cat_missing) > 0:
                for col, pct in top_cat_missing.items():
                    print(f"  - {col}: {pct}%")
            else:
                print("  No missing categorical columns")

            return None  # Prevent Jupyter from printing dict

        return summary
    
    @staticmethod
    def missing_value_summary(df):
        """
        Detects missing values in the DataFrame and gives recommendations.
        
        Returns a DataFrame with:
        - Column Name
        - Data Type
        - Missing Count
        - % Missing
        - Recommended Action
        """
        # Missing count
        missing_count = df.isnull().sum()
        missing_count = missing_count[missing_count > 0].sort_values(ascending=False)
        
        # Data type
        dtype = df.dtypes[missing_count.index]
        
        # Missing percentage
        missing_percent = (missing_count / len(df)) * 100
        
        # Recommended action (basic rules for Ames dataset)
        action = []
        for col in missing_count.index:
            if missing_percent[col] > 50:
                # High missing → categorical features: fill with 'None'
                if df[col].dtype == 'object':
                    action.append('Fill with "None" (absence meaningful)')
                else:
                    action.append('Consider dropping or fill with 0')
            elif 5 < missing_percent[col] <= 50:
                # Medium missing
                if df[col].dtype == 'object':
                    action.append('Fill with "None" or mode')
                else:
                    action.append('Fill with median')
            else:
                # Low missing
                if df[col].dtype == 'object':
                    action.append('Fill with mode')
                else:
                    action.append('Fill with median')
        
        # Combine into DataFrame
        missing_df = pd.DataFrame({
            'Column': missing_count.index,
            'Data Type': dtype.values,
            'Missing Count': missing_count.values,
            'Missing %': missing_percent.values,
            'Recommended Action': action
        })
        
        return missing_df

    @staticmethod
    def get_column_types(df: pd.DataFrame):
            print("""
                num_col = df.select_dtypes(include=[np.number]).columns.tolist()
                cat_col = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
                mis_col = df.columns[df.isnull().sum() > 0].tolist()
                """)

    @staticmethod 
    def plot_columns(df, columns, plot_type='count', n_cols=3, main_title=None, target=None, theme='Set2'):
    
    
        """
        Smart plotting function for EDA — creates multiple plots in grid layout.

        Parameters:
        -----------
        df : pandas.DataFrame
            Dataset to plot.
        columns : list
            List of column names to visualize.
        plot_type : str
            Type of plot. Choose from: 'count', 'hist', 'box', 'violin'
        n_cols : int
            Number of plots per row (default = 3)
        main_title : str
            Optional main title for the figure.
        target : str or None
            Optional target variable for hue (e.g., 'y')
        theme : str
            Choose plot color theme: 'Set2', 'coolwarm', or 'skyblue'
        """
        import math


        # Validate theme
        valid_themes = ['Set2', 'coolwarm', 'skyblue']
        if theme not in valid_themes:
            print(f"⚠️ Invalid theme! Using default 'Set2'.")
            theme = 'Set2'

        # Handle theme settings
        if theme == 'skyblue':
            palette = None
            color = 'skyblue'
        else:
            palette = theme
            color = None

        # Safety: Limit max plots
        max_plots = 12
        if len(columns) > max_plots:
            print(f"⚠️ Too many columns! Showing first {max_plots} only.")
            columns = columns[:max_plots]

        n_plots = len(columns)
        n_rows = math.ceil(n_plots / n_cols)

        plt.figure(figsize=(5 * n_cols, 4 * n_rows))

        for i, col in enumerate(columns, 1):
            plt.subplot(n_rows, n_cols, i)

            # Select hue only if target is provided
            hue_param = target if target else None

            # ---- Choose plot type ----
            if plot_type == 'count':
                sns.countplot(data=df, x=col, hue=hue_param, palette=palette, color=color)
            elif plot_type == 'hist':
                sns.histplot(data=df, x=col, hue=hue_param, kde=True, palette=palette, color=color)
            elif plot_type == 'box':
                if hue_param:
                    sns.boxplot(data=df, x=hue_param, y=col, palette=palette, color=color)
                else:
                    sns.boxplot(data=df, y=col, palette=palette, color=color)
            elif plot_type == 'violin':
                if hue_param:
                    sns.violinplot(data=df, x=hue_param, y=col, palette=palette, color=color)
                else:
                    sns.violinplot(data=df, y=col, palette=palette, color=color)
            else:
                raise ValueError("Invalid plot_type. Use: 'count', 'hist', 'box', or 'violin'")

            plt.title(f"{col}", fontsize=12)
            plt.xticks(rotation=45)
            plt.tight_layout()

        if main_title:
            plt.suptitle(main_title, fontsize=16, y=1.02)

        plt.show()


    @staticmethod     
    def plot_numeric_eda(df, 
                        columns=None, 
                        plot_types=None,
                        n_cols=3,
                        target=None,
                        palette='Set2',
                        color=None,
                        show_corr=True,
                        show_pairplot=False,
                        max_plots=12,
                        sample_size=500,
                        main_title=None):
        """
    User Manual: plot_numeric_eda

    Description:
    ------------
    The `plot_numeric_eda` function is an all-in-one tool for visual exploratory data analysis (EDA) 
    of numeric columns in a pandas DataFrame. It generates multiple types of plots for each numeric 
    column, and also provides optional correlation heatmaps and pairplots. 
    It is designed to be safe for users who may pass all columns (numeric and non-numeric), 
    automatically filtering to numeric columns.

    Function Signature:
    ------------------
    plot_numeric_eda(df, 
                    columns=None, 
                    plot_types=None,
                    n_cols=3,
                    target=None,
                    palette='Set2',
                    color=None,
                    show_corr=True,
                    show_pairplot=False,
                    max_plots=12,
                    sample_size=500,
                    main_title=None)

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset containing numeric columns for analysis.

    columns : list or None, default=None
        List of column names to plot. If None, the function automatically selects all numeric columns.
        Only numeric columns are considered; non-numeric columns are ignored.

    plot_types : list of str or None, default=None
        Specifies which plot types to generate for each column. Options include:
            - 'hist'   : Histogram of values
            - 'kde'    : Kernel density estimate (smooth distribution)
            - 'box'    : Boxplot (with outliers)
            - 'violin' : Violin plot (distribution + density)
            - 'scatter': Scatter plot against a numeric target
            - 'line'   : Line plot (useful for trends over index or time)
            - 'area'   : Area plot (cumulative numeric trends)
        Default: ['hist', 'box', 'kde']

    n_cols : int, default=3
        Number of plots per row in the grid layout. Determines subplot arrangement.

    target : str or None, default=None
        Optional numeric column to use as the y-axis in scatter plots.
        Ignored for other plot types.

    palette : str, default='Set2'
        Seaborn color palette to use for plots. Can be any valid seaborn palette name.

    color : str or None, default=None
        Optional color to override the palette for all plots. If specified, this color is used.

    show_corr : bool, default=True
        Whether to display a correlation heatmap of all numeric columns at the end.

    show_pairplot : bool, default=False
        Whether to display a seaborn pairplot for numeric columns at the end.
        Large datasets will be sampled to 'sample_size' rows to avoid performance issues.

    max_plots : int, default=12
        Maximum number of numeric columns to plot. If the DataFrame has more, only the first
        'max_plots' numeric columns are used.

    sample_size : int, default=500
        Maximum number of rows to sample for the pairplot, to prevent heavy plotting for large datasets.

    main_title : str or None, default=None
        Optional main title to display on top of each plot grid, prefixed with the plot type.

    Usage Example:
    --------------
    import pandas as pd
    df = pd.read_csv("financial_data.csv")

    # Plot histograms, boxplots, and KDE for numeric columns
    plot_numeric_eda(df, plot_types=['hist','box','kde'], main_title="Numeric EDA")

    # Plot all numeric columns with scatter plots against a target variable
    plot_numeric_eda(df, columns=df.columns.to_list(), plot_types=['scatter'], target='profit')

    # Include correlation heatmap and pairplot
    plot_numeric_eda(df, plot_types=['hist','box'], show_corr=True, show_pairplot=True)

    Notes:
    ------
    1. The function automatically filters out non-numeric columns to prevent errors.
    2. Scatter plots require 'target' to be numeric; otherwise, a warning message is displayed.
    3. Pairplots and correlation heatmaps are optional and can be disabled to save computation time.
    4. The grid layout adapts to the number of columns and 'n_cols' parameter.
    5. 'max_plots' prevents overwhelming the notebook with too many plots at once.
    """

        import matplotlib.pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        import math

        # Default plots
        if plot_types is None:
            plot_types = ['hist', 'box', 'kde']

        # Filter numeric columns
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns.tolist()
        else:
            columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])]
            if len(columns) == 0:
                print("⚠️ No numeric columns to plot after filtering!")
                return

        # Limit max plots
        if len(columns) > max_plots:
            print(f"⚠️ Too many columns! Showing first {max_plots} only.")
            columns = columns[:max_plots]

        n_plots = len(columns)
        n_rows = math.ceil(n_plots / n_cols)

        # Loop through each plot type
        for plot_type in plot_types:
            plt.figure(figsize=(5 * n_cols, 4 * n_rows))
            for i, col in enumerate(columns, 1):
                plt.subplot(n_rows, n_cols, i)

                if plot_type in ['hist', 'kde']:
                    if plot_type == 'hist':
                        sns.histplot(df[col].dropna(), kde=False, palette=palette, color=color)
                    else:
                        sns.kdeplot(df[col].dropna(), fill=True, palette=palette, color=color)
                elif plot_type == 'box':
                    sns.boxplot(y=df[col].dropna(), palette=palette, color=color)
                elif plot_type == 'violin':
                    sns.violinplot(y=df[col].dropna(), palette=palette, color=color)
                elif plot_type == 'scatter':
                    if target and target in df.columns and pd.api.types.is_numeric_dtype(df[target]):
                        sns.scatterplot(x=df[col], y=df[target], palette=palette, color=color)
                    else:
                        plt.text(0.5, 0.5, "Scatter requires numeric target", ha='center')
                elif plot_type == 'line':
                    sns.lineplot(x=df.index, y=df[col], palette=palette, color=color)
                elif plot_type == 'area':
                    df[col].dropna().plot.area(figsize=(5, 4), color=color)
                else:
                    plt.text(0.5, 0.5, f"Unsupported plot: {plot_type}", ha='center')

                plt.title(f"{col} ({plot_type})")
                plt.xticks(rotation=45)
                plt.tight_layout()

            if main_title:
                plt.suptitle(f"{main_title} - {plot_type}", fontsize=16, y=1.02)
            plt.show()

        # Correlation heatmap
        if show_corr and len(columns) > 1:
            plt.figure(figsize=(10, 8))
            sns.heatmap(df[columns].corr(), annot=True, fmt=".2f", cmap='coolwarm')
            plt.title("Correlation Heatmap")
            plt.show()

        # Pairplot
        if show_pairplot and len(columns) > 1:
            sns.pairplot(df[columns].sample(min(len(df), sample_size)))
            plt.show()

    @staticmethod
    def handle_outliers(
    df,
    cols=None,
    method="auto",
    threshold=1.5,
    visualize=True,
    n_col=3,
    random_state=42,
    verbose=True,
    force_include=None,
    skip=None
        ):

        """
        Smart Outlier Handler v3.1 (Offline Version)
        --------------------------------------------
        - Everything from v3
        - Added `skip` parameter to exclude target or unwanted columns
        """

        # # 1️⃣ Backup
        # bkp = df.copy()

        # 2️⃣ Select columns
        if cols is None:
            cols = df.select_dtypes(include=np.number).columns.tolist()
        elif isinstance(cols, str):
            cols = [cols]

        # Convert skip and force_include to lists if needed
        if skip is None:
            skip = []
        elif isinstance(skip, str):
            skip = [skip]

        if force_include is None:
            force_include = []
        elif isinstance(force_include, str):
            force_include = [force_include]

        # 3️⃣ Identify skipped columns
        skip_cols = []
        reasons = {}

        for c in cols:
            if c in skip:
                skip_cols.append(c)
                reasons[c] = "Skipped manually by user (e.g., target variable)."
                continue

            if c in force_include:
                continue
            if "id" in c.lower():
                skip_cols.append(c)
                reasons[c] = "Identifier column — unique values not useful for outlier detection."
            elif df[c].nunique() < 10:
                skip_cols.append(c)
                reasons[c] = f"Categorical/ordinal variable with {df[c].nunique()} unique values."
            elif df[c].isnull().all():
                skip_cols.append(c)
                reasons[c] = "Column contains only missing values."
            elif df[c].std() == 0:
                skip_cols.append(c)
                reasons[c] = "Constant column — all values are same."
            elif df[c].std() < 1e-6:
                skip_cols.append(c)
                reasons[c] = "Very low variance — not meaningful for outlier detection."

        proc_cols = [c for c in cols if c not in skip_cols or c in force_include]

        # 4️⃣ Print skipped columns + reasoning
        if verbose:
            print(f"📦 Backup created as variable 'bkp'")
            print(f"🧮 Processing {len(proc_cols)} columns (Skipped {len(skip_cols)})\n")

            if skip_cols:
                print("⛔ Skipped Columns (Offline Insights):")
                for c in skip_cols:
                    print(f"   • {c}: {reasons[c]}")
                print()

            if force_include:
                print(f"✅ Force-included columns: {force_include}\n")

        
        # 5️⃣ IsolationForest mode
        if method == "isolation":
            if verbose:
                print("🌲 Using IsolationForest for multivariate outlier detection...\n")
            iso = IsolationForest(contamination=0.02, random_state=random_state)
            preds = iso.fit_predict(df[proc_cols])
            df = df[preds == 1]
            if verbose:
                print(f"✅ Removed {(preds == -1).sum()} multivariate outliers. New shape: {df.shape}")
            return df

        # 6️⃣ Process each column
        viz_data = {}
        for col in proc_cols:
            if df[col].nunique() <= 1 or df[col].isnull().all():
                continue

            col_before = df[col].copy()
            skew_val = df[col].skew()
            adj_threshold = 3 if abs(skew_val) > 1 else threshold

            Q1, Q3 = df[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower = Q1 - adj_threshold * IQR
            upper = Q3 + adj_threshold * IQR

            outliers = ((df[col] < lower) | (df[col] > upper)).sum()
            outlier_percent = outliers / len(df) * 100

            # auto-decide method
            if method == "auto":
                if len(df) < 10000 or outlier_percent > 10:
                    mode_used = "cap"
                else:
                    mode_used = "remove"
            else:
                mode_used = method

            if verbose:
                print(f"\n📊 Column: {col}")
                print(f"   Skew: {skew_val:.2f}, Outliers: {outliers} ({outlier_percent:.2f}%)")
                print(f"   Method used: {mode_used.upper()}, Threshold: {adj_threshold}")

            # Apply handling
            if mode_used == "remove":
                df = df[(df[col] >= lower) & (df[col] <= upper)]
            elif mode_used == "cap":
                df[col] = np.where(df[col] < lower, lower,
                                np.where(df[col] > upper, upper, df[col]))

            if visualize:
                viz_data[col] = (col_before, df[col].copy())

        # 7️⃣ Combined visualization
        if visualize and viz_data:
            n = len(viz_data)
            n_rows = int(np.ceil(n / n_col))
            fig, axes = plt.subplots(n_rows, n_col, figsize=(5*n_col, 4*n_rows))
            axes = axes.flatten()
            i = 0
            for col, (before, after) in viz_data.items():
                data = pd.DataFrame({'Before': before, 'After': after})
                sns.boxplot(data=data, ax=axes[i])
                axes[i].set_title(col)
                i += 1
            for j in range(i, len(axes)):
                axes[j].set_visible(False)
            plt.tight_layout()
            plt.suptitle("📊 Before vs After Outlier Handling", fontsize=16, y=1.02)
            plt.show()

        if verbose:
            print("\n✅ Outlier handling complete!")
            print(f"Final dataset shape: {df.shape}")

        return df

    @staticmethod
    def plot_categorical_eda(df, 
                         columns=None, 
                         plot_types=None, 
                         n_cols=3, 
                         target=None, 
                         palette='Set2', 
                         color=None, 
                         max_plots=12,
                         main_title=None,
                         top_n=20):
        """
        Enhanced categorical EDA plotting function.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataset containing categorical columns.
        columns : list or None
            List of categorical columns to plot. If None, all categorical columns are selected automatically.
        plot_types : list of str or None
            Types of plots per column. Options:
                - 'count'  : Count/frequency plot
                - 'bar'    : Aggregated numeric target per category
                - 'box'    : Boxplot of numeric target per category
                - 'violin' : Violin plot of numeric target per category
                - 'pie'    : Pie chart of category proportions
            Default = ['count']
        n_cols : int, default=3
            Number of plots per row (grid layout)
        target : str or None
            Optional numeric target column used in 'bar', 'box', and 'violin' plots
        palette : str, default='Set2'
            Seaborn color palette
        color : str or None, default=None
            Override color for all plots
        max_plots : int, default=12
            Maximum number of categorical columns to plot
        main_title : str or None, default=None
            Optional main title for the figure
        top_n : int, default=20
            For count plots, number of top categories to display
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        import pandas as pd
        import math

        # Default plot types
        if plot_types is None:
            plot_types = ['count']

        # Select categorical columns
        if columns is None:
            columns = df.select_dtypes(include=['object','category','bool']).columns.tolist()
        else:
            columns = [c for c in columns if str(df[c].dtype) in ['object','category','bool']]

        if len(columns) == 0:
            print("⚠️ No categorical columns to plot!")
            return

        # Limit max plots
        if len(columns) > max_plots:
            print(f"⚠️ Too many columns! Showing first {max_plots} only.")
            columns = columns[:max_plots]

        n_plots = len(columns)
        n_rows = math.ceil(n_plots / n_cols)

        # Loop through each plot type
        for plot_type in plot_types:
            plt.figure(figsize=(5 * n_cols, 4 * n_rows))
            for i, col in enumerate(columns, 1):
                plt.subplot(n_rows, n_cols, i)

                if plot_type == 'count':
                    vc = df[col].value_counts(dropna=False).iloc[:top_n]
                    sns.barplot(x=vc.values, y=vc.index, palette=palette, color=color)
                elif plot_type == 'pie':
                    vc = df[col].value_counts(dropna=False).iloc[:top_n]
                    plt.pie(vc, labels=vc.index, autopct='%1.1f%%')
                elif plot_type in ['bar','box','violin']:
                    if target is None or target not in df.columns:
                        plt.text(0.5,0.5,"Requires numeric target",ha='center')
                    else:
                        if plot_type == 'bar':
                            sns.barplot(x=col, y=target, data=df, palette=palette, color=color)
                        elif plot_type == 'box':
                            sns.boxplot(x=col, y=target, data=df, palette=palette, color=color)
                        elif plot_type == 'violin':
                            sns.violinplot(x=col, y=target, data=df, palette=palette, color=color)
                else:
                    plt.text(0.5,0.5,f"Unsupported plot: {plot_type}",ha='center')

                plt.title(f"{col} ({plot_type})")
                plt.xticks(rotation=45)
                plt.tight_layout()

            if main_title:
                plt.suptitle(f"{main_title} - {plot_type}", fontsize=16, y=1.02)
            plt.show()
    
    @staticmethod
    def handle_missing(df, drop_threshold=0.75, numeric_strategy='median', categorical_strategy='mode', datetime_strategy='ffill'):
        """
        Handle missing values in a DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input dataframe to process
        drop_threshold : float, default=0.75
            Drop columns with missing percentage >= drop_threshold
        numeric_strategy : str, default='median'
            How to fill numeric columns. Options: 'median' or 'mean'
        categorical_strategy : str, default='mode'
            How to fill categorical columns. Options: 'mode' or 'placeholder'
        datetime_strategy : str, default='ffill'
            How to fill datetime columns. Options: 'ffill', 'bfill', 'drop'

        Returns
        -------
        df_clean : pandas.DataFrame
            DataFrame after handling missing values
        report : dict
            Report of columns dropped and fill strategy used for each column
        """
        import pandas as pd
        import numpy as np

        df_clean = df.copy()
        n_rows = df_clean.shape[0]
        report = {'dropped_columns': [], 'filled_columns': {}}

        # Step 1: Drop columns with too many missing values
        missing_pct = df_clean.isnull().mean()
        drop_cols = missing_pct[missing_pct >= drop_threshold].index.tolist()
        if drop_cols:
            df_clean = df_clean.drop(columns=drop_cols)
            report['dropped_columns'] = drop_cols

        # Step 2: Fill missing values for remaining columns
        for col in df_clean.columns:
            if df_clean[col].isnull().sum() == 0:
                continue

            dtype = df_clean[col].dtype

            if np.issubdtype(dtype, np.number):
                if numeric_strategy == 'median':
                    df_clean[col] = df_clean[col].fillna(df_clean[col].median())
                    report['filled_columns'][col] = 'median'
                elif numeric_strategy == 'mean':
                    df_clean[col] = df_clean[col].fillna(df_clean[col].mean())
                    report['filled_columns'][col] = 'mean'
            elif np.issubdtype(dtype, np.datetime64):
                if datetime_strategy == 'ffill':
                    df_clean[col] = df_clean[col].fillna(method='ffill').fillna(method='bfill')
                    report['filled_columns'][col] = 'ffill/bfill'
                elif datetime_strategy == 'bfill':
                    df_clean[col] = df_clean[col].fillna(method='bfill')
                    report['filled_columns'][col] = 'bfill'
                elif datetime_strategy == 'drop':
                    df_clean = df_clean.dropna(subset=[col])
                    report['filled_columns'][col] = 'dropped rows'
            else:  # categorical / object / bool
                if categorical_strategy == 'mode':
                    mode_val = df_clean[col].mode(dropna=True)
                    if not mode_val.empty:
                        df_clean[col] = df_clean[col].fillna(mode_val[0])
                        report['filled_columns'][col] = 'mode'
                    else:
                        df_clean[col] = df_clean[col].fillna('missing')
                        report['filled_columns'][col] = 'placeholder'
                elif categorical_strategy == 'placeholder':
                    df_clean[col] = df_clean[col].fillna('missing')
                    report['filled_columns'][col] = 'placeholder'

        return df_clean, report

    @staticmethod
    def generate_doc(filename: Optional[str] = None):
        import inspect
        doc_text = ""
        for name, method in inspect.getmembers(Vizion, predicate=inspect.isfunction):
            doc_text += f"### {name}\n{inspect.getdoc(method)}\n\n"
        if filename:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(doc_text)
            print(f"Documentation saved to {filename}")
        else:
            print(doc_text)




# TODO: add a function that recommends a workflow for a given dataset.

# TODO: add a function that gathers the documentation of every method in the class.


    '''
    📌 Verdict: Are You Missing Anything?
    ✔️ You currently have:

    Missing value detection

    Missing value handling

    Outlier detection & handling

    Numeric & categorical plots

    Dataset summary

    Column type extraction

    correlation / multicollinearity

    ❗ Missing critical components:
            - Future Scope / Planned Enhancements

    Add correlation heatmaps and multicollinearity detection

    Implement encoding methods (one-hot, label encoding)

    Add scaling options for numeric features

    Introduce datetime handling utilities

    Enable automated EDA report generation

    Include rare category detection

    Add imbalance analysis for classification targets
    '''




    def help_steps():
        """
        Stepwise guide for using the Vizion class.
        Prints recommended sequence of functions for a new dataset.
        """
        steps = [
            "STEP 1: Load your dataset into a pandas DataFrame (df).",
            "STEP 2: Quick overview of data:",
            "        Vizion.quick_summary(df)  # basic stats and missing info",
            "STEP 3: Check missing values in detail:",
            "        Vizion.missing_value_summary(df)",
            "STEP 4: Set global column types (numeric, categorical, missing):",
            "        Vizion.get_column_types(df)",
            "STEP 5: Handle missing values:",
            "        df_clean, report = Vizion.handle_missing(df)  # drops/fills missing",
            "STEP 6: Explore numeric columns with visual EDA:",
            "        Vizion.plot_numeric_eda(df_clean)  # hist, box, kde, scatter",
            "STEP 7: Explore categorical columns with visual EDA:",
            "        Vizion.plot_categorical_eda(df_clean)  # count, bar, box, pie",
            "STEP 8: Detect and handle outliers:",
            "        df_clean = Vizion.handle_outliers(df_clean)  # cap/remove outliers",
            "STEP 9: After EDA, you can perform feature engineering, encoding, scaling etc.",
            "STEP 10: Build models (optional guidance):",
            "        Use 'next_steps(df_clean, target=\"your_target\")' to see recommended modeling steps."
        ]

        print("=== VIZION CLASS STEP-BY-STEP GUIDE ===")
        for s in steps:
            print(s)

    pass