
#___________________________Statistics__________________________________
"""
statistics_toolkit.py

Statistics (Hybrid Edition v3.0)

A single-file, production-oriented Statistics class that provides an extensive
suite of statistical methods and plotting utilities. The implementation is
built on a lightweight core (NumPy + Pandas + Matplotlib) and automatically
enables advanced functionality when optional libraries are present (SciPy,
Statsmodels, Seaborn).

Design goals:
- Run in minimal environments (NumPy + Pandas + Matplotlib) while gracefully
  enabling pro features when optional libs are installed.
- Accept common input types (list, np.ndarray, pd.Series, pd.DataFrame).
- Provide broad statistical coverage (means, dispersion, tests, regression,
  distributions, time-series) and versatile plotting (single/multi-frame).

Author: Generated for Ibrahim Shahid
"""


import math
from typing import Optional, Sequence, Tuple, List, Dict, Any, Union
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional libraries (auto-detected)
try:
    from scipy import stats
    from scipy.stats import gaussian_kde
except Exception:
    stats = None
    gaussian_kde = None

try:
    import statsmodels.api as sm
    import statsmodels.tsa.api as tsa
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
except Exception:
    sm = None
    tsa = None
    plot_acf = None
    plot_pacf = None

try:
    import seaborn as sns
except Exception:
    sns = None


# Type alias for the 1-D inputs accepted by Statistics.__init__, set_data and
# _to_series: plain sequences of numbers, NumPy arrays, or pandas Series.
ArrayLike = Union[Sequence[float], np.ndarray, pd.Series]


class Statistics:
    """Hybrid Statistics class with core + pro features.

    Core dependencies: numpy, matplotlib
    Optional (pro) dependencies: scipy, pandas, statsmodels, seaborn

    Instantiate with a 1-D dataset (list, numpy array, pandas Series). For
    multi-column inputs (DataFrame), several functions accept DataFrame
    directly (pairwise plots / covariance / correlation matrix).
    """

    def __init__(self, data: Optional[ArrayLike] = None, name: Optional[str] = None):
        self.name = name or 'series'
        self.data: Optional[pd.Series] = None
        if data is not None:
            self.set_data(data)

    # ------------------------------ Input handling ------------------------------
    @staticmethod
    def _to_series(data: ArrayLike) -> pd.Series:
        if isinstance(data, pd.Series):
            return data.dropna().reset_index(drop=True)
        if isinstance(data, pd.DataFrame):
            raise ValueError('Expected 1-D data; received DataFrame. Use column or pass DataFrame to multi-variate methods.')
        arr = np.asarray(data)
        if arr.ndim > 1:
            arr = arr.flatten()
        series = pd.Series(arr).replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)
        return series

    def set_data(self, data: ArrayLike):
        self.data = self._to_series(data)
        return self

    def as_array(self) -> np.ndarray:
        if self.data is None:
            raise ValueError('No data set')
        return self.data.to_numpy()

    # ------------------------------ Central tendency ------------------------------
    def arithmetic_mean(self) -> float:
        return float(self.data.mean())

    def geometric_mean(self) -> float:
        x = self.as_array()
        if np.any(x <= 0):
            raise ValueError('Geometric mean requires all positive values')
        return float(np.exp(np.mean(np.log(x))))

    def harmonic_mean(self) -> float:
        x = self.as_array()
        if np.any(x == 0):
            raise ValueError('Harmonic mean undefined for zero values')
        return float(len(x) / np.sum(1.0 / x))

    def rms(self) -> float:
        x = self.as_array()
        return float(np.sqrt(np.mean(x ** 2)))

    def weighted_mean(self, weights: Sequence[float]) -> float:
        x = self.as_array()
        w = np.asarray(weights, dtype=float)
        if len(w) != len(x):
            raise ValueError('weights must have same length as data')
        return float(np.sum(w * x) / np.sum(w))

    def trimmed_mean(self, proportion: float = 0.1) -> float:
        x = np.sort(self.as_array())
        n = len(x)
        k = int(math.floor(proportion * n))
        if k * 2 >= n:
            raise ValueError('proportion too large')
        trimmed = x[k: n - k]
        return float(np.mean(trimmed))

    def median(self) -> float:
        return float(self.data.median())

    def mode(self) -> List[float]:
        m = self.data.mode()
        return list(m.values) if not m.empty else []

    def midrange(self) -> float:
        x = self.as_array()
        return float((np.min(x) + np.max(x)) / 2.0)

    # ------------------------------ Dispersion ------------------------------
    def variance(self, ddof: int = 1) -> float:
        return float(self.data.var(ddof=ddof))

    def population_variance(self) -> float:
        return float(self.data.var(ddof=0))

    def std(self, ddof: int = 1) -> float:
        return float(self.data.std(ddof=ddof))

    def population_std(self) -> float:
        return float(self.data.std(ddof=0))

    def coefficient_of_variation(self) -> float:
        mean = self.arithmetic_mean()
        if mean == 0:
            return float('nan')
        return float(self.std() / mean)

    def range(self) -> float:
        x = self.as_array()
        return float(np.max(x) - np.min(x))

    def iqr(self) -> float:
        return float(self.data.quantile(0.75) - self.data.quantile(0.25))

    def quartile_range(self, lower: float = 0.25, upper: float = 0.75) -> float:
        return float(self.data.quantile(upper) - self.data.quantile(lower))

    def mad(self) -> float:
        x = self.as_array()
        med = np.median(x)
        return float(np.median(np.abs(x - med)))

    # ------------------------------ Shape & outliers ------------------------------
    def skewness(self) -> float:
        if stats is not None:
            return float(stats.skew(self.as_array(), nan_policy='omit'))
        return float(self.data.skew())

    def kurtosis(self) -> float:
        if stats is not None:
            return float(stats.kurtosis(self.as_array(), fisher=True, nan_policy='omit'))
        return float(self.data.kurt())

    def z_scores(self) -> np.ndarray:
        x = self.as_array()
        mu = np.mean(x)
        sd = np.std(x, ddof=1)
        return (x - mu) / sd

    def outliers_iqr(self, factor: float = 1.5) -> np.ndarray:
        q1 = self.data.quantile(0.25)
        q3 = self.data.quantile(0.75)
        iqr = q3 - q1
        low = q1 - factor * iqr
        high = q3 + factor * iqr
        return self.data[(self.data < low) | (self.data > high)].to_numpy()

    def outliers_zscore(self, threshold: float = 3.0) -> np.ndarray:
        zs = self.z_scores()
        return self.as_array()[np.abs(zs) > threshold]

    # ------------------------------ Percentiles & quantiles ------------------------------
    def percentile(self, p: float) -> float:
        return float(np.nanpercentile(self.as_array(), p))

    def quantiles(self, q: Sequence[float] = (0.25, 0.5, 0.75)) -> Dict[str, float]:
        qv = np.quantile(self.as_array(), q)
        return {f'q{int(p*100)}': float(v) for p, v in zip(q, qv)}

    # ------------------------------ Correlation & association ------------------------------
    @staticmethod
    def pearsonr(x: Sequence[float], y: Sequence[float]) -> Tuple[float, Optional[float]]:
        if stats is not None:
            r, p = stats.pearsonr(np.asarray(x), np.asarray(y))
            return float(r), float(p)
        r = np.corrcoef(np.asarray(x), np.asarray(y))[0, 1]
        return float(r), None

    @staticmethod
    def spearmanr(x: Sequence[float], y: Sequence[float]) -> Tuple[float, Optional[float]]:
        if stats is not None:
            r, p = stats.spearmanr(np.asarray(x), np.asarray(y))
            return float(r), float(p)
        xr = pd.Series(x).rank().values
        yr = pd.Series(y).rank().values
        r = np.corrcoef(xr, yr)[0, 1]
        return float(r), None

    @staticmethod
    def kendalltau(x: Sequence[float], y: Sequence[float]) -> Tuple[float, Optional[float]]:
        if stats is not None:
            r, p = stats.kendalltau(np.asarray(x), np.asarray(y))
            return float(r), float(p)
        return float('nan'), None

    # ------------------------------ Distribution & tests ------------------------------
    def fit_normal(self) -> Dict[str, float]:
        x = self.as_array()
        mu = float(np.mean(x))
        sigma = float(np.std(x, ddof=1))
        return {'mu': mu, 'sigma': sigma}

    def ks_test_normal(self) -> Dict[str, float]:
        if stats is None:
            raise RuntimeError('scipy required for ks_test_normal')
        mu, sigma = self.fit_normal()['mu'], self.fit_normal()['sigma']
        d, p = stats.kstest(self.as_array(), 'norm', args=(mu, sigma))
        return {'d_stat': float(d), 'p_value': float(p)}

    def shapiro_test(self) -> Dict[str, float]:
        if stats is None:
            raise RuntimeError('scipy required for shapiro_test')
        stat, p = stats.shapiro(self.as_array())
        return {'stat': float(stat), 'p_value': float(p)}

    def t_test_1sample(self, popmean: float = 0.0) -> Dict[str, Any]:
        x = self.as_array()
        if stats is not None:
            tstat, p = stats.ttest_1samp(x, popmean)
            return {'t_stat': float(tstat), 'p_value': float(p)}
        # fallback
        n = len(x)
        mu = float(np.mean(x))
        se = float(np.std(x, ddof=1) / math.sqrt(n))
        tstat = (mu - popmean) / se
        return {'t_stat': float(tstat), 'p_value': None}

    def t_test_ind(self, other: Sequence[float], equal_var: bool = True) -> Dict[str, Any]:
        if stats is not None:
            tstat, p = stats.ttest_ind(self.as_array(), np.asarray(other), equal_var=equal_var, nan_policy='omit')
            return {'t_stat': float(tstat), 'p_value': float(p)}
        # fallback: basic implementation
        a = self.as_array()
        b = np.asarray(other)
        na, nb = len(a), len(b)
        ma, mb = a.mean(), b.mean()
        sa2, sb2 = a.var(ddof=1), b.var(ddof=1)
        if equal_var:
            sp2 = ((na - 1) * sa2 + (nb - 1) * sb2) / (na + nb - 2)
            se = math.sqrt(sp2 * (1 / na + 1 / nb))
            df = na + nb - 2
        else:
            se = math.sqrt(sa2 / na + sb2 / nb)
            df = (sa2 / na + sb2 / nb) ** 2 / ((sa2 ** 2) / (na ** 2 * (na - 1)) + (sb2 ** 2) / (nb ** 2 * (nb - 1)))
        t_stat = (ma - mb) / se
        return {'t_stat': float(t_stat), 'df': float(df), 'p_value': None}

    def anova_oneway(self, groups: List[Sequence[float]]) -> Dict[str, float]:
        if stats is None:
            raise RuntimeError('scipy required for anova_oneway')
        fstat, p = stats.f_oneway(*groups)
        return {'f_stat': float(fstat), 'p_value': float(p)}

    # ------------------------------ Regression & modeling ------------------------------
    @staticmethod
    def ols(x: Sequence[float], y: Sequence[float], add_constant: bool = True) -> Dict[str, Any]:
        X = np.asarray(x)
        Y = np.asarray(y)
        if add_constant:
            Xmat = np.column_stack((np.ones(len(X)), X))
        else:
            Xmat = X.reshape(-1, 1)
        beta = np.linalg.lstsq(Xmat, Y, rcond=None)[0]
        yhat = Xmat @ beta
        residuals = Y - yhat
        sse = (residuals ** 2).sum()
        s2 = sse / (len(Y) - Xmat.shape[1])
        cov = s2 * np.linalg.pinv(Xmat.T @ Xmat)
        se = np.sqrt(np.diag(cov))
        return {'beta': beta, 'yhat': yhat, 'residuals': residuals, 'se': se}

    @staticmethod
    def polyfit(x: Sequence[float], y: Sequence[float], deg: int = 2) -> Dict[str, Any]:
        coeffs = np.polyfit(np.asarray(x), np.asarray(y), deg)
        p = np.poly1d(coeffs)
        yhat = p(np.asarray(x))
        residuals = np.asarray(y) - yhat
        return {'coeffs': coeffs, 'yhat': yhat, 'residuals': residuals}

    # ------------------------------ Time series helpers ------------------------------
    @staticmethod
    def rolling_mean(series: Sequence[float], window: int = 5, center: bool = False) -> pd.Series:
        s = pd.Series(series).dropna()
        return s.rolling(window=window, center=center).mean()

    @staticmethod
    def ewma(series: Sequence[float], span: int = 12) -> pd.Series:
        s = pd.Series(series).dropna()
        return s.ewm(span=span, adjust=False).mean()

    @staticmethod
    def acf(series: Sequence[float], nlags: int = 40) -> Tuple[np.ndarray, np.ndarray]:
        s = pd.Series(series).dropna()
        n = len(s)
        acf_vals = np.array([s.autocorr(lag=i) for i in range(min(n - 1, nlags + 1))])
        lags = np.arange(len(acf_vals))
        return lags, acf_vals

    @staticmethod
    def pacf(series: Sequence[float], nlags: int = 40) -> Tuple[np.ndarray, np.ndarray]:
        if plot_pacf is not None and tsa is not None:
            vals = tsa.stattools.pacf(np.asarray(series), nlags=nlags)
            lags = np.arange(len(vals))
            return lags, vals
        return np.arange(nlags + 1), np.zeros(nlags + 1)

    @staticmethod
    def seasonal_decompose(series: Sequence[float], period: Optional[int] = None) -> Dict[str, pd.Series]:
        if tsa is None:
            raise RuntimeError('statsmodels required for seasonal_decompose')
        res = tsa.seasonal_decompose(pd.Series(series).dropna(), period=period, model='additive', extrapolate_trend='freq')
        return {'trend': res.trend, 'seasonal': res.seasonal, 'resid': res.resid}

    # ------------------------------ Spectral analysis ------------------------------
    @staticmethod
    def fourier(series: Sequence[float]) -> Dict[str, np.ndarray]:
        x = np.asarray(series)
        n = len(x)
        freqs = np.fft.rfftfreq(n)
        fft_vals = np.fft.rfft(x)
        psd = (np.abs(fft_vals) ** 2) / n
        return {'freqs': freqs, 'fft': fft_vals, 'psd': psd}

    # ------------------------------ Plotting utilities ------------------------------
    @staticmethod
    def _ensure_ax(ax: Optional[plt.Axes] = None) -> Tuple[plt.Figure, plt.Axes]:
        if ax is None:
            fig, ax = plt.subplots()
            return fig, ax
        return ax.figure, ax

    def plot_histogram(self, bins: int = 30, ax: Optional[plt.Axes] = None, kde: bool = False, **kwargs) -> plt.Axes:
        fig, ax = self._ensure_ax(ax)
        if sns is not None and kde:
            sns.histplot(self.data, bins=bins, kde=True, ax=ax, **kwargs)
        else:
            ax.hist(self.as_array(), bins=bins, **kwargs)
            if kde and gaussian_kde is not None:
                x = np.linspace(np.min(self.as_array()), np.max(self.as_array()), 200)
                kde_vals = gaussian_kde(self.as_array())(x)
                ax.plot(x, kde_vals * len(self.as_array()) * (x[1] - x[0]) * bins / 10)
        ax.set_title(f'Histogram - {self.name}')
        return ax

    def plot_box(self, ax: Optional[plt.Axes] = None, **kwargs) -> plt.Axes:
        fig, ax = self._ensure_ax(ax)
        ax.boxplot(self.as_array(), **kwargs)
        ax.set_title(f'Boxplot - {self.name}')
        return ax

    def plot_violin(self, ax: Optional[plt.Axes] = None, **kwargs) -> plt.Axes:
        if sns is None:
            return self.plot_box(ax=ax, **kwargs)
        fig, ax = self._ensure_ax(ax)
        sns.violinplot(x=self.data, ax=ax, **kwargs)
        ax.set_title(f'Violin - {self.name}')
        return ax

    def plot_kde(self, ax: Optional[plt.Axes] = None, **kwargs) -> plt.Axes:
        fig, ax = self._ensure_ax(ax)
        if sns is not None:
            sns.kdeplot(self.data, ax=ax, **kwargs)
        elif gaussian_kde is not None:
            x = np.linspace(np.min(self.as_array()), np.max(self.as_array()), 200)
            ax.plot(x, gaussian_kde(self.as_array())(x))
        else:
            raise RuntimeError('seaborn or scipy required for KDE plot')
        ax.set_title(f'KDE - {self.name}')
        return ax

    def plot_scatter(self, x: Sequence[float], y: Sequence[float], ax: Optional[plt.Axes] = None, label: Optional[str] = None, **kwargs) -> plt.Axes:
        fig, ax = self._ensure_ax(ax)
        ax.scatter(np.asarray(x), np.asarray(y), label=label, **kwargs)
        if label:
            ax.legend()
        ax.set_title('Scatter')
        return ax

    @staticmethod
    def plot_pairplot(df: pd.DataFrame, diag_kind: str = 'hist', **kwargs) -> plt.Figure:
        if sns is None:
            raise RuntimeError('seaborn required for pairplot')
        g = sns.pairplot(df, diag_kind=diag_kind, **kwargs)
        return g.fig

    def plot_time_series(self, times: Optional[Sequence[Any]] = None, ax: Optional[plt.Axes] = None, label: Optional[str] = None) -> plt.Axes:
        fig, ax = self._ensure_ax(ax)
        if times is None:
            ax.plot(self.as_array(), label=label)
        else:
            ax.plot(times, self.as_array(), label=label)
        if label:
            ax.legend()
        ax.set_title(f'Time Series - {self.name}')
        return ax

    @staticmethod
    def plot_multi(plots: List[Tuple[str, Any]], layout: Tuple[int, int] = (2, 2), figsize: Tuple[int, int] = (12, 8)) -> plt.Figure:
        """Plot a list of (title, plotting_function_or_tuple) into a grid.

        Each item in `plots` can be either:
        - ('title', callable(ax) -> None) where the callable draws into provided ax, or
        - ('title', ('hist', data, kwargs)) shorthand for built-in plot types.
        """
        rows, cols = layout
        fig, axes = plt.subplots(rows, cols, figsize=figsize)
        axes_flat = np.array(axes).flatten()
        for i, (title, plot_spec) in enumerate(plots):
            if i >= len(axes_flat):
                break
            ax = axes_flat[i]
            if callable(plot_spec):
                plot_spec(ax)
            elif isinstance(plot_spec, tuple):
                ptype = plot_spec[0]
                if ptype == 'hist':
                    ax.hist(np.asarray(plot_spec[1]), **(plot_spec[2] if len(plot_spec) > 2 else {}))
                elif ptype == 'line':
                    ax.plot(np.asarray(plot_spec[1]))
                elif ptype == 'scatter':
                    ax.scatter(np.asarray(plot_spec[1]), np.asarray(plot_spec[2]))
                elif ptype == 'box':
                    ax.boxplot(np.asarray(plot_spec[1]))
                else:
                    ax.text(0.5, 0.5, 'Unknown plot type', ha='center')
            else:
                ax.text(0.5, 0.5, 'Invalid plot spec', ha='center')
            ax.set_title(title)
        for j in range(i + 1, len(axes_flat)):
            axes_flat[j].set_visible(False)
        fig.tight_layout()
        return fig

    def plot_acf_pacf(self, nlags: int = 40) -> plt.Figure:
        if plot_acf is not None and plot_pacf is not None:
            fig, axes = plt.subplots(2, 1, figsize=(10, 6))
            plot_acf(self.as_array(), ax=axes[0], lags=nlags)
            plot_pacf(self.as_array(), ax=axes[1], lags=nlags)
            fig.tight_layout()
            return fig
        else:
            lags, acf_vals = self.acf(self.as_array(), nlags=nlags)
            lags_p, pacf_vals = self.pacf(self.as_array(), nlags=nlags)
            fig, axes = plt.subplots(2, 1, figsize=(10, 6))
            axes[0].bar(lags, acf_vals)
            axes[1].bar(lags_p, pacf_vals)
            axes[0].set_title('ACF')
            axes[1].set_title('PACF')
            fig.tight_layout()
            return fig
    def show(self):
        plt.show()
        
    def plot_spectrum(self) -> plt.Figure:
        sp = self.fourier(self.as_array())
        fig, ax = plt.subplots()
        ax.plot(sp['freqs'], sp['psd'])
        ax.set_title('Power Spectral Density')
        return fig
