from configs import *
import numpy as np
import pandas as pd
from pathlib import Path

def process_task_response_time(workflow_name, sampling_points=1000):
    """Write per-template response-time CDF bands for one workflow to CSV.

    For every template in ``FR_TEMPLATES`` and every repeat in ``REPEAT_IDS``
    the ``Response Time`` column of
    ``LOGS_DIR/fault_recovery/<workflow_name>/repeat<id>/<template>/csv-files/task_response_time.csv``
    is read once. Each repeat's empirical CDF is evaluated on a shared time
    axis, and the mean / max / min across repeats are written to
    ``FR_DATA_DIR/task_response_time/<workflow_name>.csv``.

    Args:
        workflow_name: Name of the workflow directory under ``fault_recovery``.
        sampling_points: Number of points on the shared time axis. Values
            ``<= 1`` produce a single point at the global maximum.

    Raises:
        OSError / ValueError: Propagated from ``pd.read_csv`` when an input
            file is missing or lacks the ``Response Time`` column.
    """
    output_path = FR_DATA_DIR / 'task_response_time' / f'{workflow_name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def load_response_times(template, repeat_id):
        """Return the valid response times of one run as a float array."""
        csv_path = (
            Path(LOGS_DIR) / "fault_recovery" / workflow_name
            / f"repeat{repeat_id}" / template
            / 'csv-files' / 'task_response_time.csv'
        )
        df = pd.read_csv(csv_path, usecols=["Response Time"], low_memory=False)
        # Unparseable / missing entries are dropped: a single NaN would turn
        # ``max()`` into NaN and corrupt the abscissa passed to ``np.interp``.
        cleaned = pd.to_numeric(df["Response Time"], errors="coerce").dropna()
        return cleaned.to_numpy(dtype=float)

    # Load every file exactly once; the arrays feed both the axis range and
    # the CDF computation below.
    samples = [
        (template, [load_response_times(template, repeat_id) for repeat_id in REPEAT_IDS])
        for template in FR_TEMPLATES
    ]

    global_max = 0.0
    for _, runs in samples:
        for values in runs:
            if values.size > 0:
                global_max = max(global_max, float(values.max()))

    if sampling_points <= 1:
        time_axis = np.array([global_max], dtype=float)
    elif global_max == 0.0:
        time_axis = np.zeros(sampling_points, dtype=float)
    else:
        # A private legacy generator yields the same draws as
        # ``np.random.seed(42)`` + ``np.random.uniform`` without resetting the
        # process-wide random state for the caller.
        rng = np.random.RandomState(42)
        inner_points = rng.uniform(0.0, global_max, sampling_points - 2)
        # Both end points are always part of the axis.
        time_axis = np.sort(np.concatenate(([0.0], inner_points, [global_max])))

    records = {"response-time": np.round(time_axis, 2)}

    for template, runs in samples:
        curves = []
        for values in runs:
            if values.size == 0:
                # A run without samples contributes a flat zero curve.
                curves.append(np.zeros_like(time_axis))
                continue
            ordered = np.sort(values)
            # i-th smallest sample maps to cumulative probability i / n.
            cdf = np.linspace(1 / len(ordered), 1.0, len(ordered))
            curves.append(np.interp(time_axis, ordered, cdf, left=0.0, right=1.0))

        curves = np.array(curves)
        records[f"{template}-CDF-mean"] = np.round(curves.mean(axis=0), 2)
        records[f"{template}-CDF-max"] = np.round(curves.max(axis=0), 2)
        records[f"{template}-CDF-min"] = np.round(curves.min(axis=0), 2)

    pd.DataFrame(records).to_csv(output_path, index=False)

def process_workflow_completion_time(workflow_name, sampling_points=1000):
    """Write per-template completion-time curves for one workflow to CSV.

    For every template in ``FR_TEMPLATES`` and every repeat in ``REPEAT_IDS``
    the ``Percentile`` / ``Completion Time`` columns of
    ``.../repeat<id>/<template>/csv-files/task_completion_percentiles.csv``
    are read and aligned on the integer percentiles 0..100. The mean, max and
    min across repeats are gap-filled, resampled onto ``sampling_points``
    evenly spaced percentages in [0, 100] and written to
    ``FR_DATA_DIR/workflow_completion_time/<workflow_name>.csv`` with one
    ``<template name>-mean/-max/-min`` column triple per template.

    Args:
        workflow_name: Name of the workflow directory under ``fault_recovery``.
        sampling_points: Number of percentage samples in the output.

    Raises:
        OSError / ValueError: Propagated from ``pd.read_csv`` when an input
            file is missing or lacks the required columns.
    """
    output_path = FR_DATA_DIR / 'workflow_completion_time' / f'{workflow_name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def load_elapsed_by_percentile(template_path):
        """Return elapsed time as a Series indexed by unique int percentile."""
        csv_path = Path(template_path) / 'csv-files' / 'task_completion_percentiles.csv'
        raw = pd.read_csv(
            csv_path,
            usecols=["Percentile", "Completion Time"],
            low_memory=False
        )
        percentile = pd.to_numeric(raw["Percentile"], errors="coerce")
        elapsed = pd.to_numeric(raw["Completion Time"], errors="coerce")
        # Rows without a usable percentile cannot be placed on the axis; left
        # in, several of them become duplicate labels and make reindex raise.
        valid = percentile.notna()
        frame = pd.DataFrame({
            # The Int64 cast still rejects non-integral percentiles; with the
            # NaN rows gone a plain int64 index matches the RangeIndex used
            # for alignment.
            "workflow_completion_percentage": percentile[valid].astype("Int64").astype("int64"),
            "elapsed_time": elapsed[valid].astype(float),
        })
        # Average repeated percentiles so the index is unique before reindexing.
        return frame.groupby("workflow_completion_percentage")["elapsed_time"].mean()

    all_outs = []
    full_index = pd.RangeIndex(0, 101, name="workflow_completion_percentage")
    percent_axis = np.linspace(0.0, 100.0, sampling_points)
    for template in FR_TEMPLATES:
        per_repeat = []
        for rid in REPEAT_IDS:
            template_path = Path(LOGS_DIR) / "fault_recovery" / workflow_name / f"repeat{rid}" / template
            per_repeat.append(load_elapsed_by_percentile(template_path).reindex(full_index))

        combined = pd.concat(per_repeat, axis=1)
        name = Path(template).name
        xp = combined.index.to_numpy(dtype=float)

        stats = {
            "mean": combined.mean(axis=1),
            "max": combined.max(axis=1),
            "min": combined.min(axis=1),
        }
        columns = {"workflow_completion_percentage": percent_axis}
        for label, series in stats.items():
            # Fill percentiles missing from every repeat: interpolate interior
            # gaps, then extend the nearest value to the edges.
            filled = series.astype(float).interpolate().ffill().bfill()
            columns[f"{name}-{label}"] = np.interp(percent_axis, xp, filled.to_numpy())

        all_outs.append(pd.DataFrame(columns).set_index("workflow_completion_percentage"))

    final = pd.concat(all_outs, axis=1).reset_index()
    final.to_csv(output_path, index=False)

def process_task_concurrency(workflow_name, sampling_points=1000, repeat_id=2):
    """Resample waiting/executing task counts of one repeat onto a shared axis.

    For every template in ``FR_TEMPLATES`` the file
    ``.../repeat<repeat_id>/<template>/csv-files/task_concurrency.csv`` is
    read once. All templates share one time axis of ``sampling_points`` evenly
    spaced values from 0 to the largest non-negative ``time`` found. The
    ``Waiting`` and ``Executing`` columns are linearly interpolated onto it
    (0 outside the recorded range) and written to
    ``FR_DATA_DIR/task_concurrency/<workflow_name>/<template name>.csv``.

    A file that is missing, unparseable or lacks a required column is treated
    as empty: it does not extend the time axis and produces all-zero output.

    Args:
        workflow_name: Name of the workflow directory under ``fault_recovery``.
        sampling_points: Number of samples on the time axis; nothing is
            written when this is ``<= 0``.
        repeat_id: Which ``repeat<id>`` run to process (default 2).
    """
    if sampling_points <= 0:
        return

    wanted = ["time", "Waiting", "Executing"]

    def load_frame(template):
        """Return the cleaned rows for one template, or an empty frame."""
        input_path = (
            Path(LOGS_DIR) / "fault_recovery" / workflow_name
            / f"repeat{repeat_id}" / template
            / 'csv-files' / 'task_concurrency.csv'
        )
        try:
            df = pd.read_csv(input_path, usecols=wanted, low_memory=False)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or empty/malformed CSV and
            # missing columns (ValueError subclasses): best-effort skip.
            return pd.DataFrame(columns=wanted, dtype=float)
        for column in wanted:
            df[column] = pd.to_numeric(df[column], errors="coerce")
        df = df.dropna(subset=["time"])
        return df[df["time"] >= 0]

    frames = [(template, load_frame(template)) for template in FR_TEMPLATES]

    tmax_global = 0.0
    for _, frame in frames:
        if not frame.empty:
            tmax_global = max(tmax_global, float(frame["time"].max()))

    if tmax_global == 0.0:
        time_axis = np.zeros(sampling_points, dtype=float)
    else:
        time_axis = np.linspace(0.0, tmax_global, sampling_points)

    for template, frame in frames:
        output_path = FR_DATA_DIR / 'task_concurrency' / workflow_name / f"{Path(template).name}.csv"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if frame.empty:
            waiting_axis = np.zeros_like(time_axis, dtype=float)
            executing_axis = np.zeros_like(time_axis, dtype=float)
        else:
            # Collapse duplicate timestamps so the abscissa passed to
            # np.interp is strictly increasing.
            grouped = (
                frame.groupby("time", as_index=False)[["Waiting", "Executing"]]
                     .mean()
                     .sort_values("time")
            )
            xp = grouped["time"].to_numpy()
            waiting_axis = np.interp(time_axis, xp, grouped["Waiting"].to_numpy(), left=0.0, right=0.0)
            executing_axis = np.interp(time_axis, xp, grouped["Executing"].to_numpy(), left=0.0, right=0.0)

        out = pd.DataFrame({
            "Time (s)": time_axis,
            "Waiting": waiting_axis,
            "Executing": executing_axis,
        })
        out.to_csv(output_path, index=False)

def process_waiting_recovery_tasks(workflow_name, sampling_points=1000, repeat_id=2):
    """Resample the waiting recovery-task counts of one repeat into one CSV.

    For every template in ``FR_TEMPLATES`` the file
    ``.../repeat<repeat_id>/<template>/csv-files/task_concurrency_recovery_only.csv``
    is read once. All templates share one time axis of ``sampling_points``
    evenly spaced values from 0 to the largest non-negative ``time`` found.
    Each template's ``Waiting`` column is linearly interpolated onto it
    (0 outside the recorded range) and stored under the template's name in
    ``FR_DATA_DIR/waiting_recovery_tasks/<workflow_name>.csv``.

    A file that is missing, unparseable or lacks a required column is treated
    as empty: it does not extend the time axis and yields an all-zero column.

    Args:
        workflow_name: Name of the workflow directory under ``fault_recovery``.
        sampling_points: Number of samples on the time axis; nothing is
            written when this is ``<= 0``.
        repeat_id: Which ``repeat<id>`` run to process (default 2).
    """
    if sampling_points <= 0:
        return

    wanted = ["time", "Waiting"]

    def load_frame(template):
        """Return the cleaned rows for one template, or an empty frame."""
        input_path = (
            Path(LOGS_DIR) / "fault_recovery" / workflow_name
            / f"repeat{repeat_id}" / template
            / 'csv-files' / 'task_concurrency_recovery_only.csv'
        )
        try:
            df = pd.read_csv(input_path, usecols=wanted, low_memory=False)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or empty/malformed CSV and
            # missing columns (ValueError subclasses): best-effort skip.
            return pd.DataFrame(columns=wanted, dtype=float)
        for column in wanted:
            df[column] = pd.to_numeric(df[column], errors="coerce")
        df = df.dropna(subset=["time"])
        return df[df["time"] >= 0]

    frames = [(template, load_frame(template)) for template in FR_TEMPLATES]

    tmax_global = 0.0
    for _, frame in frames:
        if not frame.empty:
            tmax_global = max(tmax_global, float(frame["time"].max()))

    if tmax_global == 0.0:
        time_axis = np.zeros(sampling_points, dtype=float)
    else:
        time_axis = np.linspace(0.0, tmax_global, sampling_points)

    series = {"Time (s)": time_axis}
    for template, frame in frames:
        name = Path(template).name
        if frame.empty:
            series[name] = np.zeros_like(time_axis, dtype=float)
            continue
        # Collapse duplicate timestamps so the abscissa passed to np.interp
        # is strictly increasing.
        grouped = frame.groupby("time", as_index=False)[["Waiting"]].mean().sort_values("time")
        series[name] = np.interp(
            time_axis,
            grouped["time"].to_numpy(),
            grouped["Waiting"].to_numpy(),
            left=0.0,
            right=0.0,
        )

    output_path = FR_DATA_DIR / 'waiting_recovery_tasks' / f'{workflow_name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(series).to_csv(output_path, index=False)

if __name__ == "__main__":
    # Export steps run for every workflow, in this order, at 100 samples each.
    # The response-time export is currently disabled; it was previously
    # invoked as process_task_response_time(workflow_name, sampling_points=1000).
    pipeline = (
        process_workflow_completion_time,
        process_task_concurrency,
        process_waiting_recovery_tasks,
    )
    for workflow_name in WORKFLOWS:
        for step in pipeline:
            step(workflow_name, sampling_points=100)
