from configs import *


def process_storage_csv(workflow_name, repeat_id=1):
    """Condense per-worker storage logs into one row per completion percent.

    For every template in ``FP_TEMPLATES`` this reads
    ``csv-files/worker_storage_consumption.csv`` from the selected repeat's
    log directory and writes the condensed table to
    ``FP_DATA_DIR/storage_consumption_per_worker/<workflow_name>/<template>.csv``.
    The output has a units row (``%`` / ``GB``) followed by one row for each
    integer percentage 0..100.

    Args:
        workflow_name: Name of the workflow sub-directory to read from and
            write to.
        repeat_id: Which ``repeat<N>`` log directory to read. Defaults to 1,
            the value that used to be hard-coded.

    Raises:
        FileNotFoundError: If an input CSV does not exist (from ``read_csv``).
        KeyError: If the input lacks ``workflow_completion_percentage``.
    """
    key = 'workflow_completion_percentage'
    # Every integer percentage the output must contain, in order.
    full_index = pd.RangeIndex(0, 101, name=key)

    for template in FP_TEMPLATES:
        template_path = Path(LOGS_DIR) / "fault_prevention" / workflow_name / f"repeat{repeat_id}" / template
        input_path = template_path / 'csv-files' / 'worker_storage_consumption.csv'
        output_path = FP_DATA_DIR / 'storage_consumption_per_worker' / workflow_name / f"{template_path.name}.csv"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        df = pd.read_csv(input_path, low_memory=False)

        # 1) drop time (a missing column is simply ignored)
        df = df.drop(columns=['time'], errors='ignore')

        # 2) integerize the completion percentage (floor); unparsable values
        #    become <NA> and are later excluded by groupby
        df[key] = np.floor(pd.to_numeric(df[key], errors='coerce')).astype('Int64')

        # 3) every remaining column is treated as a worker column and made numeric
        worker_cols = [c for c in df.columns if c != key]
        df[worker_cols] = df[worker_cols].apply(pd.to_numeric, errors='coerce')

        # 4) per-% max, except that a worker reporting 0 anywhere inside a
        #    percent bucket is recorded as 0 for that bucket
        maxvals = df.groupby(key)[worker_cols].max()
        has_zero = (df[worker_cols] == 0).groupby(df[key]).any()
        grouped = maxvals.mask(has_zero, 0)

        # 5) expand to 0..100; a missing cell is filled with the previous
        #    value only when valid data exists on BOTH sides, so leading and
        #    trailing gaps stay empty
        grouped = grouped.reindex(full_index)
        left = grouped[worker_cols].ffill()
        right = grouped[worker_cols].bfill()
        interior_gap = grouped[worker_cols].isna() & left.notna() & right.notna()
        grouped[worker_cols] = grouped[worker_cols].where(~interior_gap, left)

        grouped = grouped.reset_index()

        # 6) divide by 1024 -- the units row below labels the result GB, so
        #    the input is presumably MB (TODO confirm against the log writer)
        grouped[worker_cols] = (grouped[worker_cols] / 1024.0).round(4)

        # 7) prepend the units row and write; NaN cells are written as ''
        cols = [key] + worker_cols
        grouped = grouped[cols]
        units_df = pd.DataFrame([['%'] + ['GB'] * len(worker_cols)], columns=cols)

        final_df = pd.concat([units_df, grouped], ignore_index=True)
        final_df.to_csv(output_path, index=False, na_rep='')

def process_workflow_completion_percentiles(workflow_name):
    """Summarise completion-time percentiles across repeats, per template.

    For each template in ``FP_TEMPLATES`` the file
    ``csv-files/task_completion_percentiles.csv`` is read from every repeat in
    ``REPEAT_IDS``. The completion times are lined up by percentile and reduced
    to ``<template>-mean``, ``<template>-max`` and ``<template>-min`` columns.
    All templates are then joined side by side and written to
    ``FP_DATA_DIR/workflow_completion_percentiles/<workflow_name>.csv``.

    Args:
        workflow_name: Name of the workflow sub-directory to read; also the
            stem of the output file.
    """
    pct_col = "workflow_completion_percentage"
    time_col = "elapsed_time"

    output_path = FP_DATA_DIR / 'workflow_completion_percentiles' / f'{workflow_name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def load_single_df(template_path):
        """Read one run's percentile table using the unified column names."""
        csv_path = Path(template_path) / 'csv-files' / 'task_completion_percentiles.csv'
        raw = pd.read_csv(
            csv_path,
            usecols=["Percentile", "Completion Time"],
            low_memory=False
        )
        # Unparsable cells are coerced to missing values rather than raising.
        return pd.DataFrame({
            pct_col: pd.to_numeric(raw["Percentile"], errors="coerce").astype("Int64"),
            time_col: pd.to_numeric(raw["Completion Time"], errors="coerce"),
        })

    per_template = []
    for template in FP_TEMPLATES:
        # One elapsed-time series per repeat, indexed by percentile.
        runs = [
            load_single_df(
                Path(LOGS_DIR) / "fault_prevention" / workflow_name / f"repeat{rid}" / template
            ).set_index(pct_col)[time_col]
            for rid in REPEAT_IDS
        ]
        side_by_side = pd.concat(runs, axis=1)

        label = Path(template).name
        stats = pd.DataFrame({
            f"{label}-mean": side_by_side.mean(axis=1),
            f"{label}-max": side_by_side.max(axis=1),
            f"{label}-min": side_by_side.min(axis=1),
        })
        # Name the index explicitly so the final reset_index() yields the
        # percentile column under the expected header.
        per_template.append(stats.rename_axis(pct_col))

    pd.concat(per_template, axis=1).reset_index().to_csv(output_path, index=False)

def process_file_activation_time(workflow_name):
    """Build cumulative file-activation curves and summarise them per template.

    For each template in ``FP_TEMPLATES`` and each repeat in ``REPEAT_IDS``,
    ``csv-files/file_replica_activation_intervals.csv`` is turned into a
    (time, percent) curve with both values floored to integers. The repeats of
    a template are aligned on the union of their time points and reduced to
    ``<template>-mean/-max/-min``. All templates are then aligned on one global
    time axis and written to
    ``FP_DATA_DIR/file_activation_time/<workflow_name>.csv``.

    Args:
        workflow_name: Name of the workflow sub-directory to read; also the
            stem of the output file.
    """
    output_path = FP_DATA_DIR / 'file_activation_time' / f'{workflow_name}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    def load_single_df(template_path):
        """Return one run's curve as a frame with ``time`` and ``percent``."""
        csv_path = Path(template_path) / 'csv-files' / 'file_replica_activation_intervals.csv'
        raw = pd.read_csv(
            csv_path,
            usecols=["time_activation"],
            low_memory=False
        )

        activations = pd.to_numeric(raw["time_activation"], errors="coerce").dropna().to_numpy()
        total = activations.size
        if total == 0:
            return pd.DataFrame({"time": [], "percent": []})

        # np.unique returns the distinct timestamps already sorted ascending.
        stamps, counts = np.unique(activations, return_counts=True)
        curve = pd.DataFrame({
            "time": stamps,
            "percent": np.cumsum(counts) / total * 100.0,
        })

        # Collapse to whole time units, keeping the highest percent in each.
        curve["time"] = np.floor(curve["time"]).astype(int)
        curve = curve.groupby("time", as_index=False, sort=True)["percent"].max()

        # Collapse to whole percents, keeping one time stamp per percent.
        curve["percent"] = np.floor(curve["percent"].clip(lower=0.0, upper=100.0)).astype(int)
        # NOTE(review): each percent level keeps the LATEST time at which it
        # was observed; with the forward-fill applied by the caller this
        # delays every step -- confirm this is intended rather than the
        # earliest time.
        per_percent = curve.groupby("percent", as_index=False)["time"].max()
        return per_percent.sort_values("time")[["time", "percent"]]

    def align_to_grid(run, grid):
        """Sample one run on ``grid``; values before its first point are 0."""
        if run.empty:
            return pd.Series(data=np.zeros_like(grid, dtype=float), index=grid)
        return run.set_index("time")["percent"].reindex(grid, method="ffill").fillna(0.0)

    per_template = []
    for template in FP_TEMPLATES:
        runs = [
            load_single_df(
                Path(LOGS_DIR) / "fault_prevention" / workflow_name / f"repeat{rid}" / template
            )
            for rid in REPEAT_IDS
        ]
        grids = [run["time"].to_numpy() for run in runs if not run.empty]
        if not grids:
            # No repeat of this template produced any data: skip it entirely.
            continue
        time_grid = np.unique(np.concatenate(grids))

        combined = pd.concat([align_to_grid(run, time_grid) for run in runs], axis=1)
        label = Path(template).name
        stats = pd.DataFrame({
            f"{label}-mean": combined.mean(axis=1),
            f"{label}-max": combined.max(axis=1),
            f"{label}-min": combined.min(axis=1),
        })
        stats = np.floor(stats.clip(lower=0.0, upper=100.0)).astype(int)
        per_template.append(stats.rename_axis("time"))

    if not per_template:
        # Nothing to report: still emit a file containing just the header.
        pd.DataFrame({"time": []}).to_csv(output_path, index=False)
        return

    # Re-sample every template on the union of all time points. The target is
    # kept as a plain ndarray because an ndarray has no name to override the
    # "time" index name.
    global_time = np.unique(np.concatenate([frame.index.to_numpy() for frame in per_template]))
    aligned = [
        np.floor(
            frame.reindex(global_time).ffill().fillna(0.0).clip(lower=0.0, upper=100.0)
        ).astype(int)
        for frame in per_template
    ]
    pd.concat(aligned, axis=1).reset_index().to_csv(output_path, index=False)


if __name__ == "__main__":
    # Script entry point: run the enabled processing step for every workflow
    # in WORKFLOWS (not defined in this file; presumably supplied by the
    # `configs` star import -- verify).
    for workflow_name in WORKFLOWS:
        # The storage and completion-percentile steps are currently disabled;
        # uncomment the corresponding call to regenerate their outputs.
        # process_storage_csv(workflow_name)
        # process_workflow_completion_percentiles(workflow_name)
        process_file_activation_time(workflow_name)