# AUTOGENERATED! DO NOT EDIT! File to edit: notebooks/retraitement_erfs-fpr/modules/reduction_variables.ipynb (unless otherwise specified).

__all__ = ["remove_useless_variables"]

# Cell

# Imports pour le module
import pandas as pd
from openfisca_france import FranceTaxBenefitSystem

from .toolbase import convert_erfr_to_openfisca, create_simulation

# Cell

# Tester les variables qui sont inutiles pour notre calcul (IR et Socio-fisca)
# Code repris de agg_pop.py > test_useless_variables de leximpact-server

# TODO : il faut traiter le cas où les variables sont en foyers fiscaux et non en individus
def _count_differences(simulation, variable, period, reference):
    """Count the rows where `variable`, recomputed by `simulation`, differs from `reference`."""
    # Building the Series on the reference index raises ValueError on a length
    # mismatch, exactly like assigning the array to a DataFrame column would.
    recomputed = pd.Series(
        simulation.calculate(variable, period), index=reference.index
    )
    return int((recomputed != reference).sum())


def remove_useless_variables(
    input_h5,
    outfile_path,
    name_variables,
    PERIOD,
):
    """Detect input columns that do not affect the requested variables and export the data without them.

    A reference simulation is built from the full input data. Each input column
    (except ``wprm``) is then dropped in turn and the variables in
    ``name_variables`` are recomputed: a column is flagged as useless when none
    of the recomputed variables differs from the reference. Finally, all
    flagged columns are dropped at once and the result is checked again before
    the reduced data is written to disk.

    Args:
        input_h5: Path of the input file, passed to ``convert_erfr_to_openfisca``.
            Must be a ``str`` when ``outfile_path`` is ``None``.
        outfile_path: Destination of the reduced HDF file. When ``None``, it is
            derived from ``input_h5`` by replacing ``".h5"`` with ``"_useful.h5"``.
        name_variables: Iterable of variable names to compute with the simulation.
        PERIOD: Period passed to ``create_simulation`` and ``Simulation.calculate``.

    Returns:
        The list of column names considered useless. The identifier columns
        ``quifoy``, ``quifam``, ``quimen``, ``idfam``, ``idfoy`` and ``idmen``
        are never part of it. The list is returned even when the final
        "remove everything at once" check fails (nothing is written then).

    Note:
        The reference frame is taken from the ``"foyer_fiscal"`` group of
        ``create_simulation``'s second return value. According to the original
        author's note, the computation only works for variables defined on
        individuals — TODO confirm and handle tax-household variables.
    """
    # Initialisation
    # Kept from the original implementation: this disables pandas'
    # chained-assignment warning for the whole process, not only this call.
    pd.options.mode.chained_assignment = None
    list_useless_variables = []
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = convert_erfr_to_openfisca(input_h5)
    simulation_base_deciles, dictionnaire_datagrouped = create_simulation(
        PERIOD, DUMMY_DATA, TBS
    )
    # Work on an explicit copy so that adding the reference columns never
    # touches (or warns about) the frame owned by `dictionnaire_datagrouped`.
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]].copy()
    for nv in name_variables:
        # Reference values computed with every input column present.
        df[f"{nv}_base"] = simulation_base_deciles.calculate(nv, PERIOD)

    for col in DUMMY_DATA.columns:
        if col == "wprm":  # we don't want to remove this one
            continue
        data_wo_column = DUMMY_DATA[[k for k in DUMMY_DATA.columns if k != col]]
        try:
            newsim, _ = create_simulation(PERIOD, data_wo_column, TBS)
            countdifs = [
                _count_differences(newsim, nv, PERIOD, df[f"{nv}_base"])
                for nv in name_variables
            ]
        except Exception:
            # Best effort: if the simulation cannot even run without this
            # column, the column is required.
            print(col, "is definitely not useless")
            continue
        isdif = any(countdifs)
        if not isdif:
            list_useless_variables.append(col)
        print(
            col,
            "is",
            "not" if isdif else "",
            "useless",
            f"{countdifs}" if isdif else "",
        )

    # Force-keep the identifier columns that are needed later on.
    to_keep = ["quifoy", "quifam", "quimen", "idfam", "idfoy", "idmen"]
    for item in to_keep:
        if item in list_useless_variables:
            list_useless_variables.remove(item)

    # Drop every useless column at once and verify that the results still
    # match the reference: columns that are individually useless are not
    # guaranteed to be jointly useless.
    data_wo_useless = DUMMY_DATA[
        [k for k in DUMMY_DATA.columns if k not in list_useless_variables]
    ]
    newsim, _ = create_simulation(PERIOD, data_wo_useless, TBS)
    isdif = any(
        _count_differences(newsim, nv, PERIOD, df[f"{nv}_base"])
        for nv in name_variables
    )
    if isdif:
        print("Removing all variables at once didn't work, good luck with that")
    else:
        if outfile_path is None:
            outfile_path = input_h5.replace(".h5", "_useful.h5")
        data_wo_useless.to_hdf(outfile_path, key="input")
        print(
            f"It seems lots of columns don't do anything. Data with only useful columns was exported to {outfile_path}"
        )
    return list_useless_variables
