# AUTOGENERATED! DO NOT EDIT! File to edit: ../notebooks/aggregates_from_dads.ipynb.

# %% auto 0
# Public names of this module: the ones bound by `from <module> import *`.
__all__ = [
    "config",
    "tc",
    "extraction_date",
    "csv_dads_extractions_folder",
    "calib_copulas_and_quantile_folder",
    "of_vars",
    "df_dgfip",
    "metadata",
    "ref_dads_2020",
    "perimeter",
    "year",
    "liste_des_variables_pepa_2020",
    "dads",
    "aggregates_from_csv",
    "dads_var_to_dict",
]

# %% ../notebooks/aggregates_from_dads.ipynb 5
import time
import unittest

# import numpy as np
import pandas as pd
from leximpact_socio_fisca_simu_etat.config import Configuration

from leximpact_aggregates.aggregate import (
    AggregateManager,
    Perimeter,
    Reference,
    openfisca_variables,
)

# from typing import List, Union


# from leximpact_socio_fisca_simu_etat.logger import logger
# from ruamel.yaml import YAML
# from tqdm import tqdm


# Project configuration object for the "leximpact-aggregates" project folder.
config = Configuration(project_folder="leximpact-aggregates")
# Stand-alone TestCase instance; presumably kept for its assert* helpers in the
# source notebook — it is not used by the code shown in this file.
tc = unittest.TestCase()


# yaml = YAML()  # typ='unsafe' for testing

# Local date at import time, formatted as YYYY-MM-DD.
extraction_date = time.strftime("%Y-%m-%d")

# %% ../notebooks/aggregates_from_dads.ipynb 6
# Folder holding the CSV files extracted from the DADS data; the file
# `agregats_DADS_2020.csv` is read from it further down.
csv_dads_extractions_folder = "/mnt/data-in/casd_extract/dads/all_data"
# Folder for calibration data (copulas and quantiles, judging by its name);
# it is not read by the code shown in this file.
calib_copulas_and_quantile_folder = "/mnt/data-out/leximpact/casd-anonimyzed/dads/"
# dads_quantiles = config.get("DATASETS") + "20220530-ExtractQuantiles/data/"

# %% ../notebooks/aggregates_from_dads.ipynb 12
# Short alias for the OpenFisca variables mapping imported from
# `leximpact_aggregates.aggregate`.
of_vars = openfisca_variables

# %% ../notebooks/aggregates_from_dads.ipynb 17
# Read the description file of the DADS variables: semicolon-separated, with
# the first 8 lines skipped and only the first three columns loaded.
df_dgfip = pd.read_csv(
    "Base_Tous_salaries_fichier_Postes-2019.csv",
    sep=";",
    skiprows=8,
    usecols=[0, 1, 2],
)
# Discard the "Modalités" column, then give the two remaining columns the
# names that `dads_var_to_dict` reads.
df_dgfip = df_dgfip.drop(columns=["Modalités"])
df_dgfip.columns = ["Variable", "Libellé"]
df_dgfip.head(2)

# %% ../notebooks/aggregates_from_dads.ipynb 20
# Per-variable metadata, keyed by lower-cased DADS variable name; filled by
# `dads_var_to_dict` and completed by hand further down.
metadata = {}


def dads_var_to_dict(row):
    """Register one row of the DADS description file in the global `metadata`.

    The lower-cased DADS variable name is used as the key. When the variable
    is known to OpenFisca (the `of_vars` mapping), either under the same name
    or under the name obtained by replacing its first character with "f", the
    entry records that OpenFisca name and, when OpenFisca provides a non-empty
    label, uses it as `ux_name` and `description`. Otherwise the entry falls
    back to the DADS label ("Libellé").

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            "Variable" and "Libellé".

    Returns:
        The row, unchanged, so the function can be given to
        `DataFrame.apply(..., axis=1)`.
    """
    dads_var = row["Variable"].lower()
    # Candidate OpenFisca names, in priority order: the DADS name itself, then
    # the same name with "f" as its first character.
    # NOTE(review): this was documented as a "z" -> "f" swap, but the first
    # character is replaced whatever it is — confirm the intent.
    for of_name in (dads_var, "f" + dads_var[1:]):
        of_var = of_vars.get(of_name)
        if not of_var:
            continue
        entry = {"openfisca_variable": of_name}
        # An OpenFisca variable may come without a label: guard the lookup for
        # both candidates instead of raising KeyError on an exact-name match.
        label = of_var.get("label")
        if label:
            entry["ux_name"] = label
            entry["description"] = label
        metadata[dads_var] = entry
        return row
    # Unknown to OpenFisca: describe the variable with the DADS label.
    metadata[dads_var] = {
        "ux_name": row["Libellé"],
        "description": row["Libellé"],
    }
    return row


# Run `dads_var_to_dict` on every row to fill `metadata`; the object returned
# by `apply` is discarded.
_ = df_dgfip.apply(dads_var_to_dict, axis=1)

# %% ../notebooks/aggregates_from_dads.ipynb 25
# Source reference passed along with the aggregates built for year "2020".
ref_dads_2020 = Reference(
    title="DADS 2020 (DGFIP)",
    href="https://www.casd.eu/source/base-tous-salaries-fichier-postes/",
)

# Perimeter passed along with every aggregate: entity "individu", yearly
# period, whole of France.
perimeter = Perimeter(entity="individu", period="year", geographic="France entière")

# %% ../notebooks/aggregates_from_dads.ipynb 26
# Hand-written entry for the "mat" variable (marital status of the tax
# household), mapped to the OpenFisca variable `statut_marital`.
metadata["mat"] = {
    "description": "Situation matrimoniale du foyer fiscal",
    "openfisca_variable": "statut_marital",
    "ux_template": "Parmi les foyers français {value} sont {label}.",
}


# Year of the processed data, kept as a string.
# NOTE(review): the loop that builds `aggregates_from_csv` rebinds `year` as
# its loop variable.
year = "2020"

# %% ../notebooks/aggregates_from_dads.ipynb 36
# Display settings: column contents shown up to 80 characters, floats rendered
# with thousands separators and 7 decimals.
pd.options.display.max_colwidth = 80
pd.set_option("display.float_format", "{:,.7f}".format)

# Aggregates file for 2020, read from the DADS extraction folder.
liste_des_variables_pepa_2020 = pd.read_csv(
    f"{csv_dads_extractions_folder}/agregats_DADS_2020.csv"
)


# DataFrames to process, grouped by year.
dads = {"2020": [liste_des_variables_pepa_2020]}

# %% ../notebooks/aggregates_from_dads.ipynb 39
# Reference to attach to the aggregates of each year. An explicit mapping makes
# a year without a declared reference fail loudly, instead of reusing the
# previous year's reference or raising a NameError on an unbound `ref`.
_reference_by_year = {"2020": ref_dads_2020}

aggregates_from_csv = []
for year, dfs in dads.items():
    if year not in _reference_by_year:
        raise KeyError(f"No Reference declared for DADS year {year!r}")
    ref = _reference_by_year[year]
    for df in dfs:
        # Lower-case the aggregate names in place, presumably so that they
        # match the lower-cased keys of `metadata` — TODO confirm.
        df["name"] = df["name"].str.lower()
        # `get_aggregats_from_row` is called once per row with the extra
        # positional arguments below. It is expected to append to
        # `aggregates_from_csv` (TODO confirm); the object returned by
        # `apply` is discarded.
        _ = df.apply(
            AggregateManager.get_aggregats_from_row,
            args=(
                aggregates_from_csv,
                year,
                df.columns.to_list(),
                ref,
                metadata,
                perimeter,
            ),
            axis=1,
        )
# Leftover notebook display of the last aggregate: it has no effect here, but
# raises IndexError when no aggregate was collected.
aggregates_from_csv[-1]
