# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/monte_carlo_shapley.ipynb (unless otherwise specified).

__all__ = ['MonteCarloShapley', 'MonteCarloShapleyBatch']

# Cell
# Author: Simon Grah <simon.grah@thalesgroup.com>
#         Vincent Thouvenot <vincent.thouvenot@thalesgroup.com>

# MIT License

# Copyright (c) 2020 Thales Six GTS France

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Cell
import numpy as np
import pandas as pd
from tqdm import tqdm

# Cell
def MonteCarloShapley(x, fc, ref, n_iter, callback=None):
    """
    Estimate the Shapley Values using an optimized Monte Carlo version.

    At every iteration a random permutation of the features is drawn; starting
    from a reference, the features of ``x`` are switched on one by one in that
    order and the change of reward at each step is stored as one sample of the
    corresponding Shapley value. The estimate is the mean over iterations.
    A tqdm progress bar is displayed over the iterations.

    Parameters
    ----------
    x : pandas.Series
        Individual to explain; its index provides the feature names.
    fc : callable
        Reward function. It is called with a 1-D numpy array holding one
        value per feature (same order as ``x.index``) and is expected to
        return a number.
    ref : pandas.Series or pandas.DataFrame
        Either a single reference (a Series or a one-row DataFrame) or a
        dataset of references (DataFrame with several rows), in which case
        one row is drawn at random at each iteration. Features are read
        positionally, in the same order as in ``x``.
    n_iter : int
        Number of Monte Carlo iterations (sampled permutations).
    callback : callable, optional
        If given, called after each iteration with the running estimate, a
        Series indexed by all the feature names.

    Returns
    -------
    pandas.Series
        Estimated Shapley Value of each feature, indexed like ``x``.

    Raises
    ------
    TypeError
        If ``ref`` is neither a pandas Series nor a pandas DataFrame.
    """

    # Get general information
    feature_names = list(x.index)
    d = len(feature_names) # dimension
    x_values = np.asarray(x.values)

    # A one-row dataset of references is handled as an individual reference
    if isinstance(ref, pd.DataFrame) and ref.shape[0] == 1:
        ref = ref.iloc[0]

    if isinstance(ref, pd.Series):
        individual_ref = True
        # Work in a dtype able to hold both x and ref, so that e.g. an
        # integer reference does not truncate the float values of x.
        ref_values = np.asarray(ref.values)
        common_dtype = np.result_type(x_values.dtype, ref_values.dtype)
        ref_values = ref_values.astype(common_dtype)
        f_r = fc(ref_values)
        # If x[j] = r[j] => Φ[j] = 0 and we can reduce the dimension
        positions = np.flatnonzero(np.asarray(x != ref, dtype=bool))
        sub_d = len(positions)
        if sub_d == d:
            reward = fc
        else:
            print("new dimension {0}".format(sub_d))
            def reward(z):
                # Rebuild the full individual: the reference everywhere,
                # except on the features where x and ref differ.
                z_full = ref_values.copy()
                z_full[positions] = z
                return fc(z_full)
    elif isinstance(ref, pd.DataFrame):
        individual_ref = False
        n_ref = len(ref)
        # Convert the dataset once instead of at every iteration
        all_refs = np.asarray(ref.values)
        common_dtype = np.result_type(x_values.dtype, all_refs.dtype)
        all_refs = all_refs.astype(common_dtype, copy=False)
        positions = np.arange(d)
        sub_d = d
        reward = fc
    else:
        raise TypeError(
            "ref must be a pandas Series or DataFrame, got {0}".format(
                type(ref).__name__))

    # Values of x restricted to the features that are actually explored
    x_sub = x_values[positions]

    def to_full_series(values):
        # Features identical in x and ref keep a Shapley Value of zero
        full = np.zeros(d)
        full[positions] = values
        return pd.Series(full, index=feature_names)

    # Store all Shapley Values in a numpy array
    Φ_storage = np.empty((n_iter, sub_d))

    # Monte Carlo loop
    for m in tqdm(range(1, n_iter+1)):
        # Sample a random permutation order
        o = np.random.permutation(sub_d)
        # initiate useful variables for this iteration
        # if several references select at random one new ref at each iter
        if individual_ref:
            f_less_j = f_r
            x_plus_j = ref_values[positions]  # fancy indexing => fresh copy
        else:
            r_cp = all_refs[np.random.choice(n_ref, size=1)[0], :].copy()
            f_less_j = fc(r_cp)
            x_plus_j = r_cp.copy()
        # iterate through the permutation of features
        for j in o:
            x_plus_j[j] = x_sub[j]
            f_plus_j = reward(x_plus_j)
            # marginal contribution of feature j in this permutation
            Φ_storage[m-1, j] = f_plus_j - f_less_j
            # reassign f_less_j
            f_less_j = f_plus_j
        if callback:
            # Running estimate over the first m iterations, full dimension
            callback(to_full_series(np.mean(Φ_storage[:m, :], axis=0)))

    return to_full_series(np.mean(Φ_storage, axis=0))

# Cell
def MonteCarloShapleyBatch(x, fc, ref, n_iter):
    """
    Estimate the Shapley Values using an optimized Monte Carlo version in Batch mode.

    All the hybrid individuals (reference progressively turned into ``x``
    along random permutations of the features) are built first and stored in
    one matrix, so that the reward function is called only once.

    Parameters
    ----------
    x : pandas.Series
        Individual to explain; its index provides the feature names.
    fc : callable
        Reward function. It is called once with a 2-D float numpy array of
        shape ``(n_iter * (d + 1), d)`` (one hybrid individual per row) and
        must return one reward per row.
    ref : pandas.Series or pandas.DataFrame
        Either a single reference (a Series or a one-row DataFrame) or a
        dataset of references (DataFrame with several rows), in which case
        one row is drawn at random at each iteration. Features are read
        positionally, in the same order as in ``x``.
    n_iter : int
        Number of Monte Carlo iterations (sampled permutations).

    Returns
    -------
    pandas.Series
        Estimated Shapley Value of each feature, indexed like ``x``.

    Raises
    ------
    TypeError
        If ``ref`` is neither a pandas Series nor a pandas DataFrame.
    RuntimeError
        If ``fc`` raises when called on the matrix of hybrid individuals
        (the original exception is chained as the cause).
    ValueError
        If ``fc`` does not return exactly one reward per row.
    """

    # Get general information
    feature_names = list(x.index)
    dimension = len(feature_names)
    x_values = x.values

    # A one-row dataset of references is handled as an individual reference
    if isinstance(ref, pd.DataFrame) and ref.shape[0] == 1:
        ref = ref.iloc[0]

    if isinstance(ref, pd.Series):
        individual_ref = True
    elif isinstance(ref, pd.DataFrame):
        individual_ref = False
        n_ref = len(ref)
    else:
        raise TypeError(
            "ref must be a pandas Series or DataFrame, got {0}".format(
                type(ref).__name__))

    # The matrix of hybrid individuals is float: cast the reference(s) once
    # so that assigning float values of x into an integer reference does not
    # truncate them.
    ref_values = np.asarray(ref.values, dtype=float)

    # Compute the matrix X of hybrid individuals between x and ref
    # and keep trace of permutation orders
    rows_per_iter = dimension + 1  # the reference + one row per feature
    array_of_hybrid_individuals = np.zeros(shape=(n_iter * rows_per_iter, dimension))
    orders = np.zeros(shape=(n_iter * rows_per_iter,), dtype=int)

    for iter_monte_carlo in range(n_iter):
        order = np.random.permutation(dimension)
        if individual_ref:
            hybrid_individual = ref_values.copy()
        else:
            hybrid_individual = ref_values[np.random.choice(n_ref, size=1)[0], :].copy()
        first_row = iter_monte_carlo * rows_per_iter
        array_of_hybrid_individuals[first_row, :] = hybrid_individual
        # -1 marks the pure reference row: the reward difference ending on
        # it spans two iterations and must not be credited to any feature
        orders[first_row] = -1
        for iter_order, idx_feature in enumerate(order, start=1):
            hybrid_individual[idx_feature] = x_values[idx_feature]
            array_of_hybrid_individuals[first_row + iter_order, :] = hybrid_individual
            orders[first_row + iter_order] = idx_feature

    try:
        rewards = fc(array_of_hybrid_individuals)
    except Exception as e:
        # Raise instead of calling exit(), which would terminate the whole
        # interpreter (or notebook kernel) from inside library code.
        raise RuntimeError(
            "Your function fc should be able to handle a data set of inputs"
        ) from e

    # Accept flat outputs as well as column-shaped (n, 1) outputs
    rewards = np.asarray(rewards, dtype=float).reshape(-1)
    if rewards.shape[0] != array_of_hybrid_individuals.shape[0]:
        raise ValueError(
            "fc must return one reward per input row: expected {0}, got {1}".format(
                array_of_hybrid_individuals.shape[0], rewards.shape[0]))

    # rewards_diff[i] = rewards[i+1] - rewards[i] is the marginal contribution
    # of the feature switched on at row i+1, i.e. orders[i+1]
    rewards_diff = np.diff(rewards)
    switched_feature = orders[1:]

    # Fill positionally: feature labels may themselves be integers
    shap_values = np.zeros(dimension)
    for idx_feature in range(dimension):
        shap_values[idx_feature] = np.mean(rewards_diff[switched_feature == idx_feature])

    return pd.Series(shap_values, index=feature_names)