# freq_mob/freq_mob.py
# version 0.1
# import freq_mob

import tabulate, numpy
from scipy.stats import spearmanr
from sklearn.isotonic import IsotonicRegression as isoreg
from sklearn.cluster import KMeans as kmeans
from sklearn import metrics 
from lightgbm import LGBMRegressor as gbmreg

########## 01. qcut() ##########

def qcut(x, n):
  """
  It is a utility function to discretize a numeric vector into n pieces based on quantiles.
  Parameters:
    x : A numeric vector. NaN entries are dropped before the percentiles are taken.
    n : An integer indicating the number of categories to discretize.
  Returns:
    A list of the distinct interior percentile values, in ascending order, to
    divide the vector x into at most n categories.
  """

  # n evenly spaced percentile levels starting at 0; the leading 0 is dropped so
  # that only the n - 1 interior levels remain.
  pct = numpy.linspace(0, 100, n, endpoint = False)[1:]
  obs = [v for v in x if not numpy.isnan(v)]

  # method = "lower" returns observed values instead of interpolated ones, and
  # numpy.unique collapses levels that land on the same value.
  pts = numpy.percentile(obs, pct, method = "lower")
  return(list(numpy.unique(pts)))

########## 02. manual_bin() ##########

def manual_bin(x, y, cuts):
  """
  It is a utility function to discretize the x vector and summarize
  over the y vector based on the discretization result.
  Parameters:
    x    : A numeric vector to discretize without missing values
    y    : A numeric vector with the same length of x
    cuts : A list of numeric values as cut points to discretize x.
           Duplicates are ignored and the order does not matter.
  Returns:
    A list of dictionaries for the binning outcome, sorted by "bin", one per
    non-empty bin, with the keys:
      "bin"  : bin number, where bin k holds cut[k - 1] < x <= cut[k]
      "freq" : number of records in the bin
      "miss" : always 0, since x has no missing values here
      "ysum" : sum of y in the bin
      "minx" : smallest x in the bin
      "maxx" : largest x in the bin
  """

  _x = list(x)
  _y = list(y)

  # numpy.NINF / numpy.PINF were removed in NumPy 2.0; -numpy.inf / numpy.inf
  # work on every version.
  _c = sorted(list(set(cuts)) + [-numpy.inf, numpy.inf])

  # With the default side = "left", a value equal to a cut point falls into the
  # bin on the left of that cut.
  _g = numpy.searchsorted(_c, _x).tolist()

  # Group the records by bin in a single pass, keeping the input order within
  # each bin so the sums are accumulated in the same order as the input.
  _grp = {}
  for _b, _xv, _yv in zip(_g, _x, _y):
    _grp.setdefault(_b, []).append((_xv, _yv))

  return([{"bin": _b,
           "freq": len(_grp[_b]),
           "miss": 0,
           "ysum": sum([_[1] for _ in _grp[_b]]),
           "minx": min([_[0] for _ in _grp[_b]]),
           "maxx": max([_[0] for _ in _grp[_b]])} for _b in sorted(_grp)])

########## 03. miss_bin() ##########

def miss_bin(y):
  """
  It is a utility function to summarize the y vector as a missing-value bin.
  Parameters:
    y : A numeric vector. Any iterable works, including a generator.
  Returns:
    A dictionary with "bin" set to 0, "freq" and "miss" both set to the number
    of values in y, "ysum" set to their sum, and "minx" / "maxx" set to NaN.
  """

  # Materialize once: a single-pass iterable would otherwise be exhausted by
  # the first count and leave the later count and sum at zero.
  _y = list(y)

  return({"bin": 0, "freq": len(_y), "miss": len(_y),
          "ysum": sum(_y), "minx": numpy.nan, "maxx": numpy.nan})

########## 04. add_miss() ##########

def add_miss(d, l):
  """
  It is a utility function to append the missing value category, if any, to the binning outcome.
  Parameters:
    d : A list of (x, y, flag) records built from the input vectors of the
        binning functions, where flag == 0 marks a record with missing x.
    l : A list of dicts with the binning outcome of the non-missing records.
  Returns:
    A new list of dicts. The dicts in l are copied, so l is left untouched.
    When the missing records have a zero sum of y, their "freq", "miss" and
    "ysum" are added to the first dict; otherwise a separate bin 0 dict
    is appended.
  """

  # Copy every row: the merge below updates the first row in place, and a
  # shallow copy of the list alone would leak that update into the caller's l.
  _l = [dict(_) for _ in l]

  _y = [_[1] for _ in d if _[2] == 0]

  if len(_y) > 0:
    _m = miss_bin(_y)
    if _m["ysum"] == 0:
      # NOTE(review): the binning functions in this file pass l sorted by
      # ascending ysum / freq, so the first row is the lowest-rate bin.
      for _ in ['freq', 'miss', 'ysum']:
        _l[0][_] = _l[0][_] + _m[_]
    else:
      _l.append(_m)

  return(_l)

########## 05. gen_newx() ##########

def gen_newx(x):
  """
  It is a utility function to generate the variable transformation based on the binning outcome.
  Parameters:
    x : A list of dictionaries for the binning outcome, each with at least
        the keys "bin", "freq" and "ysum".
  Returns:
    A list of new dictionaries, sorted by "bin", with two additional keys:
      "yavg" : ysum / freq of the bin, rounded to 8 decimals
      "newx" : log of the bin's share of the total ysum over its share of the
               total freq, rounded to 8 decimals
  """

  total_freq = sum(row["freq"] for row in x)
  total_ysum = sum(row["ysum"] for row in x)

  out = []
  for row in x:
    rate = row["ysum"] / row["freq"]
    lift = (row["ysum"] / total_ysum) / (row["freq"] / total_freq)
    out.append({**row,
                "yavg": round(rate, 8),
                "newx": round(numpy.log(lift), 8)})

  out.sort(key = lambda row: row["bin"])
  return(out)

########## 06. gen_rule() ##########

def gen_rule(tbl, pts):
  """
  It is a utility function to generate binning rules based on the binning
  outcome table and the list of cut points.
  Parameters:
    tbl : An intermediate table of the binning outcome. Each row is also
          updated in place with a "rule" key.
    pts : A list of cut points for the binning
  Returns:
    A list of dictionaries with binning rules, restricted to the keys
    "bin", "freq", "miss", "ysum", "yavg", "newx" and "rule".
  """

  # NOTE(review): with an empty pts, a row with bin == 1 takes the "last bin"
  # branch and pts[-1] raises IndexError.
  for row in tbl:
    idx = row["bin"]

    if idx == 0:
      text = "numpy.isnan($X$)"
    elif idx == len(pts) + 1:
      # The last bin is open-ended above the largest cut point.
      with_nan = row["miss"] != 0
      text = "$X$ > " + str(pts[-1])
      if with_nan:
        text = text + " or numpy.isnan($X$)"
    elif idx == 1:
      # The first bin is open-ended below the smallest cut point.
      with_nan = row["miss"] != 0
      text = "$X$ <= " + str(pts[0])
      if with_nan:
        text = text + " or numpy.isnan($X$)"
    else:
      lower = str(pts[idx - 2])
      upper = str(pts[idx - 1])
      text = "$X$ > " + lower + " and $X$ <= " + upper

    row["rule"] = text

  keep = ["bin", "freq", "miss", "ysum", "yavg", "newx", "rule"]

  return([{name: row[name] for name in keep} for row in tbl])

########## 07. cal_newx() ##########

def cal_newx(x, bin):
  """
  It applies the transformation to a numeric vector based on the binning outcome.
  Parameters:
    x   : A numeric vector, which can be a list, 1-D numpy array, or pandas series
    bin : An object containing the binning outcome, i.e. a dictionary with the
          keys "cut" (cut points) and "tbl" (rows with "bin", "miss" and "newx").
  Returns:
    A list of dictionaries with three keys, in the order of x:
      "x"    : the original value
      "bin"  : the bin number assigned to the value
      "newx" : the transformed value of that bin
    A non-missing value whose bin number has no row with bin > 0 in bin["tbl"]
    is left out of the result. A missing value takes "bin" and "newx" from the
    row with miss > 0, or 0 and 0 when no row holds missing values.
  """

  # numpy.PINF / numpy.NINF were removed in NumPy 2.0; numpy.inf / -numpy.inf
  # work on every version.
  _cut = sorted(list(bin['cut']) + [numpy.inf, -numpy.inf])

  _val = list(x)
  _nan = [bool(_) for _ in numpy.isnan(x)]

  # One searchsorted call for all non-missing values; the results are consumed
  # in input order in the loop below.
  _grp = iter(numpy.searchsorted(_cut, [_v for _v, _m in zip(_val, _nan) if not _m]).tolist())

  # Bin number -> newx lookup instead of rescanning every value for each bin.
  _newx = {_['bin']: _['newx'] for _ in bin['tbl'] if _['bin'] > 0}

  # The missing-value bin is only looked up when x actually has missing values.
  _mbin, _mnew = 0, 0
  if any(_nan):
    _hit = [_ for _ in bin['tbl'] if _['miss'] > 0]
    if len(_hit) > 0:
      _mbin, _mnew = _hit[0]['bin'], _hit[0]['newx']

  _out = []
  for _v, _m in zip(_val, _nan):
    if _m:
      _out.append({'x': _v, 'bin': _mbin, 'newx': _mnew})
    else:
      _b = next(_grp)
      if _b in _newx:
        _out.append({'x': _v, 'bin': _b, 'newx': _newx[_b]})

  return(_out)

########## 08. view_bin() ##########

def view_bin(x):
  """
  It displays the binning outcome generated from a binning function, i.e. iso_bin().
  Parameters:
    x: An object containing the binning outcome, i.e. a dictionary whose "tbl"
       key holds the rows to print.
  Returns:
    None. The table is printed to the standard output.
  """

  # Module-level switch on tabulate, so the padding added to the "rule" column
  # below is kept when the table is rendered.
  tabulate.PRESERVE_WHITESPACE = True

  cols = ["bin", "freq", "miss", "ysum", "yavg", "newx"]

  rows = []
  for rec in x["tbl"]:
    row = {name: rec[name] for name in cols}
    row["rule"] = rec["rule"].ljust(45)
    rows.append(row)

  align = ["center"] + ["right"] * (len(cols) - 1)
  fmts = (".0f", ".0f", ".0f", ".4f", ".4f", ".4f")

  print(tabulate.tabulate(rows, headers = "keys", tablefmt = "github",
                          colalign = align, floatfmt = fmts))


########## 09. qtl_bin() ##########

def qtl_bin(x, y):
  """
  It discretizes the x vector based on percentiles and summarizes
  over the y vector to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize, e.g. list, numpy array, or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  Raises:
    IndexError : If no candidate set of cut points qualifies (see below).
  """

  # The third field is False for records with a missing x.
  rows = list(zip(x, y, ~numpy.isnan(x)))

  xs = [r[0] for r in rows if r[2] == 1]
  ys = [r[1] for r in rows if r[2] == 1]

  # Candidate cut points: quantile cuts of the non-missing x for every number
  # of categories tried, with identical sets of cuts collapsed.
  sizes = numpy.arange(2, max(3, min(100, len(numpy.unique(xs)) - 1)))
  cands = set(tuple(qcut(xs, k)) for k in sizes)

  tables = [[pts, manual_bin(xs, ys, pts)] for pts in cands]

  scored = []
  for pts, tbl in tables:
    rates = [b["ysum"] / b["freq"] for b in tbl]
    rho = spearmanr([b["bin"] for b in tbl], rates)[0]
    scored.append([pts, min(rates), max(rates), rho, tbl])

  # A candidate qualifies when the bin averages of y are perfectly monotone in
  # the bin number and the smallest average is positive. Candidates with more
  # cut points are tried first.
  scored.sort(key = lambda s: -len(s[0]))
  passed = [s for s in scored
            if numpy.abs(round(s[3], 8)) == 1 and round(s[1], 8) > 0]
  best = passed[0]
  cut = best[0]

  ordered = sorted(best[4], key = lambda b: b["ysum"] / b["freq"])

  full = add_miss(rows, ordered)

  return({"cut": cut, "tbl": gen_rule(gen_newx(full), cut)})

########## 10. iso_bin() ##########

def iso_bin(x, y):
  """
  It discretizes the x vector based on the isotonic regression and summarizes
  over the y vector to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize, e.g. list, numpy array, or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  """

  # The third field is False for records with a missing x.
  _data = [_ for _ in zip(x, y, ~numpy.isnan(x))]

  _x = [_[0] for _ in _data if _[2] == 1]
  _y = [_[1] for _ in _data if _[2] == 1]

  _cor = spearmanr(_x, _y)[0]
  _reg = isoreg()

  # y is multiplied by the sign of the correlation so that the regression is
  # always fitted on a non-decreasing relationship; numpy.abs undoes the flip.
  _f = numpy.abs(_reg.fit_transform(_x, [_ * _cor / numpy.abs(_cor) for _ in _y]))

  # Group the records by fitted value in a single pass instead of rescanning
  # every record once per distinct fitted value.
  _grp = {}
  for _fv, _xv, _yv in zip(_f, _x, _y):
    _grp.setdefault(_fv, []).append((_xv, _yv))

  # Candidate cut points: the largest x of every group that has a positive
  # average of y and at least 3 records.
  _c = sorted([max(_[0] for _ in _g) for _g in _grp.values()
               if numpy.mean([_[1] for _ in _g]) > 0 and len(_g) >= 3])

  # The largest candidate is never a cut; with more than two candidates the
  # smallest one is dropped as well.
  # NOTE(review): with fewer than two candidates _p is empty, and gen_rule()
  # indexes pts[-1] for the single bin, which raises IndexError.
  _p = _c[1:-1] if len(_c) > 2 else _c[:-1]

  _l4 = sorted(manual_bin(_x, _y, _p), key = lambda x: x["ysum"] / x["freq"])

  _l5 = add_miss(_data, _l4)

  return({"cut": _p, "tbl": gen_rule(gen_newx(_l5), _p)})

########## 11. gbm_bin() ##########

def gbm_bin(x, y):
  """
  It discretizes the x vector based on the gradient boosting machine and
  summarizes over the y vector to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize. It is a list, 1-D numpy array,
        or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x. It
        is a list, 1-D numpy array, or pandas series. A regressor is fitted on
        it, so it is not restricted to 0/1 values.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  """

  # The third field is False for records with a missing x.
  _data = [_ for _ in zip(x, y, ~numpy.isnan(x))]

  _x = [_[0] for _ in _data if _[2] == 1]
  _y = [_[1] for _ in _data if _[2] == 1]

  # The sign of the rank correlation decides the direction of the monotone
  # constraint put on the single tree.
  _cor = spearmanr(_x, _y)[0]
  _con = "1" if _cor > 0 else "-1"

  _col = numpy.reshape(_x, [-1, 1])

  _gbm = gbmreg(num_leaves = 100, min_child_samples = 3, n_estimators = 1,
                random_state = 1, monotone_constraints = _con)
  _gbm.fit(_col, _y)

  _f = numpy.abs(_gbm.predict(_col))

  # Group the records by predicted value in a single pass instead of
  # rescanning every record once per distinct prediction.
  _grp = {}
  for _fv, _xv, _yv in zip(_f, _x, _y):
    _grp.setdefault(_fv, []).append((_xv, _yv))

  # Candidate cut points: the largest x of every group that has a positive
  # average of y and at least 3 records.
  _c = sorted([max(_[0] for _ in _g) for _g in _grp.values()
               if numpy.mean([_[1] for _ in _g]) > 0 and len(_g) >= 3])

  # The largest candidate is never a cut; with more than two candidates the
  # smallest one is dropped as well.
  # NOTE(review): with fewer than two candidates _p is empty, and gen_rule()
  # indexes pts[-1] for the single bin, which raises IndexError.
  _p = _c[1:-1] if len(_c) > 2 else _c[:-1]

  _l4 = sorted(manual_bin(_x, _y, _p), key = lambda x: x["ysum"] / x["freq"])

  _l5 = add_miss(_data, _l4)

  return({"cut": _p, "tbl": gen_rule(gen_newx(_l5), _p)})

########## 12. rng_bin() ##########

def rng_bin(x, y):
  """
  It discretizes the x vector based on the range of x values and summarizes over
  the y vector to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize, e.g. list, numpy array, or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  Raises:
    IndexError : If no candidate set of cut points gives bin averages of y
                 that are perfectly monotone in the bin number with a
                 positive minimum.
  """

  # The third field is False for records with a missing x.
  _data = [_ for _ in zip(x, y, ~numpy.isnan(x))]
  _x = [_[0] for _ in _data if _[2] == 1]
  _y = [_[1] for _ in _data if _[2] == 1]

  _u = numpy.unique(_x)
  _n = numpy.arange(2, max(3, min(100, len(_u) - 1)))

  # Two single-cut fallbacks: the median of x where y > 0 and the median of
  # all non-missing x. The two can be equal.
  _m = [[numpy.median([_[0] for _ in _data if _[2] == 1 and _[1] > 0])],
        [numpy.median([_[0] for _ in _data if _[2] == 1])]]

  # Quantile cuts are taken over the distinct values of x, i.e. over its range
  # rather than over its distribution.
  _p = list(set(tuple(qcut(_u, _)) for _ in _n)) + _m

  # The binning of each candidate is kept next to its statistics. Looking the
  # table up again by comparing cut points would match both median fallbacks
  # when they are equal and break the sorted() call with two positional tables.
  _l2 = []
  for _cut in _p:
    _bins = manual_bin(_x, _y, _cut)
    _rate = [_["ysum"] / _["freq"] for _ in _bins]
    _l2.append([_cut, min(_rate), max(_rate),
                spearmanr([_["bin"] for _ in _bins], _rate)[0], _bins])

  # Candidates with more cut points are tried first; the sort is stable, so
  # ties keep the order of _p.
  _ok = [l for l in sorted(_l2, key = lambda x: -len(x[0]))
         if numpy.abs(round(l[3], 8)) == 1 and round(l[1], 8) > 0]

  _l3 = _ok[0][0]

  _l4 = sorted(_ok[0][4], key = lambda x: x["ysum"] / x["freq"])

  _l5 = add_miss(_data, _l4)

  return({"cut": _l3, "tbl": gen_rule(gen_newx(_l5), _l3)})

########## 13. kmn_bin() ##########

def kmn_bin(x, y):
  """
  It discretizes the x vector based on the kmeans clustering and summarizes over
  the y vector to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize, e.g. list, numpy array, or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  Raises:
    IndexError : If no candidate set of cut points gives bin averages of y
                 that are perfectly monotone in the bin number with a
                 positive minimum.
  """

  # The third field is False for records with a missing x.
  _data = [_ for _ in zip(x, y, ~numpy.isnan(x))]
  _x = [_[0] for _ in _data if _[2] == 1]
  _y = [_[1] for _ in _data if _[2] == 1]

  _n = numpy.arange(2, max(3, min(100, len(numpy.unique(_x)) - 1)))

  # Two single-cut fallbacks: the median of x where y > 0 and the median of
  # all non-missing x. The two can be equal.
  _m = [[numpy.median([_[0] for _ in _data if _[2] == 1 and _[1] > 0])],
        [numpy.median([_[0] for _ in _data if _[2] == 1])]]

  _col = numpy.reshape(_x, [-1, 1])

  # For every number of clusters, the cut points are the largest x of each
  # cluster except the overall largest one. Identical sets of cuts collapse.
  _cand = set()
  for _k in _n:
    _lab = kmeans(n_clusters = _k, random_state = 1, n_init = 'auto').fit(_col).labels_
    _top = {}
    for _g, _v in zip(_lab, _x):
      if _g not in _top or _v > _top[_g]:
        _top[_g] = _v
    _cand.add(tuple(sorted(_top.values())[:-1]))

  _c3 = list(_cand) + _m

  # The binning of each candidate is kept next to its statistics. Looking the
  # table up again by comparing cut points would match both median fallbacks
  # when they are equal and break the sorted() call with two positional tables.
  _l2 = []
  for _cut in _c3:
    _bins = manual_bin(_x, _y, _cut)
    _rate = [_["ysum"] / _["freq"] for _ in _bins]
    _l2.append([_cut, min(_rate), max(_rate),
                spearmanr([_["bin"] for _ in _bins], _rate)[0], _bins])

  # Candidates with more cut points are tried first; the sort is stable, so
  # ties keep the order of _c3.
  _ok = [l for l in sorted(_l2, key = lambda x: -len(x[0]))
         if numpy.abs(round(l[3], 8)) == 1 and round(l[1], 8) > 0]

  _l3 = _ok[0][0]

  _l4 = sorted(_ok[0][4], key = lambda x: x["ysum"] / x["freq"])

  _l5 = add_miss(_data, _l4)

  return({"cut": _l3, "tbl": gen_rule(gen_newx(_l5), _l3)})

########## 14. cnt_bin() ##########

def cnt_bin(x, y):
  """
  It discretizes the x vector based on percentiles and summarizes over
  the y vector with y > 0, i.e. nonzero count, to derive the variable transformation.
  Parameters:
    x : A numeric vector to discretize, e.g. list, numpy array, or pandas series.
    y : A numeric vector of frequency outcomes with the same length of x.
  Returns:
    A dictionary with two keys:
      "cut" : A numeric vector with cut points applied to the x vector.
      "tbl" : A list of dictionaries summarizing the binning outcome.
  Raises:
    IndexError : If no candidate set of cut points qualifies (see below).
  """

  # The third field is False for records with a missing x.
  rows = list(zip(x, y, ~numpy.isnan(x)))
  xs = [r[0] for r in rows if r[2] == 1]
  ys = [r[1] for r in rows if r[2] == 1]

  # The cut points come from the percentiles of x among the records with
  # y > 0 only, while the binning itself is summarized over all records.
  pos = [r[0] for r in rows if r[1] > 0 and r[2] == 1]

  sizes = numpy.arange(2, max(3, min(100, len(numpy.unique(pos)) - 1)))
  cands = set(tuple(qcut(pos, k)) for k in sizes)

  tables = [[pts, manual_bin(xs, ys, pts)] for pts in cands]

  scored = []
  for pts, tbl in tables:
    rates = [b["ysum"] / b["freq"] for b in tbl]
    rho = spearmanr([b["bin"] for b in tbl], rates)[0]
    scored.append([pts, min(rates), max(rates), rho, tbl])

  # A candidate qualifies when the bin averages of y are perfectly monotone in
  # the bin number and the smallest average is positive. Candidates with more
  # cut points are tried first.
  scored.sort(key = lambda s: -len(s[0]))
  passed = [s for s in scored
            if numpy.abs(round(s[3], 8)) == 1 and round(s[1], 8) > 0]
  best = passed[0]
  cut = best[0]

  ordered = sorted(best[4], key = lambda b: b["ysum"] / b["freq"])

  full = add_miss(rows, ordered)

  return({"cut": cut, "tbl": gen_rule(gen_newx(full), cut)})

