# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbdev_nbs/peptide/fragment.ipynb.

# %% auto 0
# Public names of this module, i.e. what `from <module> import *` exposes.
__all__ = ['get_charged_frag_types', 'parse_charged_frag_type', 'init_zero_fragment_dataframe',
           'init_fragment_dataframe_from_other', 'init_fragment_by_precursor_dataframe',
           'update_sliced_fragment_dataframe', 'get_sliced_fragment_dataframe', 'concat_precursor_fragment_dataframes',
           'calc_fragment_mz_values_for_same_nAA', 'mask_fragments_for_charge_greater_than_precursor_charge',
           'create_fragment_mz_dataframe_by_sort_precursor', 'create_fragment_mz_dataframe']

# %% ../../nbdev_nbs/peptide/fragment.ipynb 4
import numpy as np
import pandas as pd
from typing import List, Union, Tuple, Dict
import warnings

from .mass_calc import *
from alphabase.constants.modification import (
    calc_modloss_mass
)
from alphabase.constants.element import (
    MASS_H2O, MASS_PROTON, 
    MASS_NH3, CHEM_MONO_MASS
)

from alphabase.peptide.precursor import (
    refine_precursor_df,
    update_precursor_mz,
    is_precursor_sorted
)

# %% ../../nbdev_nbs/peptide/fragment.ipynb 5
def get_charged_frag_types(
    frag_types:List[str], 
    max_frag_charge:int = 2
)->List[str]:
    '''
    Expand fragment types into charged fragment type names.

    Each entry of `frag_types` is suffixed with `_z1`, `_z2`, ...,
    `_z{max_frag_charge}`. The output keeps the order of `frag_types`,
    with the charges of one fragment type listed consecutively.

    Parameters
    ----------
    frag_types : List[str]
        e.g. ['b','y','b_modloss','y_modloss']

    max_frag_charge : int
        highest fragment charge to generate. (default: 2)
    
    Returns
    -------
    List[str]
        charged fragment types, `len(frag_types)*max_frag_charge` items
    
    Examples
    --------
    >>> get_charged_frag_types(['b','y'], 2)
    ['b_z1', 'b_z2', 'y_z1', 'y_z2']
    '''
    charge_states = range(1, max_frag_charge+1)
    return [
        f"{frag_type}_z{charge}"
        for frag_type in frag_types
        for charge in charge_states
    ]

def parse_charged_frag_type(
    charged_frag_type: str
)->Tuple[str,int]:
    '''
    Opposite of `get_charged_frag_types`: split a charged fragment
    type name into its fragment type and its charge.

    The name is cut at its last underscore. The part after it is
    expected to be a single letter followed by the charge digits
    (e.g. 'z2'); the first character is dropped and the rest is 
    converted with `int`.
    
    Parameters
    ----------
    charged_frag_type : str
        e.g. 'y_z1', 'b_modloss_z1'

    Returns
    -------
    tuple
        str. Fragment type, e.g. 'b','y','b_modloss'

        int. Charge state

    Raises
    ------
    ValueError
        If the text after the first character of the last 
        underscore-separated token is not an integer.
    '''
    frag_type, _, charge_token = charged_frag_type.rpartition('_')
    return frag_type, int(charge_token[1:])

# %% ../../nbdev_nbs/peptide/fragment.ipynb 10
def init_zero_fragment_dataframe(
    peplen_array:np.ndarray,
    charged_frag_types:List[str], 
    dtype=np.float64
)->Tuple[pd.DataFrame, np.ndarray, np.ndarray]: 
    '''Create an all-zero fragment dataframe for peptides of the 
    given lengths.

    A peptide of length nAA occupies nAA-1 consecutive rows, so the 
    returned dataframe has np.sum(peplen_array-1) rows and one column 
    per item in `charged_frag_types`.

    Parameters
    ----------
    peplen_array : np.ndarray
        peptide lengths (nAA) for the fragment dataframe
        
    charged_frag_types : List[str]
        `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`

    dtype : optional
        dtype of the zero values. (default: np.float64)
    
    Returns
    -------
    tuple
        pd.DataFrame, `fragment_df` with zero values

        np.ndarray (int64), the start row in `fragment_df` of each peptide

        np.ndarray (int64), the end row (exclusive) in `fragment_df` of each peptide
    '''
    n_frags_per_peptide = np.asarray(peplen_array-1).astype(np.int64)
    # boundaries[i]:boundaries[i+1] is the row range of peptide i
    boundaries = np.concatenate((
        np.zeros(1, dtype=np.int64),
        np.cumsum(n_frags_per_peptide),
    ))
    zero_values = np.zeros(
        (boundaries[-1], len(charged_frag_types)), dtype=dtype
    )
    fragment_df = pd.DataFrame(zero_values, columns=charged_frag_types)
    return fragment_df, boundaries[:-1], boundaries[1:]

def init_fragment_dataframe_from_other(
    reference_fragment_df: pd.DataFrame,
    dtype=np.float64
):
    '''
    Create an all-zero fragment dataframe shaped like `reference_fragment_df`.

    The result has as many rows as the reference and the same columns, 
    but gets a fresh default index; the index of the reference 
    is not carried over.

    Parameters
    ----------
    reference_fragment_df : pd.DataFrame
        dataframe providing the row number and the columns

    dtype : optional
        dtype of the zero values. (default: np.float64)

    Returns
    -------
    pd.DataFrame
        zero-filled dataframe
    '''
    zero_values = np.zeros_like(
        reference_fragment_df.values, dtype=dtype
    )
    column_names = reference_fragment_df.columns
    return pd.DataFrame(zero_values, columns=column_names)

def init_fragment_by_precursor_dataframe(
    precursor_df,
    charged_frag_types: List[str],
    *,
    reference_fragment_df: pd.DataFrame = None,
    dtype:np.dtype=np.float64,
    inplace_in_reference:bool=False,
):
    '''
    Init zero fragment dataframe for the `precursor_df`. 

    Three situations are handled:

    - `precursor_df` has no 'frag_start_idx' column: the dataframe is 
      built from `precursor_df.nAA` and the columns 'frag_start_idx' and 
      'frag_end_idx' are added to `precursor_df` (it is modified in place).
    - `precursor_df` has 'frag_start_idx' and `reference_fragment_df` is 
      given: the result has the same number of rows as the reference.
    - `precursor_df` has 'frag_start_idx' but no reference is given: 
      the result has `precursor_df.frag_end_idx.max()` rows 
      (zero rows if `precursor_df` is empty).
    
    Parameters
    ----------
    precursor_df : pd.DataFrame
        precursors to generate fragment masses,
        if `precursor_df` contains the 'frag_start_idx' column, 
        it is better to provide `reference_fragment_df` as 
        `precursor_df.frag_start_idx` and `precursor.frag_end_idx` 
        point to the indices in `reference_fragment_df`

    charged_frag_types : List
        `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`

    reference_fragment_df : pd.DataFrame
        init zero fragment_mz_df based
        on this reference. If None, fragment_mz_df will be 
        initialized by :func:`alphabase.peptide.fragment.init_zero_fragment_dataframe`.
        Defaults to None.

    dtype : np.dtype, optional
        dtype of every newly allocated zero dataframe. 
        (default: np.float64)

    inplace_in_reference : bool, optional
        if True (and `reference_fragment_df` is given and 
        `precursor_df` has 'frag_start_idx'), no zero dataframe is 
        allocated; instead the columns of `reference_fragment_df` that 
        are listed in `charged_frag_types` are returned with their 
        current values. (default: False)

    Returns
    -------
    pd.DataFrame
        zero `fragment_df` with given `charged_frag_types` columns, or 
        the selected reference columns when `inplace_in_reference` is True
    '''
    if 'frag_start_idx' not in precursor_df.columns:
        (
            fragment_df, start_indices, end_indices
        ) = init_zero_fragment_dataframe(
            precursor_df.nAA.values,
            charged_frag_types,
            dtype=dtype
        )
        precursor_df['frag_start_idx'] = start_indices
        precursor_df['frag_end_idx'] = end_indices
        return fragment_df

    if reference_fragment_df is None:
        # No reference to size against: cover every row that the 
        # precursors point to. `max()` of an empty column is NaN, 
        # so the empty case is handled explicitly.
        if len(precursor_df) > 0:
            n_rows = int(precursor_df.frag_end_idx.max())
        else:
            n_rows = 0
    elif inplace_in_reference:
        # NOTE(review): selecting columns with a list builds a new 
        # DataFrame object, so later writes to the result may not 
        # reach `reference_fragment_df` - confirm with callers.
        return reference_fragment_df[[
            _fr for _fr in charged_frag_types 
            if _fr in reference_fragment_df.columns
        ]]
    else:
        n_rows = len(reference_fragment_df)

    # `dtype` is honoured here as well, so all allocation paths agree.
    return pd.DataFrame(
        np.zeros((n_rows, len(charged_frag_types)), dtype=dtype),
        columns = charged_frag_types
    )

# %% ../../nbdev_nbs/peptide/fragment.ipynb 12
def update_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    values: np.ndarray,
    frag_start_end_list: List[Tuple[int,int]],
    charged_frag_types: List[str]=None,
)->pd.DataFrame:
    '''
    Set the values of the slices `frag_start_end_list=[(start,end),(start,end),...]` 
    of fragment_df. `fragment_df` is modified in place.

    Parameters
    ----------
    fragment_df : pd.DataFrame
        fragment dataframe to set the values

    values : np.ndarray
        values to set, one row per row selected by `frag_start_end_list` 
        (rows are taken slice by slice, in list order)

    frag_start_end_list : List[Tuple[int,int]]
        e.g. `[(start,end),(start,end),...]`, `end` is exclusive. 
        If it is empty, nothing is written.

    charged_frag_types : List[str], optional
        e.g. `['b_z1','b_z2','y_z1','y_z2']`.
        If None, the columns of values should be the same as fragment_df's columns.
        It can be much faster if charged_frag_types is None, as the values 
        are then written straight into the numpy array behind `fragment_df` 
        whenever pandas exposes it as a writable view; otherwise 
        `DataFrame.iloc` is used (slower).
        Defaults to None.
    
    Returns
    -------
    pd.DataFrame
        fragment_df after the values are set into slices
    '''
    frag_slice_list = [slice(start,end) for start,end in frag_start_end_list]
    if len(frag_slice_list) == 0:
        # `np.r_` cannot concatenate zero slices; nothing to update anyway
        return fragment_df
    frag_slices = np.r_[tuple(frag_slice_list)]

    if charged_frag_types is None or len(charged_frag_types)==0:
        raw_values = fragment_df.values
        # Writing through `.values` only reaches the dataframe if pandas 
        # handed out a writable view of its data. With Copy-on-Write the 
        # array is read-only, and for frames that are not stored as a 
        # single block `.values` is a fresh copy on every call (two 
        # copies never share memory). Fall back to `iloc` in those cases.
        if (
            raw_values.flags.writeable and 
            np.may_share_memory(raw_values, fragment_df.values)
        ):
            raw_values[frag_slices, :] = values
        else:
            fragment_df.iloc[frag_slices, :] = values
    else:
        charged_frag_idxes = [
            fragment_df.columns.get_loc(c) for c in charged_frag_types
        ]
        fragment_df.iloc[frag_slices, charged_frag_idxes] = values
    return fragment_df

def get_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    frag_start_end_list:Union[List,np.ndarray],
    charged_frag_types:List = None,
)->pd.DataFrame:
    '''
    Get the sliced fragment_df from `frag_start_end_list=[(start,end),(start,end),...]`.

    The rows of all slices are collected in list order and selected 
    by position with `DataFrame.iloc`.
    
    Parameters
    ----------
    fragment_df : pd.DataFrame
        fragment dataframe to get values

    frag_start_end_list : Union
        List[Tuple[int,int]], e.g. `[(start,end),(start,end),...]` or np.ndarray. 
        `end` is exclusive. An empty list gives a dataframe with zero rows.

    charged_frag_types : List[str]
        e.g. `['b_z1','b_z2','y_z1','y_z2']`.
        if None (or empty), all columns will be considered
    
    Returns
    -------
    pd.DataFrame
    
        sliced fragment_df. If `charged_frag_types` is None, 
        return fragment_df with all columns
    '''
    frag_slice_list = [slice(start,end) for start,end in frag_start_end_list]
    if len(frag_slice_list) > 0:
        frag_slices = np.r_[tuple(frag_slice_list)]
    else:
        # `np.r_` cannot concatenate zero slices; select no rows instead
        frag_slices = np.zeros(0, dtype=np.int64)

    if charged_frag_types is None or len(charged_frag_types)==0:
        charged_frag_idxes = slice(None)
    else:
        charged_frag_idxes = [
            fragment_df.columns.get_loc(c) for c in charged_frag_types
        ]
    return fragment_df.iloc[frag_slices, charged_frag_idxes]

# %% ../../nbdev_nbs/peptide/fragment.ipynb 14
def concat_precursor_fragment_dataframes(
    precursor_df_list: List[pd.DataFrame],
    fragment_df_list: List[pd.DataFrame],
    *other_fragment_df_lists
)->Tuple[pd.DataFrame,...]:
    '''
    Concatenate precursor dataframes together with their fragment dataframes.

    The 'frag_start_idx'/'frag_end_idx' columns of a precursor dataframe 
    are row positions in its own fragment dataframe. After the fragment 
    dataframes are stacked, these positions are off by the total number 
    of rows of all preceding fragment dataframes; this function adds that 
    offset. The input precursor dataframes are copied and left untouched.
    
    Parameters
    ----------
    precursor_df_list : List[pd.DataFrame]
        precursor dataframe list to concatenate

    fragment_df_list : List[pd.DataFrame]
        fragment dataframe list to concatenate, 
        `fragment_df_list[i]` belongs to `precursor_df_list[i]`

    *other_fragment_df_lists
        arbitrary other fragment dataframe lists to concatenate, 
        e.g. fragment_mass_df, fragment_inten_df, ...
    
    Returns
    -------
    Tuple[pd.DataFrame,...]
        concatenated precursor_df, fragment_df, *other_fragment_df ...
    '''
    idx_columns = ['frag_start_idx','frag_end_idx']
    # row_offsets[k] = number of fragment rows in dataframes 0..k
    row_offsets = np.cumsum([len(frag_df) for frag_df in fragment_df_list])
    shifted_precursor_dfs = [df.copy() for df in precursor_df_list]
    for k in range(1, len(shifted_precursor_dfs)):
        shifted_precursor_dfs[k][idx_columns] += row_offsets[k-1]

    groups_to_concat = (
        shifted_precursor_dfs, fragment_df_list, *other_fragment_df_lists
    )
    return tuple(
        pd.concat(df_list, ignore_index=True)
        for df_list in groups_to_concat
    )

# %% ../../nbdev_nbs/peptide/fragment.ipynb 15
def calc_fragment_mz_values_for_same_nAA(
    df_group:pd.DataFrame, 
    nAA:int, 
    charged_frag_types:list
):
    '''
    Calculate the fragment m/z values of precursors that all have 
    the same peptide length.

    Parameters
    ----------
    df_group : pd.DataFrame
        precursors with the columns 'sequence', 'mods' and 'mod_sites' 
        ('mods' and 'mod_sites' are ';'-separated strings). If the column 
        'mod_deltas' exists, 'mod_deltas' and 'mod_delta_sites' are parsed 
        the same way and passed on to the mass calculation.

    nAA : int
        peptide length shared by all rows of `df_group`; 
        only used for the modloss calculation

    charged_frag_types : list
        e.g. `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`. 
        Supported fragment types are b, y, c, z and the b/y variants 
        with `_modloss`, `_H2O` and `_NH3`. The plain names 'b' and 'y' 
        (no charge suffix) give the neutral masses.

    Returns
    -------
    np.ndarray
        array with one column per item of `charged_frag_types`, 
        in the same order. Rows follow the flattened b/y mass arrays 
        of the mass calculation (presumably nAA-1 rows per precursor - 
        confirm against `calc_b_y_and_peptide_masses_for_same_len_seqs`).

    Raises
    ------
    NotImplementedError
        If a fragment type is not supported.
    '''
    mod_list = df_group.mods.str.split(';').apply(
        lambda x: [m for m in x if len(m)>0]
    ).values
    site_list = df_group.mod_sites.str.split(';').apply(
        lambda x: [int(s) for s in x if len(s)>0]
    ).values

    if 'mod_deltas' in df_group.columns:
        mod_delta_list = df_group.mod_deltas.str.split(';').apply(
            lambda x: [float(m) for m in x if len(m)>0]
        ).values
        mod_delta_site_list = df_group.mod_delta_sites.str.split(';').apply(
            lambda x: [int(s) for s in x if len(s)>0]
        ).values
    else:
        mod_delta_list = None
        mod_delta_site_list = None

    # the peptide masses (third return value) are not needed here
    b_mass, y_mass, _ = calc_b_y_and_peptide_masses_for_same_len_seqs(
        df_group.sequence.values.astype('U'), 
        mod_list, site_list,
        mod_delta_list,
        mod_delta_site_list
    )
    b_mass = b_mass.reshape(-1)
    y_mass = y_mass.reshape(-1)

    # modloss masses are only computed when a modloss column is requested
    if any(
        frag_type.startswith('b_modloss') 
        for frag_type in charged_frag_types
    ):
        b_modloss = np.concatenate([
            calc_modloss_mass(nAA, mods, sites, True)
            for mods, sites in zip(mod_list, site_list)
        ])
    if any(
        frag_type.startswith('y_modloss') 
        for frag_type in charged_frag_types
    ):
        y_modloss = np.concatenate([
            calc_modloss_mass(nAA, mods, sites, False)
            for mods, sites in zip(mod_list, site_list)
        ])

    mz_values = []
    add_proton = MASS_PROTON
    for charged_frag_type in charged_frag_types:
        # Neutral masses also considered for future uses. They are 
        # handled inside this loop so that the output columns keep the 
        # order of `charged_frag_types`, and so that these names never 
        # reach `parse_charged_frag_type` (they carry no charge suffix).
        if charged_frag_type == 'b':
            mz_values.append(b_mass)
            continue
        if charged_frag_type == 'y':
            mz_values.append(y_mass)
            continue

        frag_type, charge = parse_charged_frag_type(charged_frag_type)
        if frag_type =='b':
            _mass = b_mass/charge + add_proton
        elif frag_type == 'y':
            _mass = y_mass/charge + add_proton
        elif frag_type == 'b_modloss':
            _mass = (b_mass-b_modloss)/charge + add_proton
            # positions without a modloss get 0 instead of an m/z value
            _mass[b_modloss == 0] = 0
        elif frag_type == 'y_modloss':
            _mass = (y_mass-y_modloss)/charge + add_proton
            # positions without a modloss get 0 instead of an m/z value
            _mass[y_modloss == 0] = 0
        elif frag_type == 'b_H2O':
            _mass = (b_mass-MASS_H2O)/charge + add_proton
        elif frag_type == 'y_H2O':
            _mass = (y_mass-MASS_H2O)/charge + add_proton
        elif frag_type == 'b_NH3':
            _mass = (b_mass-MASS_NH3)/charge + add_proton
        elif frag_type == 'y_NH3':
            _mass = (y_mass-MASS_NH3)/charge + add_proton
        elif frag_type == 'c':
            _mass = (b_mass+MASS_NH3)/charge + add_proton
        elif frag_type == 'z':
            _mass = (
                y_mass-(MASS_NH3-CHEM_MONO_MASS['H'])
            )/charge + add_proton
        else:
            raise NotImplementedError(
                f'Fragment type "{frag_type}" is not in fragment_mz_df.'
            )
        mz_values.append(_mass)
    return np.array(mz_values).T

# %% ../../nbdev_nbs/peptide/fragment.ipynb 16
def mask_fragments_for_charge_greater_than_precursor_charge(
    fragment_df:pd.DataFrame, 
    precursor_charge_array:np.ndarray,
    nAA_array:np.ndarray,
    *,
    candidate_fragment_charges:list = [2,3,4],
):
    """Zero out fragment values whose fragment charge is larger 
    than the charge of their precursor.

    Each precursor charge is repeated nAA-1 times, giving one charge 
    per row of `fragment_df`. For every charge in 
    `candidate_fragment_charges`, the columns whose name ends with 
    'z{charge}' are set to 0 in all rows where the precursor charge 
    is smaller than that charge. `fragment_df` is modified in place 
    and returned.
    """
    charge_per_row = np.repeat(
        precursor_charge_array, nAA_array-1
    )
    for frag_charge in candidate_fragment_charges:
        suffix = f'z{frag_charge}'
        matching_columns = [
            column for column in fragment_df.columns
            if column.endswith(suffix)
        ]
        for column in matching_columns:
            fragment_df.loc[
                charge_per_row<frag_charge, column
            ] = 0
    return fragment_df

# %% ../../nbdev_nbs/peptide/fragment.ipynb 18
def create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df: pd.DataFrame,
    charged_frag_types:List,
    batch_size:int=500000,
)->pd.DataFrame:
    """Create the fragment mz dataframe after refining (sorting) 
    `precursor_df`, which is the faster way to create it.

    After `refine_precursor_df`, fresh fragment indices are assigned in 
    row order, so the fragments of one batch of same-length precursors 
    form one continuous row range that can be written in a single 
    assignment.

    Note that this function changes `precursor_df` in place: existing 
    'frag_start_idx'/'frag_end_idx' columns are dropped and re-created, 
    and the order and index of `precursor_df` will change.

    Parameters
    ----------
    precursor_df : pd.DataFrame
        precursor dataframe

    charged_frag_types : List
        fragment types list

    batch_size : int, optional
        Number of precursors of the same length to calculate at once. 
        Defaults to 500000.

    Returns
    -------
    pd.DataFrame
        fragment mz dataframe with `charged_frag_types` as columns; 
        fragment charges above the precursor charge are masked to 0
    """
    frag_idx_columns = ['frag_start_idx','frag_end_idx']
    if 'frag_start_idx' in precursor_df.columns:
        # old indices become invalid once the precursors are re-ordered
        precursor_df.drop(columns=frag_idx_columns, inplace=True)

    refine_precursor_df(precursor_df)

    fragment_mz_df = init_fragment_by_precursor_dataframe(
        precursor_df, charged_frag_types
    )

    for nAA, same_len_df in precursor_df.groupby('nAA'):
        for batch_start in range(0, len(same_len_df), batch_size):
            batch_df = same_len_df.iloc[
                batch_start:batch_start+batch_size, :
            ]

            batch_mz_values = calc_fragment_mz_values_for_same_nAA(
                batch_df, nAA, charged_frag_types
            )

            # rows from the first precursor's start to the last one's end
            first_row = batch_df.frag_start_idx.values[0]
            last_row = batch_df.frag_end_idx.values[-1]
            fragment_mz_df.iloc[first_row:last_row, :] = batch_mz_values

    return mask_fragments_for_charge_greater_than_precursor_charge(
        fragment_mz_df,
        precursor_df.charge.values,
        precursor_df.nAA.values,
    )

def create_fragment_mz_dataframe(
    precursor_df: pd.DataFrame,
    charged_frag_types:List,
    *,
    reference_fragment_df: pd.DataFrame = None,
    inplace_in_reference:bool = False,
    batch_size:int=500000,
)->pd.DataFrame:
    '''
    Generate fragment mz dataframe for the precursor_df. 

    The work is dispatched as follows:

    - No `reference_fragment_df`, but `precursor_df` has 'frag_start_idx': 
      a zero dataframe is initialized from `precursor_df` and this function 
      calls itself with that dataframe as the reference.
    - `precursor_df` has no 'nAA' column, or `is_precursor_sorted` 
      is True and there is no reference: the fast 
      :func:`create_fragment_mz_dataframe_by_sort_precursor` is used.
    - Otherwise (slow): values are written slice by slice according to 
      'frag_start_idx'/'frag_end_idx' of each precursor.
    
    Parameters
    ----------
    precursor_df : pd.DataFrame
        precursors to generate fragment masses,
        if `precursor_df` contains the 'frag_start_idx' column, 
        it is better to provide `reference_fragment_df`
    charged_frag_types : List
        `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`

    reference_fragment_df : pd.DataFrame
        kwargs only. Generate fragment_mz_df based on this reference, 
        as `precursor_df.frag_start_idx` and 
        `precursor.frag_end_idx` point to the indices in 
        `reference_fragment_df`.
        Defaults to None

    inplace_in_reference : bool
        kwargs only. Start from the columns of `reference_fragment_df` 
        that are listed in `charged_frag_types` instead of from a new 
        zero dataframe.
        NOTE(review): the columns are taken with `.loc[:, [...]]`, which 
        looks like it yields a new object rather than a view - confirm 
        whether `reference_fragment_df` itself is expected to change.
        Defaults to False

    batch_size: int
        Number of peptides for each batch, to save RAM.

    Returns
    -------
    pd.DataFrame
        `fragment_mz_df` with given `charged_frag_types`
    '''
    no_reference = reference_fragment_df is None

    if no_reference and 'frag_start_idx' in precursor_df.columns:
        zero_fragment_df = init_fragment_by_precursor_dataframe(
            precursor_df, charged_frag_types,
        )
        return create_fragment_mz_dataframe(
            precursor_df=precursor_df, 
            charged_frag_types=charged_frag_types,
            reference_fragment_df=zero_fragment_df,
            inplace_in_reference=True,
            batch_size=batch_size,
        )

    if 'nAA' not in precursor_df.columns:
        # fast
        return create_fragment_mz_dataframe_by_sort_precursor(
            precursor_df, charged_frag_types, batch_size
        )

    if is_precursor_sorted(precursor_df) and no_reference:
        # fast
        return create_fragment_mz_dataframe_by_sort_precursor(
            precursor_df, charged_frag_types, batch_size
        )

    # slow
    if no_reference:
        fragment_mz_df = init_fragment_by_precursor_dataframe(
            precursor_df, charged_frag_types,
        )
    elif inplace_in_reference:
        shared_frag_types = [
            _fr for _fr in charged_frag_types 
            if _fr in reference_fragment_df.columns
        ]
        fragment_mz_df = reference_fragment_df.loc[:,shared_frag_types]
    else:
        fragment_mz_df = pd.DataFrame(
            np.zeros((
                len(reference_fragment_df), 
                len(charged_frag_types)
            )),
            columns = charged_frag_types
        )

    for nAA, same_len_df in precursor_df.groupby('nAA'):
        for batch_start in range(0, len(same_len_df), batch_size):
            batch_df = same_len_df.iloc[
                batch_start:batch_start+batch_size, :
            ]

            # use the dataframe's own columns: with `inplace_in_reference` 
            # they can be a subset of `charged_frag_types`
            batch_mz_values = calc_fragment_mz_values_for_same_nAA(
                batch_df, nAA, fragment_mz_df.columns
            )
            
            update_sliced_fragment_dataframe(
                fragment_mz_df, batch_mz_values, 
                batch_df[['frag_start_idx','frag_end_idx']].values, 
            )

    # NOTE(review): the mask repeats each precursor charge nAA-1 times in 
    # `precursor_df` row order, so it presumably expects the fragment rows 
    # to follow that same order and to be fully covered - confirm for 
    # unsorted precursors used with a reference.
    return mask_fragments_for_charge_greater_than_precursor_charge(
        fragment_mz_df,
        precursor_df.charge.values,
        precursor_df.nAA.values,
    )

