'''
General data models for metabolomics data.

For experimental data,
The hierarchy is Experiment -> empCpd -> Features -> Peaks

For theoretical data,
The hierarchy is Network/pathway -> reactions -> compounds

Not all concepts have to be explicitly modeled in a project (e.g. expt, peak, network).
Use derived/inherited classes for more explicit or specialized data.

To learn about mass spectrometry concepts and pre-processing:
https://pyopenms.readthedocs.io/en/latest/datastructures.html
https://github.com/jorainer/metabolomics2018

To learn about genome scale metabolic models:
https://link.springer.com/article/10.1186/s13059-019-1730-3
https://link.springer.com/protocol/10.1007/978-1-0716-0239-3_19

'''

import json

#
# Experimental concepts: experiment, peak, feature, empirical compound
# only considering mass spec data here
#

class Experiment:
    '''
    An experiment of LC-MS, LC-MS/MS, GC-MS, LC-IMS, etc.
    This can be equivalent to XCMSnExp class in the XCMS R package,
    or MSExperiment in the OpenMS software,
    but need not be so extensive when pre-processing is not the focus.

    Measurement data are attached to an Experiment,
    in the form of a list of features and
    a list of empCpds (the latter generated by annotation).
    Flexibility is given in that any type of data can be attached.

    For LC-MS, the feature-level data is a DataFrame,
    features in rows and observations (samples) in columns, similar to a gene expression data matrix.
    On disk, the data can follow the convention of ANNdata and HiCoNet,
    the 3-file-society Data Structure: DataMatrix, FeatureAnnotation and ObservationAnnotation.
    The DataMatrix in file format uses a single row for observation IDs and a single column for feature IDs.
    Ref: https://github.com/shuzhao-li/hiconet

    The empCpd-level data can be in JSON or other formats.

    The class-level attributes below are defaults/templates.
    The mutable ones (dicts and lists) are copied per instance in __init__,
    so that modifying one Experiment does not alter another.
    '''
    id = 'EXP00001234'                         # long str to be unique in the world
    input_data_from = ''

    # expt meta data
    type = 'LC-MS'
    instrument = ''
    instrument_parameters = {}
    study_metadata = {}

    chromatography = ''
    chromatography_parameters = {
        'column_length': '',
        'column_diameter': '',
        'total_time': '300', # seconds
        'gradient': '',
        # etc.
    }

    preprocess_software = ''
    preprocess_parameters = {
        'version': '0.0',
        'ppm': 1,
        'SNR': 1.5,
        # etc.
    }

    # data
    feature_DataFrame = None
    FeatureAnnotation = {}
    # keys must be str literals; bare names here would raise NameError when the class is defined
    ObservationAnnotation = {
        'sample_list': [],
        'file_sample_mapper': {},
    }
    # immutable ordered version of sample_list
    ordered_samples = ()

    # EmpiricalCompounds, after annotation
    List_of_empCpds = []

    def __init__(self, id):
        '''
        Create an Experiment.

        id: identifier of this experiment, stored as self.id.
            It should be a long str so that it is unique in the world.
        '''
        self.id = id

        # Copy the class-level mutable defaults onto the instance.
        # Without this, all instances would share (and mutate) the same objects.
        self.instrument_parameters = dict(self.instrument_parameters)
        self.study_metadata = dict(self.study_metadata)
        self.chromatography_parameters = dict(self.chromatography_parameters)
        self.preprocess_parameters = dict(self.preprocess_parameters)
        self.FeatureAnnotation = dict(self.FeatureAnnotation)
        self.ObservationAnnotation = {
            'sample_list': list(self.ObservationAnnotation['sample_list']),
            'file_sample_mapper': dict(self.ObservationAnnotation['file_sample_mapper']),
        }
        self.List_of_empCpds = list(self.List_of_empCpds)

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build an Experiment from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Experiment as JSON.
        '''
        return None
    

class Peak:
    '''
    The default is a chromatographic peak in LC-MS, specific to a sample in an experiment.
    This can be extended to other type of peaks as needed.

    Preprocessing software extracts peaks per sample, then performs alignment/correspondence.
    For high-resolution data, m/z alignment isn't a major concern.
    Retention time alignment shifts the data values.
    For this class, pre-alignment data is preferred,
    so that people can use different methods for their own alignment.

    When data tables come as post-alignment data,
    they are accommodated in list_retention_time_corrected.

    The class-level attributes below are defaults/templates.
    The lists are copied per instance in __init__, so that peaks do not share them.
    '''
    id = 'P00001234'
    ms_level = 1                    # MS level - 1, 2, 3, etc.
    ionization = 'positive'
    # XIC and peak_shape are defined by intensity as the function of rtime
    list_retention_time = []
    list_intensity = []
    # if RT aligned/adjusted
    list_retention_time_corrected = []

    # derivative to XIC
    mz, min_mz, max_mz = 0, 0, 0
    rtime, min_rtime, max_rtime = 0, 0, 0
    # other attributes of interest
    # collision_cross_section = 0     # reserved for IM data

    # optional as this can be reverse indexed
    corresponding_feature_id = ''   # belong to which feature after correspondence
    experiment_belonged = ''

    def __init__(self):
        '''
        Create a Peak with its own copies of the list attributes.
        Scalar attributes fall back to the class-level defaults until assigned.
        '''
        # Without per-instance lists, appending to one peak's data
        # would silently modify every other peak.
        self.list_retention_time = list(self.list_retention_time)
        self.list_intensity = list(self.list_intensity)
        self.list_retention_time_corrected = list(self.list_retention_time_corrected)

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Peak from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Peak as JSON.
        '''
        return None
    

class Feature:
    '''
    A feature is a set of peaks that are aligned across samples.
    So this is experiment specific.
    The m/z and retention_time of a feature is summarized on the member peaks.
    The variation between samples is reflected in data at peak level.

    The default is LC-MS feature. Derivative classes include MS2feature, etc.
    '''

    # NOTE: __init__ must be a plain method. Wrapping it in @property
    # makes Feature(...) raise TypeError, so the class cannot be instantiated.
    def __init__(self, id):
        '''
        Create a Feature.

        id: identifier of this feature, e.g. 'F00001234'; stored as self.id.
        All other attributes start from the defaults set below.
        '''
        self.id = id                # e.g. 'F00001234'
        self.ms_level = 1           # MS level - 1, 2, 3, etc.
        self.mz = 0
        self.rtime = 0

        # other attributes of interest
        # (stored on self; as bare local variables they would be discarded)
        self.including_peaks = []

        self.experiment_belonged = ''

        # statistics across samples
        self.statistics = {
            'intensity_sample_mean': None,
            'intensity_sample_std': None,
            'intensity_sample_cv': None,
            'intensity_replicate_cv': None,
            # statistic_score and p_value depend on the statistical test
            'statistic_score': None,
            'p_value': None,
        }

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Feature from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Feature as JSON.
        '''
        return None
    


class EmpiricalCompound:
    '''
    EmpiricalCompound is a tentative compound/metabolite,
    a computational unit to represent the result of annotation on mass spec experiment.
    It should have reference to multiple ions (isotopes/adducts) that belong to the same metabolite,
    and can be a mixture of isobaric/isomeric metabolites when they are not distinguished by the mass spec data.

    This is used because the identification of compounds is not definitive at many stages of a project,
    and this allows probabilistic annotation on experimental data.
    The probability ranges between [0, 1].
    This unit then enables approaches to factor the probabilistic models into biological interpretation (e.g. mummichog).
    If an annotation method only provides scores (e.g. from MS2 search), mummichog will use them.

    EmpiricalCompound is experiment specific,
    and can combine multiple methods, including pos and neg ESI, MS^n.

    Similar concepts are 'pseudo spectrum' in CAMERA, and 'feature group' in mz.Unity.
    '''

    # NOTE: __init__ must be a plain method. Wrapping it in @property
    # makes EmpiricalCompound() raise TypeError, so the class cannot be instantiated.
    def __init__(self):
        '''
        An empCpd is the result of annotation.
        It has one and only one base neutral mass.
        Many attributes are optional.

        The values assigned below are templates that illustrate the expected
        structure of each attribute; they are to be replaced by real data.
        '''
        self.id = 'E00001234'
        self.neutral_base_mass = 0.0000

        # Experiment specific.
        self.experiment_belonged = ''
        self.annotation_method = ''

        # after annotation, not ruling out an empCpd can be mixture (isomers, etc)
        # neutral formulae
        self.candidate_formulae = []

        #
        # one of the scores or probabilities is expected after annotation
        #
        # How to assign probability depends on annotation method
        # Changing from dict to list, as these are tables not easy keys
        self.identity_probability = [
            # [probability, (compound or mixtures)]
            # The trailing comma makes a 1-tuple; ('Compound x') alone is just a str.
            [0.0, ('Compound x',)],
            [0.0, ('Compound y', 'Compound z')],
        ]

        self.identity_probability_mummichog = [
            # updated probability after mummichog analysis
        ]

        # Scores from annotation method, same layout as identity_probability
        self.identity_scores = [
            [0.0, ('Compound x',)],
            [0.0, ('Compound y', 'Compound z')],
        ]

        # Representative intensity values, can base on the MS1 feature of highest intensity
        self.intensities = {
            "sample1": 0, "sample2": 0, # etc
        }
        # more efficient version of self.intensities
        self.intensities_by_ordered_samples = []

        # this is list of MS1 features, either using pointers to Features in the database,
        # or shorthanding everything here.
        # How to group ions into empCpd depends on annotation method
        self.MS1_pseudo_Spectra = [
            {'feature': 'row23', 'ion': 'M+H[1+]', 'm/z': 169.0013, 'rtime': 55},
            {},
            # ...
        ]

        # this is list of  MS2 features; templated on MONA JSON
        self.MS2_Spectra = [
            {"instrument": "",
            "precursor type": "M+H[1+]",
            "precursor m/z": 169,
            "collision energy": "30V",
            "score": 5.5,
            "spectrum": "59.000:0.615142 72.600:0.031546 74.600:0.015773 78.900:0.086751 85.200:1.490536 150.500:0.055205 166.000:0.055205 167.200:100.000000",
            },
            {},
            {}
        ]

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build an EmpiricalCompound from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this EmpiricalCompound as JSON.
        '''
        return None
    


#
# Theoretical concepts (metabolic model): compound, reaction, pathway, network
#

class Compound:
    '''
    All metabolites are compounds, but the reverse is not true.
    Thus, compound is a basic class.

    Azimuth ID starts with `az`,
    and incorporates HMDB ID (less ambiguous than KEGG) wherever possible.
    '''
    def __init__(self):
        '''
        Create an empty Compound; all attributes start as empty placeholders.
        '''
        self.internal_id = ''
        self.name = ''          # common name
        # identifiers of this compound in external databases, keyed by database name
        self.db_ids = {
            'KEGG': '',
            'HMDB': '',
            'Azimuth': '',
            'PubChem': '',
            'MetaNetX': '',
            # etc.
            }
        self.neutral_formula = ''
        self.neutral_mono_mass = 0.0000

        self.SMILES = ''
        self.inchi = ''

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Compound from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Compound as JSON.
        '''
        return None
    

class Reaction:
    '''
    A reaction is defined by reactants and products, each a list of compounds.
    There is directionality of a reaction. A forward reaction is different from reverse reaction.
    We can treat the reactions catalyzed by different enzymes as the same.

    Reactions are species specific,
    because genes are species specific.
    '''
    def __init__(self):
        '''
        Create an empty Reaction; all attributes start as empty placeholders.
        '''
        self.azimuth_id = ''
        self.source = []
        self.version = ''
        # status, one of ['active', 'under review', 'obsolete']
        self.status = ''

        self.reactants = []
        self.products = []

        # below this line are optional
        self.enzymes = []
        self.genes = []

        # belong to
        self.pathways = []
        # still looking for good ontology for reactions. Maybe notes like "Glucuronidation" for now.
        self.ontologies = []

        self.species = ''
        self.compartments = []
        self.cell_types = []
        self.tissues = []

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Reaction from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Reaction as JSON.
        '''
        return None
    

class Pathway:
    '''
    A pathway is defined by connected biochemical reactions, according to human definition.
    '''
    def __init__(self):
        '''
        Create an empty Pathway; all attributes start as empty placeholders.
        '''
        self.azimuth_id = ''
        self.name = ''
        self.source = []
        self.list_of_reactions = []
        self.status = ''

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Pathway from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Pathway as JSON.
        '''
        return None
    

class Network:
    '''
    Metabolic network
    is defined by connected biochemical reactions.

    Network is mathematically identical to pathway, but not limited by pathway definition.
    Edges and nodes are computed based on reactions.

    All based on prior knowledge.
    This class does not include correlation networks and the like.
    '''
    def __init__(self):
        '''
        Create an empty Network; all attributes start as empty placeholders.
        '''
        self.azimuth_id = ''
        self.name = ''
        self.source = []
        self.list_of_reactions = []
        self.status = ''

    @classmethod
    def from_json(cls):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to build a Network from JSON.
        '''
        return None

    def to_json(self):
        '''
        Placeholder, not implemented yet: does nothing and returns None.
        Presumably meant to export this Network as JSON.
        '''
        return None
    
