from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

class sentimentModel:
    """Data-preparation helpers and model factory for a multichannel text CNN.

    The methods are stateless: each one works only on its arguments, so a
    single instance can be reused for loading, tokenizing, encoding and
    building the network.
    """

    # Kernel widths of the parallel convolutional channels, in build order.
    _KERNEL_SIZES = (4, 6, 8)

    def __init__(self):
        # Store the name on the instance (a plain local would be discarded
        # as soon as __init__ returns).
        self.name = 'sentimentModel'

    def load_dataset(self, filename):
        """Return the object unpickled from ``filename``.

        Args:
            filename: Path of a pickle file, opened in binary mode.

        Returns:
            Whatever object was pickled into the file.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, 'rb') as handle:
            return load(handle)

    def create_tokenizer(self, lines):
        """Return a Keras ``Tokenizer`` fitted on ``lines``.

        Args:
            lines: Texts passed to ``Tokenizer.fit_on_texts``.
        """
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    def max_length(self, lines):
        """Return the largest whitespace-separated token count in ``lines``.

        Args:
            lines: Iterable of strings.

        Returns:
            The maximum of ``len(line.split())`` over all lines, or 0 when
            ``lines`` is empty.
        """
        return max((len(line.split()) for line in lines), default=0)

    def encode_text(self, tokenizer, lines, length):
        """Integer-encode ``lines`` and pad every sequence to ``length``.

        Args:
            tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
            lines: Texts to encode.
            length: Value passed as ``maxlen`` to ``pad_sequences``.

        Returns:
            The result of ``pad_sequences`` with ``padding='post'``.
        """
        encoded = tokenizer.texts_to_sequences(lines)
        return pad_sequences(encoded, maxlen=length, padding='post')

    def _build_channel(self, length, vocab_size, kernel_size):
        """Build one embedding/convolution channel; return (input, flat output)."""
        inputs = Input(shape=(length,))
        embedding = Embedding(vocab_size, 100)(inputs)
        conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
        drop = Dropout(0.5)(conv)
        pool = MaxPooling1D(pool_size=2)(drop)
        flat = Flatten()(pool)
        return inputs, flat

    def define_model(self, length, vocab_size):
        """Build and compile the three-channel convolutional classifier.

        Each channel is Input -> Embedding(vocab_size, 100) -> Conv1D with 32
        filters -> Dropout(0.5) -> MaxPooling1D(2) -> Flatten; the channels
        differ only in kernel size (4, 6 and 8). Their outputs are
        concatenated and fed through Dense(10, relu) and Dense(1, sigmoid).

        Args:
            length: Number of tokens per input sequence.
            vocab_size: Input dimension of each embedding layer.

        Returns:
            A model taking three inputs (one per channel), compiled with
            binary cross-entropy loss, the Adam optimizer and the accuracy
            metric. The model summary is printed as a side effect.
        """
        # NOTE(review): presumably ``length`` must be at least the largest
        # kernel size (8) for the unpadded convolutions to be valid - confirm.
        # Channels are built one after another so layers are created in the
        # same order as a hand-unrolled version.
        channels = [
            self._build_channel(length, vocab_size, kernel_size)
            for kernel_size in self._KERNEL_SIZES
        ]
        channel_inputs = [channel_input for channel_input, _ in channels]
        channel_outputs = [channel_output for _, channel_output in channels]

        # merge
        merged = concatenate(channel_outputs)
        # interpretation
        dense1 = Dense(10, activation='relu')(merged)
        outputs = Dense(1, activation='sigmoid')(dense1)
        model = Model(inputs=channel_inputs, outputs=outputs)

        # compile
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print().
        model.summary()
        # To render the architecture to an image, call:
        #   plot_model(model, show_shapes=True, to_file='multichannel.png')
        return model

if __name__ == '__main__':
    # One helper instance serves every step; its methods keep no state.
    helper = sentimentModel()

    # Read the pickled training set, unpacked as texts and labels.
    train_lines, train_labels = helper.load_dataset('train.pkl')

    # Fit the tokenizer and report how many documents it saw.
    tokenizer = helper.create_tokenizer(train_lines)
    document_count = tokenizer.document_count
    print("Tokenize " + str(document_count))

    # Vocabulary size is the word-index size plus one; length is the longest
    # document measured in whitespace-separated tokens.
    vocab_size = len(tokenizer.word_index) + 1
    length = helper.max_length(train_lines)
    print('Max document length: %d' % length)
    print('Vocabulary size: %d' % vocab_size)

    # Integer-encode and pad the training texts.
    train_x = helper.encode_text(tokenizer, train_lines, length)
    print(train_x.shape)

    # Build the network, feeding the same encoded matrix to all three inputs.
    model = helper.define_model(length, vocab_size)
    channel_inputs = [train_x, train_x, train_x]
    targets = array(train_labels)
    model.fit(channel_inputs, targets, epochs=10, batch_size=16)

    # Persist the trained model.
    model.save('model.h5')
