# -*- coding: utf-8 -*-

# Word vectors:
#   https://www.cnblogs.com/Darwin2000/p/5786984.html
# Dataset:
#   https://blog.csdn.net/alip39/article/details/95891321
# Reference code:
#   https://blog.csdn.net/u012052268/article/details/90238282
# Attention:
#   https://github.com/philipperemy/keras-attention-mechanism
import re
import os
import keras
import random
import gensim
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from keras import Model
from keras.models import load_model
from keras.layers import BatchNormalization
from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input, multiply
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam
from config import Config
import pickle
import matplotlib.pyplot as plt


# Model persistence: pickle helpers
def load_pkl(filename):
    with open(filename, 'rb') as fr:
        model = pickle.load(fr)
    return model


def save_pkl(model, filename):
    with open(filename, 'wb') as fw:
        pickle.dump(model, fw)
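
# Quick round-trip sketch for the two helpers above ("/tmp/demo.pkl" is a
# hypothetical path, used only for illustration):
#   save_pkl([1, 2, 3], "/tmp/demo.pkl")
#   assert load_pkl("/tmp/demo.pkl") == [1, 2, 3]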


## Train your own word vectors and save the model.
def trainWord2Vec(infile, outfile):
    sentences = gensim.models.word2vec.LineSentence(infile)  # read the pre-tokenized text (one space-separated sentence per line)
    # Train the model. Note: size=100 here, while the pre-trained model loaded
    # below is 60-dimensional; match size to Config.EMBEDDING_DIM if you retrain.
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)
    model.save(outfile)


def loadMyWord2Vec(outfile):
    # load the pre-trained word vectors
    Word2VecModel = gensim.models.Word2Vec.load(outfile)
    return Word2VecModel
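
# A minimal sketch of how the two helpers above fit together, assuming a tiny
# whitespace-tokenized corpus; "demo_corpus.txt" and "demo.model" are
# placeholder paths, not files shipped with this project:
#
#   with open("demo_corpus.txt", "w", encoding="utf-8") as f:
#       f.write("空间 很 大\n空间 不 小\n动力 很 足\n")
#   trainWord2Vec("demo_corpus.txt", "demo.model")
#   model = loadMyWord2Vec("demo.model")
#   print(model.wv.most_similar("空间", topn=2))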


def load_embeding():
    # corpus for training word vectors (space-separated tokens)
    infile = "./CarCommentAll_cut.csv"
    outfile = "/opt/data/开源词向量/gensim_word2vec_60/Word60.model"
    # trainWord2Vec(infile, outfile)
    # load the word vectors
    Word2VecModel = loadMyWord2Vec(outfile)

    print("Vector for '空间' (60-dim):", Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间'])
    print("Top-5 words most similar to '空间':", Word2VecModel.wv.most_similar('空间', topn=5))

    ## 2. Build a list of all words, plus the word-to-index dict and the embedding matrix.
    vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]  # all words in the vocabulary

    word_index = {" ": 0}  # initialize the `[word : token]` dict; the corpus is tokenized with it later
    word_vector = {}  # initialize the `[word : vector]` dict

    # Initialize the big matrix holding all the vectors. Note the extra first row
    # of all zeros, used for padding.
    # rows = vocabulary size + 1 (e.g. 10000 + 1); columns = embedding dimension (e.g. 60)
    embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))

    ## 3. Fill the dict and matrix above.
    for i in range(len(vocab_list)):
        # print(i)
        word = vocab_list[i]  # each word
        word_index[word] = i + 1  # word -> index
        word_vector[word] = Word2VecModel.wv[word]  # word -> vector
        embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # embedding matrix row
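
    # Sanity checks: row 0 stays all-zero (the padding row), and every other
    # row mirrors the gensim vector stored at its word_index position.
    assert not embeddings_matrix[0].any()
    assert (embeddings_matrix[word_index[vocab_list[-1]]] == Word2VecModel.wv[vocab_list[-1]]).all()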
    print("Finished loading word vectors..")
    return vocab_list, word_index, embeddings_matrix


def plot_history(history):
    history_dict = history.history
    print(history_dict.keys())
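    # Note: older standalone Keras logs these metrics as 'acc'/'val_acc';
    # newer versions use 'accuracy'/'val_accuracy' as read below.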
    acc = history_dict['accuracy']
    val_acc = history_dict['val_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epochs = range(1, len(acc) + 1)
    # 'bo' plots blue dots
    plt.plot(epochs, loss, 'bo', label='Training loss')
    # 'b' plots a solid blue line
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('Emotion_loss.png')
    # plt.show()

    plt.clf()   # clear the figure

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig('Emotion_acc.png')
    # plt.show()


class EmotionModel(object):
    def __init__(self, config):
        self.model = None
        self.config = config
        self.pre_num = self.config.pre_num
        self.data_file = self.config.data_file
        self.vocab_list = self.config.vocab_list
        self.word_index = self.config.word_index
        self.EMBEDDING_DIM = self.config.EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH

        # If the model file exists, load it directly; otherwise start training.
        if os.path.exists(self.config.model_file):
            self.model = load_model(self.config.model_file)
            self.model.summary()
        else:
            self.train()

    def build_model(self, embeddings_matrix):
        ## 4. Use the pre-trained word vectors in a Keras Embedding layer.
        embedding_layer = Embedding(
            input_dim = len(embeddings_matrix), # vocabulary size (including the padding row)
            output_dim = self.EMBEDDING_DIM, # embedding dimension (60)
            weights = [embeddings_matrix], # key point: the pre-trained embedding weights
            input_length = self.MAX_SEQUENCE_LENGTH, # maximum sentence length (inputs must be padded to this)
            trainable = False # whether to update the embeddings during training
        )
        # If you do not load external vectors, the layer can train its own:
        # omitting weights=[embeddings_matrix] makes the Embedding layer learn vectors from scratch.
        # embedding_layer = Embedding(
        #     input_dim = len(word_index) + 1, # no pre-trained matrix here, so +1 for the padding row
        #     output_dim = EMBEDDING_DIM, # embedding dimension
        #     input_length = MAX_SEQUENCE_LENGTH # maximum sentence length
        # )
        print("开始训练模型.....")
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')  # input tensor of shape (batch_size, MAX_SEQUENCE_LENGTH)
        embedded_sequences = embedding_layer(sequence_input)  # shape (batch_size, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
        # Add attention: a Dense softmax layer produces weights that are multiplied
        # element-wise with the input (unlike a plain fully-connected layer, its
        # output re-weights the input itself).
        attention_probs = Dense(self.EMBEDDING_DIM, activation='softmax', name='attention_probs')(embedded_sequences)
        attention_mul = multiply([embedded_sequences, attention_probs], name='attention_mul')
        x = Bidirectional(GRU(self.EMBEDDING_DIM, return_sequences=True, dropout=0.5))(attention_mul)
        x = Dropout(0.5)(x)
        x = Flatten()(x)
        # x = BatchNormalization()(x)
        preds = Dense(self.pre_num, activation='softmax')(x)
        self.model = Model(sequence_input, preds)
        # configure the optimizer
        optimizer = Adam(lr=self.config.learning_rate, beta_1=0.95, beta_2=0.999, epsilon=1e-08)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.summary()
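
        # Standalone sketch of the attention block above, with hypothetical
        # shapes (assuming MAX_SEQUENCE_LENGTH=100, EMBEDDING_DIM=60):
        #   inp   = Input(shape=(100, 60))                 # (batch, 100, 60)
        #   probs = Dense(60, activation='softmax')(inp)   # (batch, 100, 60), softmax over the last axis
        #   att   = multiply([inp, probs])                 # element-wise re-weighting of inp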

    def load_word2jieba(self):
        vocab_list = load_pkl(self.vocab_list)
        if vocab_list != []:
            print("Total words loaded: ", len(vocab_list))
            for word in vocab_list:
                jieba.add_word(word)

    def predict(self, line):
        '''Predict the sentiment of a single line of text.'''
        word_index = load_pkl(self.word_index)
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        indexs = [word_index.get(word, 0) for word in words]
        x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH)
        res = self.model.predict(x_pred, verbose=0)[0]
        return res
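
        # Example (assumes a trained model; the input text and scores are
        # illustrative only):
        #   res = model.predict("空间很大,我很满意")
        #   # res ≈ array([0.05, 0.15, 0.80]) -> np.argmax(res) == 2 (positive)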

    def load_data(self, word_index, vocab_list, test_size=0.25):
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        if vocab_list != []:
            for word in vocab_list:
                jieba.add_word(word)

        def func(line):
            # split each text into a list of word indices: ['w1 w2 w3', ...] -> [[i1, i2, i3], ...]
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            indexs = [word_index.get(word, 0) for word in words]
            return indexs
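
        # Expected spreadsheet layout (inferred from the column accesses below;
        # these example rows are made-up placeholders):
        #
        #   comment           | label
        #   空间很大,很满意   | 2
        #   配置一般          | 1
        #   动力太肉了        | 0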

        df = pd.read_excel(self.data_file, header=0)  # error_bad_lines/encoding are read_csv-only args, not accepted by read_excel
        x = df["comment"].apply(func).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # One-hot encode the labels (natural numbers 0, 1, 2, ...), e.g.:
        """
        In [7]: to_categorical(np.asarray([1,1,0,1,3]))
        Out[7]:
        array([[0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.]], dtype=float32)
        """
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test)

    def train(self):
        '''Train the model.'''
        vocab_list, word_index, embeddings_matrix = load_embeding()
        save_pkl(vocab_list, self.vocab_list)
        save_pkl(word_index, self.word_index)
        (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list)
        print("---------")
        print(x_train[:3], "\n", y_train[:3])
        print("\n")
        print(x_test[:3], "\n", y_test[:3])
        print("---------")
        self.build_model(embeddings_matrix)

        # Plot the loss and accuracy curves (accuracy = correct predictions / total predictions).
        history = self.model.fit(x_train, y_train, batch_size=60, epochs=40, validation_split=0.2, verbose=0)
        plot_history(history)

        # self.model.fit(x_train, y_train, batch_size=60, epochs=40)
        self.model.evaluate(x_test, y_test, verbose=2)
        self.model.save(self.config.model_file)


if __name__ == '__main__':
    # Test loading the external word2vec vectors:
    # vocab_list, word_index, embeddings_matrix = load_embeding()
    model = EmotionModel(Config)
    status = False
    while True:
        text = input("text:")
        if text in ["exit", "quit"]:
            break
        # load the jieba user dictionary on first run
        if not status:
            model.load_word2jieba()
            status = True
        res = model.predict(text)
        label_dic = {0: "negative", 1: "neutral", 2: "positive"}
        print(res, " : ", label_dic[np.argmax(res)])