# -*- coding: utf-8 -*-

# Word vectors:
#   https://www.cnblogs.com/Darwin2000/p/5786984.html
# Dataset:
#   https://blog.csdn.net/alip39/article/details/95891321
# Reference code:
#   https://blog.csdn.net/u012052268/article/details/90238282
# Attention:
#   https://github.com/philipperemy/keras-attention-mechanism
import re
import os
import keras
import random
import gensim
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from keras import Model
from keras.models import load_model
from keras.layers import BatchNormalization
from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input, multiply
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam
from config import Config
import pickle
import matplotlib.pyplot as plt


# Model persistence: pickle helpers
def load_pkl(filename):
    with open(filename, 'rb') as fr:
        model = pickle.load(fr)
    return model


def save_pkl(model, filename):
    with open(filename, 'wb') as fw:
        pickle.dump(model, fw)
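
# Quick round-trip sketch for the two helpers above ("/tmp/demo.pkl" is a
# hypothetical path, used only for illustration):
#   save_pkl([1, 2, 3], "/tmp/demo.pkl")
#   assert load_pkl("/tmp/demo.pkl") == [1, 2, 3]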


## Train your own word vectors and save the model.
def trainWord2Vec(infile, outfile):
    sentences = gensim.models.word2vec.LineSentence(infile)  # read the pre-tokenized text (one space-separated sentence per line)
    # Train the model. Note: size=100 here, while the pre-trained model loaded
    # below is 60-dimensional; match size to Config.EMBEDDING_DIM if you retrain.
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)
    model.save(outfile)


def loadMyWord2Vec(outfile):
    # load the pre-trained word vectors
    Word2VecModel = gensim.models.Word2Vec.load(outfile)
    return Word2VecModel
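
# A minimal sketch of how the two helpers above fit together, assuming a tiny
# whitespace-tokenized corpus; "demo_corpus.txt" and "demo.model" are
# placeholder paths, not files shipped with this project:
#
#   with open("demo_corpus.txt", "w", encoding="utf-8") as f:
#       f.write("空间 很 大\n空间 不 小\n动力 很 足\n")
#   trainWord2Vec("demo_corpus.txt", "demo.model")
#   model = loadMyWord2Vec("demo.model")
#   print(model.wv.most_similar("空间", topn=2))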


def load_embeding():
    # corpus for training word vectors (space-separated tokens)
    infile = "./CarCommentAll_cut.csv"
    outfile = "/opt/data/开源词向量/gensim_word2vec_60/Word60.model"
    # trainWord2Vec(infile, outfile)
    # load the word vectors
    Word2VecModel = loadMyWord2Vec(outfile)

    print("Vector for '空间' (60-dim):", Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间'])
    print("Top-5 words most similar to '空间':", Word2VecModel.wv.most_similar('空间', topn=5))

    ## 2. Build a list of all words, plus the word-to-index dict and the embedding matrix.
    vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]  # all words in the vocabulary

    word_index = {" ": 0}  # initialize the `[word : token]` dict; the corpus is tokenized with it later
    word_vector = {}  # initialize the `[word : vector]` dict

    # Initialize the big matrix holding all the vectors. Note the extra first row
    # of all zeros, used for padding.
    # rows = vocabulary size + 1 (e.g. 10000 + 1); columns = embedding dimension (e.g. 60)
    embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))

    ## 3. Fill the dict and matrix above.
    for i in range(len(vocab_list)):
        # print(i)
        word = vocab_list[i]  # each word
        word_index[word] = i + 1  # word -> index
        word_vector[word] = Word2VecModel.wv[word]  # word -> vector
        embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # embedding matrix row
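
    # Sanity checks: row 0 stays all-zero (the padding row), and every other
    # row mirrors the gensim vector stored at its word_index position.
    assert not embeddings_matrix[0].any()
    assert (embeddings_matrix[word_index[vocab_list[-1]]] == Word2VecModel.wv[vocab_list[-1]]).all()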
    print("Finished loading word vectors..")
    return vocab_list, word_index, embeddings_matrix


def plot_history(history):
    history_dict = history.history
    print(history_dict.keys())
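    # Note: older standalone Keras logs these metrics as 'acc'/'val_acc';
    # newer versions use 'accuracy'/'val_accuracy' as read below.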
    acc = history_dict['accuracy']
    val_acc = history_dict['val_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epochs = range(1, len(acc) + 1)
    # 'bo' plots blue dots
    plt.plot(epochs, loss, 'bo', label='Training loss')
    # 'b' plots a solid blue line
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('Emotion_loss.png')
    # plt.show()

    plt.clf()   # clear the figure

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig('Emotion_acc.png')
    # plt.show()


class EmotionModel(object):
    def __init__(self, config):
        self.model = None
        self.config = config
        self.pre_num = self.config.pre_num
        self.data_file = self.config.data_file
        self.vocab_list = self.config.vocab_list
        self.word_index = self.config.word_index
        self.EMBEDDING_DIM = self.config.EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH

        # If the model file exists, load it directly; otherwise start training.
        if os.path.exists(self.config.model_file):
            self.model = load_model(self.config.model_file)
            self.model.summary()
        else:
            self.train()

    def build_model(self, embeddings_matrix):
        ## 4. Use the pre-trained word vectors in a Keras Embedding layer.
        embedding_layer = Embedding(
            input_dim = len(embeddings_matrix), # vocabulary size (including the padding row)
            output_dim = self.EMBEDDING_DIM, # embedding dimension (60)
            weights = [embeddings_matrix], # key point: the pre-trained embedding weights
            input_length = self.MAX_SEQUENCE_LENGTH, # maximum sentence length (inputs must be padded to this)
            trainable = False # whether to update the embeddings during training
        )
        # If you do not load external vectors, the layer can train its own:
        # omitting weights=[embeddings_matrix] makes the Embedding layer learn vectors from scratch.
        # embedding_layer = Embedding(
        #     input_dim = len(word_index) + 1, # no pre-trained matrix here, so +1 for the padding row
        #     output_dim = EMBEDDING_DIM, # embedding dimension
        #     input_length = MAX_SEQUENCE_LENGTH # maximum sentence length
        # )
        print("开始训练模型.....")
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')  # input tensor of shape (batch_size, MAX_SEQUENCE_LENGTH)
        embedded_sequences = embedding_layer(sequence_input)  # shape (batch_size, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
        # Add attention: a Dense softmax layer produces weights that are multiplied
        # element-wise with the input (unlike a plain fully-connected layer, its
        # output re-weights the input itself).
        attention_probs = Dense(self.EMBEDDING_DIM, activation='softmax', name='attention_probs')(embedded_sequences)
        attention_mul = multiply([embedded_sequences, attention_probs], name='attention_mul')
        x = Bidirectional(GRU(self.EMBEDDING_DIM, return_sequences=True, dropout=0.5))(attention_mul)
        x = Dropout(0.5)(x)
        x = Flatten()(x)
        # x = BatchNormalization()(x)
        preds = Dense(self.pre_num, activation='softmax')(x)
        self.model = Model(sequence_input, preds)
        # configure the optimizer
        optimizer = Adam(lr=self.config.learning_rate, beta_1=0.95, beta_2=0.999, epsilon=1e-08)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.summary()
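
        # Standalone sketch of the attention block above, with hypothetical
        # shapes (assuming MAX_SEQUENCE_LENGTH=100, EMBEDDING_DIM=60):
        #   inp   = Input(shape=(100, 60))                 # (batch, 100, 60)
        #   probs = Dense(60, activation='softmax')(inp)   # (batch, 100, 60), softmax over the last axis
        #   att   = multiply([inp, probs])                 # element-wise re-weighting of inp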

    def load_word2jieba(self):
        vocab_list = load_pkl(self.vocab_list)
        if vocab_list != []:
            print("Total words loaded: ", len(vocab_list))
            for word in vocab_list:
                jieba.add_word(word)

    def predict(self, line):
        '''Predict the sentiment of a single line of text.'''
        word_index = load_pkl(self.word_index)
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        indexs = [word_index.get(word, 0) for word in words]
        x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH)
        res = self.model.predict(x_pred, verbose=0)[0]
        return res
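
        # Example (assumes a trained model; the input text and scores are
        # illustrative only):
        #   res = model.predict("空间很大,我很满意")
        #   # res ≈ array([0.05, 0.15, 0.80]) -> np.argmax(res) == 2 (positive)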

    def load_data(self, word_index, vocab_list, test_size=0.25):
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        if vocab_list != []:
            for word in vocab_list:
                jieba.add_word(word)

        def func(line):
            # split each text into a list of word indices: ['w1 w2 w3', ...] -> [[i1, i2, i3], ...]
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            indexs = [word_index.get(word, 0) for word in words]
            return indexs
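
        # Expected spreadsheet layout (inferred from the column accesses below;
        # these example rows are made-up placeholders):
        #
        #   comment           | label
        #   空间很大,很满意   | 2
        #   配置一般          | 1
        #   动力太肉了        | 0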

        df = pd.read_excel(self.data_file, header=0)  # error_bad_lines/encoding are read_csv-only args, not accepted by read_excel
        x = df["comment"].apply(func).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # One-hot encode the labels (natural numbers 0, 1, 2, ...), e.g.:
        """
        In [7]: to_categorical(np.asarray([1,1,0,1,3]))
        Out[7]:
        array([[0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.]], dtype=float32)
        """
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test)

    def train(self):
        '''Train the model.'''
        vocab_list, word_index, embeddings_matrix = load_embeding()
        save_pkl(vocab_list, self.vocab_list)
        save_pkl(word_index, self.word_index)
        (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list)
        print("---------")
        print(x_train[:3], "\n", y_train[:3])
        print("\n")
        print(x_test[:3], "\n", y_test[:3])
        print("---------")
        self.build_model(embeddings_matrix)

        # Plot the loss and accuracy curves (accuracy = correct predictions / total predictions).
        history = self.model.fit(x_train, y_train, batch_size=60, epochs=40, validation_split=0.2, verbose=0)
        plot_history(history)

        # self.model.fit(x_train, y_train, batch_size=60, epochs=40)
        self.model.evaluate(x_test, y_test, verbose=2)
        self.model.save(self.config.model_file)


if __name__ == '__main__':
    # Test loading the external word2vec vectors:
    # vocab_list, word_index, embeddings_matrix = load_embeding()
    model = EmotionModel(Config)
    status = False
    while True:
        text = input("text:")
        if text in ["exit", "quit"]:
            break
        # load the jieba user dictionary on first run
        if not status:
            model.load_word2jieba()
            status = True
        res = model.predict(text)
        label_dic = {0: "negative", 1: "neutral", 2: "positive"}
        print(res, " : ", label_dic[np.argmax(res)])