Unverified commit c4f55f4c, authored by 片刻小哥哥, committed by GitHub

Merge pull request #560 from jiangzhonglian/master

Add the first version of sentiment classification
@@ -271,6 +271,9 @@
* Natural Language Processing with Python, Second Edition (Chinese translation): <https://usyiyi.github.io/nlp-py-2e-zh>
* Recommended: a comprehensive NLP knowledge base curated by [liuhuanyong](https://github.com/liuhuanyong): <https://liuhuanyong.github.io>
* Open source - collections of word-embedding resources:
* <https://www.cnblogs.com/Darwin2000/p/5786984.html>
* <https://ai.tencent.com/ailab/nlp/embedding.html>
* <https://blog.csdn.net/xiezj007/article/details/85073890>
* <https://github.com/Embedding/Chinese-Word-Vectors>
* <https://github.com/brightmart/nlp_chinese_corpus>
* <https://github.com/codemayq/chinese_chatbot_corpus>
......
@@ -8,7 +8,14 @@
class Config(object):
    poetry_file = 'poetry.txt'
    weight_file = 'poetry_model.h5'
    data_file = 'EmotionData.xlsx'
    model_file = 'EmotionModel.h5'
    vocab_list = 'vocal_list.pkl'
    word_index = 'word_index.pkl'
    # Predict the seventh character from the preceding six (poetry model)
    max_len = 6
    batch_size = 512
    learning_rate = 0.001
    pre_num = 3
    MAX_SEQUENCE_LENGTH = 1000  # truncation length per text/sentence: keep only the first 1000 tokens
    EMBEDDING_DIM = 60  # dimensionality of the word vectors
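
# Illustration (not part of the original config): MAX_SEQUENCE_LENGTH is applied
# downstream via pad_sequences, which left-pads shorter token-id lists with 0 and
# truncates longer ones from the front, so every sample reaches the model at this length:
#   from keras.preprocessing.sequence import pad_sequences
#   pad_sequences([[3, 7, 2], [5, 1, 4, 9, 8, 6]], maxlen=4)
#   # -> [[0, 3, 7, 2], [4, 9, 8, 6]]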
{
"nbformat": 4,
"nbformat_minor": 2,
"metadata": {
"language_info": {
"name": "python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"version": "3.6.3"
},
"orig_nbformat": 2,
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "/Users/jiangzl/.virtualenvs/python3.6/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n from ._conv import register_converters as _register_converters\nUsing TensorFlow backend.\n"
}
],
"source": [
"import sys\n",
"# 加载自定义包(添加:中间件)\n",
"sys.path.append(\"src/py3.x/tensorflow2.x\")\n",
"from text_Emotion import *"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"outfile = \"/opt/data/开源词向量/gensim_word2vec_60/Word60.model\"\n",
"# 加载词向量\n",
"Word2VecModel = loadMyWord2Vec(outfile)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
}
],
"source": [
"embeddings_matrix = load_embeding()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "--: [[ 0. 0. 0. ... 0. 0.\n 0. ]\n [ 3.6153059 2.63272738 -0.98327219 ... 0.03685202 -0.78566265\n 1.06350613]\n [ 0.21444647 2.58100891 0.08306306 ... -0.43973923 -0.2102039\n -1.37015963]\n ...\n [-1.07420349 1.90465117 2.2614491 ... -1.90614116 -0.34697708\n -2.43622112]\n [ 1.53204441 0.60434735 -0.02905927 ... -0.04591536 -0.63762575\n 0.29778937]\n [ 0.20260553 0.03990031 -0.22745971 ... -0.17701624 0.16334218\n 0.06799572]]\n"
}
],
"source": [
"print('--: ', embeddings_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import os\n",
"import keras\n",
"import random\n",
"import gensim\n",
"import numpy as np\n",
"import pandas as pd\n",
"from keras import Model\n",
"from keras.models import load_model\n",
"from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input\n",
"from keras.optimizers import Adam\n",
"# 该目录下的 config.py文件, 数据文件是: poetry.txt\n",
"from config import Config\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
},
{
"ename": "NameError",
"evalue": "name 'load_data' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-18-afd80ed77829>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEmotionModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m'''训练模型'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0membeddings_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_embeding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 125\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'load_data' is not defined"
]
}
],
"source": [
"model = EmotionModel(Config)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>comment</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>1</td>\n <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n </tr>\n <tr>\n <td>1</td>\n <td>1</td>\n <td>商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!</td>\n </tr>\n <tr>\n <td>2</td>\n <td>1</td>\n <td>早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n </tr>\n <tr>\n <td>3</td>\n <td>1</td>\n <td>宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...</td>\n </tr>\n <tr>\n <td>4</td>\n <td>1</td>\n <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n </tr>\n <tr>\n <td>5</td>\n <td>1</td>\n <td>总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象</td>\n </tr>\n <tr>\n <td>6</td>\n <td>1</td>\n <td>价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...</td>\n </tr>\n <tr>\n <td>7</td>\n <td>1</td>\n <td>不错,在同等档次酒店中应该是值得推荐的!</td>\n </tr>\n <tr>\n <td>8</td>\n <td>1</td>\n <td>入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...</td>\n </tr>\n <tr>\n <td>9</td>\n <td>1</td>\n <td>1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。...</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " label comment\n0 1 距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n1 1 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n2 1 早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n3 1 宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...\n4 1 CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n5 1 总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象\n6 1 价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...\n7 1 不错,在同等档次酒店中应该是值得推荐的!\n8 1 入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...\n9 1 1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。..."
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(\"src/py3.x/tensorflow2.x/EmotionData.xlsx\", header=0, error_bad_lines=False, encoding=\"utf_8_sig\")\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = df[\"label\"].tolist()\n",
"y[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def func(line, ngrams=[]):\n",
" # 加入我们的组合词,保证分词的准确性\n",
" \n",
" if ngrams != []:\n",
" for word in ngrams:\n",
" jieba.add_word(\"\".join(word.lower()))\n",
" # # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]]\n",
" words = [word for word in jieba.cut(str(line).lower(), cut_all=False)]\n",
" # print(\">>> \", train)\n",
" return \" \".join(words)\n",
"x = df[\"comment\"].apply(lambda line: func(line))\n"
]
}
]
}
\ No newline at end of file
# -*- coding: utf-8 -*-
# Word vectors:
# https://www.cnblogs.com/Darwin2000/p/5786984.html
# Dataset:
# https://blog.csdn.net/alip39/article/details/95891321
# Reference code:
# https://blog.csdn.net/u012052268/article/details/90238282
import re
import os
import keras
import random
import gensim
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from keras import Model
from keras.models import load_model
from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam
from config import Config
import pickle
# Model persistence helpers (pickle)
def load_pkl(filename):
    with open(filename, 'rb') as fr:
        model = pickle.load(fr)
    return model

def save_pkl(model, filename):
    with open(filename, 'wb') as fw:
        pickle.dump(model, fw)
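
# Round-trip illustration (not in the original file); the file name reuses the
# default from Config.vocab_list:
#   save_pkl(["空间", "物件"], "vocal_list.pkl")
#   assert load_pkl("vocal_list.pkl") == ["空间", "物件"]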
## Train your own word vectors and save them.
def trainWord2Vec(infile, outfile):
    sentences = gensim.models.word2vec.LineSentence(infile)  # read the pre-segmented text, one sentence per line
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)  # train the model (size=100 here; the pre-trained model loaded below is 60-dim)
    model.save(outfile)
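
# LineSentence expects a plain-text file with one sentence per line and tokens
# separated by spaces; a sketch of producing such a file with jieba (the sample
# sentences are placeholders):
#   with open("CarCommentAll_cut.csv", "w", encoding="utf-8") as fw:
#       for line in ["房间很大,床有2M宽", "早餐太差"]:
#           fw.write(" ".join(jieba.cut(line)) + "\n")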
def loadMyWord2Vec(outfile):
    # Load the pre-trained word vectors
    Word2VecModel = gensim.models.Word2Vec.load(outfile)
    return Word2VecModel
def load_embeding():
    # Train word vectors (from space-separated text)
    infile = "./CarCommentAll_cut.csv"
    outfile = "gensim_word2vec_60/Word60.model"
    # trainWord2Vec(infile, outfile)
    # Load the word vectors
    Word2VecModel = loadMyWord2Vec(outfile)
    print('空间的词向量(60 维):', Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间'])
    print('打印与空间最相近的5个词语:', Word2VecModel.wv.most_similar('空间', topn=5))
    ## 2. Build the list of all words, and initialize the word->index dict and the embedding matrix
    vocab_list = [word for word in Word2VecModel.wv.vocab.keys()]  # all words in the vocabulary
    word_index = {" ": 0}  # initialize {word: token id}; this dict is later used to tokenize the corpus
    word_vector = {}  # initialize the {word: vector} dict
    # Initialize the matrix that holds all vectors. Note the extra first row of zeros, used for padding.
    # Rows = vocabulary size + 1 (e.g. 10000 + 1); columns = embedding dimensionality (e.g. 60).
    embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))
    ## 3. Fill the dict and the matrix defined above
    for i in range(len(vocab_list)):
        word = vocab_list[i]  # the i-th word
        word_index[word] = i + 1  # word -> index
        word_vector[word] = Word2VecModel.wv[word]  # word -> vector
        embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # row i+1 holds the vector for vocab_list[i]
    print("加载词向量结束..")
    return vocab_list, word_index, embeddings_matrix
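
# How the three returned objects fit together (illustration only, not in the
# original file): word_index maps a token to a row of embeddings_matrix, and
# row 0 (all zeros) is reserved for padding:
#   vocab_list, word_index, embeddings_matrix = load_embeding()
#   idx = word_index.get("空间", 0)   # unknown words fall back to the padding row 0
#   vec = embeddings_matrix[idx]      # the 60-dim vector used by the Embedding layer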
class EmotionModel(object):
    def __init__(self, config):
        self.model = None
        self.config = config
        self.pre_num = self.config.pre_num
        self.data_file = self.config.data_file
        self.vocab_list = self.config.vocab_list
        self.word_index = self.config.word_index
        self.EMBEDDING_DIM = self.config.EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH
        # Load the model directly if the model file exists; otherwise train from scratch
        if os.path.exists(self.config.model_file):
            self.model = load_model(self.config.model_file)
            self.model.summary()
        else:
            self.train()
    def build_model(self, embeddings_matrix):
        ## 4. Use the pre-trained word vectors in a Keras Embedding layer
        embedding_layer = Embedding(
            input_dim=len(embeddings_matrix),       # vocabulary size (rows of the matrix)
            output_dim=self.EMBEDDING_DIM,          # embedding dimensionality (60)
            weights=[embeddings_matrix],            # key point: the pre-trained embedding weights
            input_length=self.MAX_SEQUENCE_LENGTH,  # max length of each sentence (inputs must be padded)
            trainable=False                         # whether to update the embeddings during training
        )
        # If you don't load external vectors, the layer can learn its own:
        # with Keras's Embedding layer, simply omit weights=[embeddings_matrix]
        # and the vectors are trained from scratch.
        # embedding_layer = Embedding(
        #     input_dim=len(word_index) + 1,      # +1 since there is no pre-trained padding row
        #     output_dim=EMBEDDING_DIM,           # embedding dimensionality
        #     input_length=MAX_SEQUENCE_LENGTH    # max sentence length
        # )
        print("Start training the model.....")
        # Assemble the network
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')  # input tensor of length 1000, i.e. the model input is batch_size x 1000
        embedded_sequences = embedding_layer(sequence_input)  # returns batch_size x 1000 x 60
        x = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
        x = Dropout(0.6)(x)
        x = Flatten()(x)
        preds = Dense(self.pre_num, activation='softmax')(x)
        self.model = Model(sequence_input, preds)
        # Configure the optimizer
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.summary()
    def load_word2jieba(self):
        vocab_list = load_pkl(self.vocab_list)
        if vocab_list != []:
            for word in vocab_list:
                jieba.add_word(word)
    def predict(self, line):
        '''Predict the sentiment distribution of one line of text'''
        word_index = load_pkl(self.word_index)
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        indexs = [word_index.get(word, 0) for word in words]
        x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH)
        res = self.model.predict(x_pred, verbose=0)[0]
        return res
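
    # Usage sketch (illustration only): predict() returns a softmax distribution
    # over the pre_num = 3 classes; argmax gives the label id:
    #   res = model.predict("酒店服务很好,下次还来")   # hypothetical input
    #   label = int(np.argmax(res))  # 0 = negative, 1 = neutral, 2 = positive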
    def load_data(self, word_index, vocab_list, test_size=0.25):
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        if vocab_list != []:
            for word in vocab_list:
                jieba.add_word(word)

        def func(line):
            # Turn one text into its list of token ids, e.g. '1, 2, 3' -> [1, 2, 3]
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            indexs = [word_index.get(word, 0) for word in words]
            return indexs
        df = pd.read_excel(self.data_file, header=0)
        x = df["comment"].apply(lambda line: func(line)).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # One-hot encode the labels (0, 1, 2, ... natural numbers), by size and order:
        """
        In [7]: to_categorical(np.asarray([1,1,0,1,3]))
        Out[7]:
        array([[0., 1., 0., 0.],
               [0., 1., 0., 0.],
               [1., 0., 0., 0.],
               [0., 1., 0., 0.],
               [0., 0., 0., 1.]], dtype=float32)
        """
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test)
    def train(self):
        '''Train the model'''
        vocab_list, word_index, embeddings_matrix = load_embeding()
        save_pkl(vocab_list, self.vocab_list)
        save_pkl(word_index, self.word_index)
        (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list)
        print("---------")
        print(x_train[:3], "\n", y_train[:3])
        print("\n")
        print(x_test[:3], "\n", y_test[:3])
        print("---------")
        self.build_model(embeddings_matrix)
        self.model.fit(x_train, y_train, batch_size=60, epochs=10)
        self.model.evaluate(x_test, y_test, verbose=2)
        self.model.save(self.config.model_file)
if __name__ == '__main__':
    # Test loading the external word2vec vectors
    # vocab_list, word_index, embeddings_matrix = load_embeding()
    model = EmotionModel(Config)
    status = False
    while 1:
        text = input("text:")
        if text in ["exit", "quit"]:
            break
        # Load the jieba vocabulary on first use
        if not status:
            model.load_word2jieba()
            status = True
        res = model.predict(text)
        label_dic = {0: "negative", 1: "neutral", 2: "positive"}
        print(res, " : ", label_dic[np.argmax(res)])