Commit 0a4b1020 authored by: W wuzewu

add senta demo

Parent fd34ef4f
# create_module.py
import io

import paddle.fluid as fluid
import paddle_hub as hub

import nets
import processor
def load_vocab(file_path):
    """
    Load the given vocabulary into a word -> id dict.
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            parts = line.split('\t')
            vocab[parts[0]] = int(parts[1])
    # reserve the last id for out-of-vocabulary words
    vocab["<unk>"] = len(vocab)
    return vocab
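# The vocabulary file is expected to hold one tab-separated "word<TAB>id"
# pair per line. Hypothetical example contents:
#
#   的      0
#   电影    1
#   很      2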
def create_module():
    network = nets.bilstm_net
    # word sequence data
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    word_dict_path = "./resources/train.vocab"
    word_dict = load_vocab(word_dict_path)
    cost, acc, pred = network(data, label, len(word_dict) + 1)

    # load the pretrained senta parameters into the executor's scope
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    model_path = "./resources/senta_model"
    fluid.io.load_inference_model(model_path, exe)

    # assets are bundled into the module directory alongside the model
    assets = [word_dict_path]

    # create a module with a single signature
    sign = hub.create_signature(
        name="sentiment_classify", inputs=[data], outputs=[pred])
    hub.create_module(
        sign_arr=[sign],
        module_dir="hub_module_senta",
        exe=exe,
        module_info="resources/module_info.yml",
        processor=processor.Processor,
        assets=assets)


if __name__ == "__main__":
    create_module()
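For reference, the generated module can also be used from Python rather than through the hub command line below. A minimal, hypothetical sketch: it assumes that, as with the lac module's segment call in processor.py, each registered signature is exposed as a method on the loaded Module object.

import paddle_hub as hub

# load the module directory produced by create_module()
senta = hub.Module(module_dir="hub_module_senta")
# "sentiment_classify" is the signature registered above
results = senta.sentiment_classify(data={"text": ["这部电影真的很赞"]})
print(results)  # expected: a list of dicts with sentiment_label / sentiment_key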
#!/bin/bash
set -o nounset
set -o errexit

# build the senta module, then smoke-test it via the hub command line
script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

python create_module.py
python ../../paddle_hub/commands/hub.py run hub_module_senta/ --signature sentiment_classify --config resources/test/test.yml --dataset resources/test/test.csv
# nets.py
import paddle.fluid as fluid


def bilstm_net(data,
               label,
               dict_dim,
               emb_dim=128,
               hid_dim=128,
               hid_dim2=96,
               class_dim=2,
               emb_lr=30.0):
    """
    Bi-LSTM net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))

    # bi-lstm layer: project the embedding, then run a forward and a
    # reverse dynamic_lstm over the sequence
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    rlstm_h, c = fluid.layers.dynamic_lstm(
        input=rfc0, size=hid_dim * 4, is_reverse=True)

    # extract the last step of each direction
    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
    # note: the tanh-activated copies are computed but not used below
    lstm_last_tanh = fluid.layers.tanh(lstm_last)
    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)

    # concat layer
    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)

    # fully connected layer
    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')

    # softmax output layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
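bilstm_net only declares the graph, so shape mistakes surface only at run time. A quick smoke test under assumed values (dict_dim=100 and the token ids are arbitrary; parameters are randomly initialized here rather than loaded from the pretrained senta model):

import numpy as np
import paddle.fluid as fluid

import nets

data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost, acc, prediction = nets.bilstm_net(data, label, dict_dim=100)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())  # random-initialize parameters

# one sequence of three token ids, wrapped as a LoDTensor
words = fluid.create_lod_tensor(
    np.array([[1], [2], [3]], dtype="int64"), [[3]], place)
fetched = exe.run(
    feed={"words": words, "label": np.array([[1]], dtype="int64")},
    fetch_list=[prediction])
print(fetched[0].shape)  # (1, class_dim), i.e. (1, 2)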
# processor.py
import io
import os

import paddle.fluid as fluid
import paddle_hub as hub
from paddle_hub import BaseProcessor
from paddle_hub.module.manager import default_module_manager
def load_vocab(file_path):
    """
    Load the given vocabulary into a word -> id dict.
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            parts = line.split('\t')
            vocab[parts[0]] = int(parts[1])
    vocab["<unk>"] = len(vocab)
    return vocab
def get_predict_label(pos_prob):
    neg_prob = 1 - pos_prob
    # the neutral threshold should fall in (0.5, 1)
    neu_threshold = 0.55
    if neg_prob > neu_threshold:
        label, key = 0, "负面"  # negative
    elif pos_prob > neu_threshold:
        label, key = 2, "正面"  # positive
    else:
        label, key = 1, "中性"  # neutral
    return label, key
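# Worked examples (hypothetical probabilities): pos_prob = 0.9 yields
# (2, "正面"/positive), pos_prob = 0.1 yields (0, "负面"/negative), and
# pos_prob = 0.5 yields (1, "中性"/neutral), since then neither class
# probability exceeds the 0.55 threshold.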
class Processor(BaseProcessor):
    def __init__(self, module):
        self.module = module
        assets_path = self.module.helper.assets_path()
        word_dict_path = os.path.join(assets_path, "train.vocab")
        self.word_dict = load_vocab(word_dict_path)
        # the lac module handles Chinese word segmentation; install it on demand
        path = default_module_manager.search_module("lac")
        if path:
            self.lac = hub.Module(module_dir=path)
        else:
            result, _, path = default_module_manager.install_module("lac")
            assert path, "can't find the required module lac"
            self.lac = hub.Module(module_dir=path)

    def preprocess(self, sign_name, data_dict):
        result = {'text': []}
        processed = self.lac.segment(data=data_dict)
        unk_id = len(self.word_dict)
        for index, data in enumerate(processed):
            result_i = {'processed': []}
            result_i['origin'] = data_dict['text'][index]
            # map each segmented word to its vocabulary id
            for result_dict in data:
                if result_dict['word'] in self.word_dict:
                    _index = self.word_dict[result_dict['word']]
                else:
                    _index = unk_id
                result_i['processed'].append(_index)
            result['text'].append(result_i)
        return result
    def postprocess(self, sign_name, data_out, data_info, **kwargs):
        if sign_name == "sentiment_classify":
            result = []
            pred = fluid.executor.as_numpy(data_out)
            for index in range(len(data_info['text'])):
                result_i = {}
                result_i['text'] = data_info['text'][index]['origin']
                # pred[0][index, 1] is the positive-class probability
                label, key = get_predict_label(pred[0][index, 1])
                result_i['sentiment_label'] = label
                result_i['sentiment_key'] = key
                result.append(result_i)
            return result

    def data_format(self, sign_name):
        if sign_name == "sentiment_classify":
            return {
                "text": {
                    "type": hub.DataType.TEXT,
                    "feed_key": self.module.signatures[sign_name].inputs[0].name
                }
            }
        return None
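To make the Processor contract concrete, here is a hypothetical trace of one sentence through the two hooks (the token ids and the lac output fields are illustrative, not real values):

# data_dict (user input):  {"text": ["这部电影真的很赞"]}
# lac segment output:      [[{"word": "这部", ...}, {"word": "电影", ...}, ...]]
# preprocess() returns:    {"text": [{"origin": "这部电影真的很赞",
#                                     "processed": [17, 42, 7, 3]}]}
# postprocess() returns:   [{"text": "这部电影真的很赞",
#                            "sentiment_label": 2, "sentiment_key": "正面"}]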
#!/bin/bash
set -o nounset
set -o errexit

script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

# fetch the pretrained senta model and its vocabulary
wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/senta.tar.gz
wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/train.vocab
tar xvzf senta.tar.gz
rm senta.tar.gz
# resources/module_info.yml
name: senta
type: nlp/sentiment_analysis
author: paddlepaddle
author-email: paddle-dev@baidu.com
version: 1.0.0
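The demo also ships the test dataset and run config that the hub run command above points at: resources/test/test.csv and resources/test/test.yml. The CSV's header row names the input column, and the config binds that column to the module's text input.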
TEXT_INPUT
这部电影真的很赞
售后太差!
# resources/test/test.yml
input_data:
    text:
        type: TEXT
        key: TEXT_INPUT
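Read together, the config tells hub run to take each value in the CSV's TEXT_INPUT column and feed it as the text input of the sentiment_classify signature. A rough, hypothetical sketch of the equivalent dict construction:

import csv

# approximate what `hub run --dataset ... --config ...` feeds to the module
with open("resources/test/test.csv", encoding="utf8") as f:
    rows = list(csv.DictReader(f))
data = {"text": [row["TEXT_INPUT"] for row in rows]}
# data == {"text": ["这部电影真的很赞", "售后太差!"]}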