From 0a4b1020021373ee2957754a8e282850ecf7e15f Mon Sep 17 00:00:00 2001
From: wuzewu
Date: Thu, 21 Mar 2019 20:28:24 +0800
Subject: [PATCH] add senta demo

---
 demo/senta/create_module.py          | 56 +++++++++++++++++
 demo/senta/create_module.sh          |  8 +++
 demo/senta/infer.sh                  |  1 +
 demo/senta/nets.py                   | 50 +++++++++++++++
 demo/senta/processor.py              | 91 ++++++++++++++++++++++++++++
 demo/senta/resources/download.sh     | 11 ++++
 demo/senta/resources/module_info.yml |  5 ++
 demo/senta/resources/test/test.csv   |  3 +
 demo/senta/resources/test/test.yml   |  4 ++
 9 files changed, 229 insertions(+)
 create mode 100644 demo/senta/create_module.py
 create mode 100644 demo/senta/create_module.sh
 create mode 100644 demo/senta/infer.sh
 create mode 100644 demo/senta/nets.py
 create mode 100644 demo/senta/processor.py
 create mode 100644 demo/senta/resources/download.sh
 create mode 100644 demo/senta/resources/module_info.yml
 create mode 100644 demo/senta/resources/test/test.csv
 create mode 100644 demo/senta/resources/test/test.yml

diff --git a/demo/senta/create_module.py b/demo/senta/create_module.py
new file mode 100644
index 00000000..c3c2f97b
--- /dev/null
+++ b/demo/senta/create_module.py
@@ -0,0 +1,56 @@
+import io
+import paddle.fluid as fluid
+import processor
+import numpy as np
+import nets
+import paddle_hub as hub
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        wid = 0
+        for line in f:
+            line = line.rstrip()
+            parts = line.split('\t')
+            vocab[parts[0]] = int(parts[1])
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def create_module():
+    network = nets.bilstm_net
+    # word seq data
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    word_dict_path = "./resources/train.vocab"
+    word_dict = load_vocab(word_dict_path)
+    cost, acc, pred = network(data, label, len(word_dict) + 1)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    model_path = "./resources/senta_model"
+    fluid.io.load_inference_model(model_path, exe)
+
+    # assets
+    assets = [word_dict_path]
+
+    # create a module
+    sign = hub.create_signature(
+        name="sentiment_classify", inputs=[data], outputs=[pred])
+    hub.create_module(
+        sign_arr=[sign],
+        module_dir="hub_module_senta",
+        exe=exe,
+        module_info="resources/module_info.yml",
+        processor=processor.Processor,
+        assets=assets)
+
+
+if __name__ == "__main__":
+    create_module()
diff --git a/demo/senta/create_module.sh b/demo/senta/create_module.sh
new file mode 100644
index 00000000..70bd274c
--- /dev/null
+++ b/demo/senta/create_module.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -o nounset
+set -o errexit
+
+script_path=$(cd `dirname $0`; pwd)
+cd $script_path
+
+python create_module.py
diff --git a/demo/senta/infer.sh b/demo/senta/infer.sh
new file mode 100644
index 00000000..f71ba312
--- /dev/null
+++ b/demo/senta/infer.sh
@@ -0,0 +1 @@
+python ../../paddle_hub/commands/hub.py run hub_module_senta/ --signature sentiment_classify --config resources/test/test.yml --dataset resources/test/test.csv
diff --git a/demo/senta/nets.py b/demo/senta/nets.py
new file mode 100644
index 00000000..1125e5af
--- /dev/null
+++ b/demo/senta/nets.py
@@ -0,0 +1,50 @@
+import paddle.fluid as fluid
+
+
+def bilstm_net(data,
+               label,
+               dict_dim,
+               emb_dim=128,
+               hid_dim=128,
+               hid_dim2=96,
+               class_dim=2,
+               emb_lr=30.0):
+    """
+    Bi-LSTM net
+    """
+    # embedding layer
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+
+    # bi-lstm layer
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+
+    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+
+    rlstm_h, c = fluid.layers.dynamic_lstm(
+        input=rfc0, size=hid_dim * 4, is_reverse=True)
+
+    # extract the last step of each sequence
+    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
+    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
+
+    lstm_last_tanh = fluid.layers.tanh(lstm_last)
+    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)
+
+    # concat layer
+    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)
+
+    # fully connected layer
+    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
+    # softmax layer
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc, prediction
diff --git a/demo/senta/processor.py b/demo/senta/processor.py
new file mode 100644
index 00000000..eb9eca70
--- /dev/null
+++ b/demo/senta/processor.py
@@ -0,0 +1,91 @@
+import paddle
+import paddle.fluid as fluid
+import paddle_hub as hub
+import numpy as np
+import os
+import io
+from paddle_hub import BaseProcessor
+from paddle_hub.hub_server import default_hub_server
+from paddle_hub.module.manager import default_module_manager
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        wid = 0
+        for line in f:
+            line = line.rstrip()
+            parts = line.split('\t')
+            vocab[parts[0]] = int(parts[1])
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def get_predict_label(pos_prob):
+    neg_prob = 1 - pos_prob
+    # the neutral threshold should lie in (0.5, 1)
+    neu_threshold = 0.55
+    if neg_prob > neu_threshold:
+        label, key = 0, "负面"  # negative
+    elif pos_prob > neu_threshold:
+        label, key = 2, "正面"  # positive
+    else:
+        label, key = 1, "中性"  # neutral
+    return label, key
+
+
+class Processor(BaseProcessor):
+    def __init__(self, module):
+        self.module = module
+        assets_path = self.module.helper.assets_path()
+        word_dict_path = os.path.join(assets_path, "train.vocab")
+        self.word_dict = load_vocab(word_dict_path)
+        path = default_module_manager.search_module("lac")
+        if path:
+            self.lac = hub.Module(module_dir=path)
+        else:
+            result, _, path = default_module_manager.install_module("lac")
+            assert path, "can't find the necessary module: lac"
+            self.lac = hub.Module(module_dir=path)
+
+    def preprocess(self, sign_name, data_dict):
+        result = {'text': []}
+        processed = self.lac.segment(data=data_dict)
+        unk_id = len(self.word_dict)
+        for index, data in enumerate(processed):
+            result_i = {'processed': []}
+            result_i['origin'] = data_dict['text'][index]
+            for result_dict in data:
+                if result_dict['word'] in self.word_dict:
+                    _index = self.word_dict[result_dict['word']]
+                else:
+                    _index = unk_id
+                result_i['processed'].append(_index)
+            result['text'].append(result_i)
+        return result
+
+    def postprocess(self, sign_name, data_out, data_info, **kwargs):
+        if sign_name == "sentiment_classify":
+            result = []
+            pred = fluid.executor.as_numpy(data_out)
+            for index in range(len(data_info['text'])):
+                result_i = {}
+                result_i['text'] = data_info['text'][index]['origin']
+                label, key = get_predict_label(pred[0][index, 1])
+                result_i['sentiment_label'] = label
+                result_i['sentiment_key'] = key
+                result.append(result_i)
+            return result
+
+    def data_format(self, sign_name):
+        if sign_name == "sentiment_classify":
+            return {
+                "text": {
+                    "type": hub.DataType.TEXT,
+                    "feed_key": self.module.signatures[sign_name].inputs[0].name
+                }
+            }
+        return None
diff --git a/demo/senta/resources/download.sh b/demo/senta/resources/download.sh
new file mode 100644
index 00000000..79c39404
--- /dev/null
+++ b/demo/senta/resources/download.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -o nounset
+set -o errexit
+
+script_path=$(cd `dirname $0`; pwd)
+cd $script_path
+
+wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/senta.tar.gz
+wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/train.vocab
+tar xvzf senta.tar.gz
+rm senta.tar.gz
diff --git a/demo/senta/resources/module_info.yml b/demo/senta/resources/module_info.yml
new file mode 100644
index 00000000..df6d4fe8
--- /dev/null
+++ b/demo/senta/resources/module_info.yml
@@ -0,0 +1,5 @@
+name: senta
+type: nlp/sentiment_analysis
+author: paddlepaddle
+author-email: paddle-dev@baidu.com
+version: 1.0.0
diff --git a/demo/senta/resources/test/test.csv b/demo/senta/resources/test/test.csv
new file mode 100644
index 00000000..f5e4263a
--- /dev/null
+++ b/demo/senta/resources/test/test.csv
@@ -0,0 +1,3 @@
+TEXT_INPUT
+这部电影真的很赞
+售后太差!
diff --git a/demo/senta/resources/test/test.yml b/demo/senta/resources/test/test.yml
new file mode 100644
index 00000000..51314996
--- /dev/null
+++ b/demo/senta/resources/test/test.yml
@@ -0,0 +1,4 @@
+input_data:
+  text:
+    type : TEXT
+    key : TEXT_INPUT
--
GitLab
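
Usage note: the patch does not spell out a run order, but based on the scripts it adds, a plausible end-to-end sequence is the one below. It assumes senta.tar.gz unpacks into demo/senta/resources/senta_model (the directory create_module.py loads) and that infer.sh is run from demo/senta so its relative paths resolve.

    sh demo/senta/resources/download.sh    # fetch the pretrained Senta model and train.vocab
    sh demo/senta/create_module.sh         # export the model as a PaddleHub module into demo/senta/hub_module_senta/
    cd demo/senta && sh infer.sh           # run the sentiment_classify signature over resources/test/test.csv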