From 0a4b1020021373ee2957754a8e282850ecf7e15f Mon Sep 17 00:00:00 2001
From: wuzewu
Date: Thu, 21 Mar 2019 20:28:24 +0800
Subject: [PATCH] add senta demo

---
 demo/senta/create_module.py          | 56 +++++++++++++++++
 demo/senta/create_module.sh          |  8 +++
 demo/senta/infer.sh                  |  1 +
 demo/senta/nets.py                   | 50 +++++++++++++++
 demo/senta/processor.py              | 91 ++++++++++++++++++++++++++++
 demo/senta/resources/download.sh     | 11 ++++
 demo/senta/resources/module_info.yml |  5 ++
 demo/senta/resources/test/test.csv   |  3 +
 demo/senta/resources/test/test.yml   |  4 ++
 9 files changed, 229 insertions(+)
 create mode 100644 demo/senta/create_module.py
 create mode 100644 demo/senta/create_module.sh
 create mode 100644 demo/senta/infer.sh
 create mode 100644 demo/senta/nets.py
 create mode 100644 demo/senta/processor.py
 create mode 100644 demo/senta/resources/download.sh
 create mode 100644 demo/senta/resources/module_info.yml
 create mode 100644 demo/senta/resources/test/test.csv
 create mode 100644 demo/senta/resources/test/test.yml

diff --git a/demo/senta/create_module.py b/demo/senta/create_module.py
new file mode 100644
index 00000000..c3c2f97b
--- /dev/null
+++ b/demo/senta/create_module.py
@@ -0,0 +1,56 @@
+import io
+import paddle.fluid as fluid
+import processor
+import numpy as np
+import nets
+import paddle_hub as hub
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        wid = 0
+        for line in f:
+            line = line.rstrip()
+            parts = line.split('\t')
+            vocab[parts[0]] = int(parts[1])
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def create_module():
+    network = nets.bilstm_net
+    # word seq data
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    word_dict_path = "./resources/train.vocab"
+    word_dict = load_vocab(word_dict_path)
+    cost, acc, pred = network(data, label, len(word_dict) + 1)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    model_path = "./resources/senta_model"
+    fluid.io.load_inference_model(model_path, exe)
+
+    # assets
+    assets = [word_dict_path]
+
+    # create a module
+    sign = hub.create_signature(
+        name="sentiment_classify", inputs=[data], outputs=[pred])
+    hub.create_module(
+        sign_arr=[sign],
+        module_dir="hub_module_senta",
+        exe=exe,
+        module_info="resources/module_info.yml",
+        processor=processor.Processor,
+        assets=assets)
+
+
+if __name__ == "__main__":
+    create_module()
diff --git a/demo/senta/create_module.sh b/demo/senta/create_module.sh
new file mode 100644
index 00000000..70bd274c
--- /dev/null
+++ b/demo/senta/create_module.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -o nounset
+set -o errexit
+
+script_path=$(cd `dirname $0`; pwd)
+cd $script_path
+
+python create_module.py
diff --git a/demo/senta/infer.sh b/demo/senta/infer.sh
new file mode 100644
index 00000000..f71ba312
--- /dev/null
+++ b/demo/senta/infer.sh
@@ -0,0 +1 @@
+python ../../paddle_hub/commands/hub.py run hub_module_senta/ --signature sentiment_classify --config resources/test/test.yml --dataset resources/test/test.csv
diff --git a/demo/senta/nets.py b/demo/senta/nets.py
new file mode 100644
index 00000000..1125e5af
--- /dev/null
+++ b/demo/senta/nets.py
@@ -0,0 +1,50 @@
+import paddle.fluid as fluid
+
+
+def bilstm_net(data,
+               label,
+               dict_dim,
+               emb_dim=128,
+               hid_dim=128,
+               hid_dim2=96,
+               class_dim=2,
+               emb_lr=30.0):
+    """
+    Bi-LSTM net
+    """
+    # embedding layer
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+
+    # bi-lstm layer
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+
+    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+
+    rlstm_h, c = fluid.layers.dynamic_lstm(
+        input=rfc0, size=hid_dim * 4, is_reverse=True)
+
+    # extract the last step of each sequence
+    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
+    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
+
+    lstm_last_tanh = fluid.layers.tanh(lstm_last)
+    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)
+
+    # concat layer
+    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)
+
+    # fully connected layer
+    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
+    # softmax layer
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc, prediction
diff --git a/demo/senta/processor.py b/demo/senta/processor.py
new file mode 100644
index 00000000..eb9eca70
--- /dev/null
+++ b/demo/senta/processor.py
@@ -0,0 +1,91 @@
+import paddle
+import paddle.fluid as fluid
+import paddle_hub as hub
+import numpy as np
+import os
+import io
+from paddle_hub import BaseProcessor
+from paddle_hub.hub_server import default_hub_server
+from paddle_hub.module.manager import default_module_manager
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        wid = 0
+        for line in f:
+            line = line.rstrip()
+            parts = line.split('\t')
+            vocab[parts[0]] = int(parts[1])
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def get_predict_label(pos_prob):
+    neg_prob = 1 - pos_prob
+    # the neutral threshold should lie in (0.5, 1)
+    neu_threshold = 0.55
+    if neg_prob > neu_threshold:
+        label, key = 0, "负面"  # negative
+    elif pos_prob > neu_threshold:
+        label, key = 2, "正面"  # positive
+    else:
+        label, key = 1, "中性"  # neutral
+    return label, key
+
+
+class Processor(BaseProcessor):
+    def __init__(self, module):
+        self.module = module
+        assets_path = self.module.helper.assets_path()
+        word_dict_path = os.path.join(assets_path, "train.vocab")
+        self.word_dict = load_vocab(word_dict_path)
+        path = default_module_manager.search_module("lac")
+        if path:
+            self.lac = hub.Module(module_dir=path)
+        else:
+            result, _, path = default_module_manager.install_module("lac")
+            assert path, "can't find the necessary module: lac"
+            self.lac = hub.Module(module_dir=path)
+
+    def preprocess(self, sign_name, data_dict):
+        result = {'text': []}
+        processed = self.lac.segment(data=data_dict)
+        unk_id = len(self.word_dict)
+        for index, data in enumerate(processed):
+            result_i = {'processed': []}
+            result_i['origin'] = data_dict['text'][index]
+            for result_dict in data:
+                if result_dict['word'] in self.word_dict:
+                    _index = self.word_dict[result_dict['word']]
+                else:
+                    _index = unk_id
+                result_i['processed'].append(_index)
+            result['text'].append(result_i)
+        return result
+
+    def postprocess(self, sign_name, data_out, data_info, **kwargs):
+        if sign_name == "sentiment_classify":
+            result = []
+            pred = fluid.executor.as_numpy(data_out)
+            for index in range(len(data_info['text'])):
+                result_i = {}
+                result_i['text'] = data_info['text'][index]['origin']
+                label, key = get_predict_label(pred[0][index, 1])
+                result_i['sentiment_label'] = label
+                result_i['sentiment_key'] = key
+                result.append(result_i)
+            return result
+
+    def data_format(self, sign_name):
+        if sign_name == "sentiment_classify":
+            return {
+                "text": {
+                    "type": hub.DataType.TEXT,
+                    "feed_key": self.module.signatures[sign_name].inputs[0].name
+                }
+            }
+        return None
diff --git a/demo/senta/resources/download.sh b/demo/senta/resources/download.sh
new file mode 100644
index 00000000..79c39404
--- /dev/null
+++ b/demo/senta/resources/download.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -o nounset
+set -o errexit
+
+script_path=$(cd `dirname $0`; pwd)
+cd $script_path
+
+wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/senta.tar.gz
+wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/train.vocab
+tar xvzf senta.tar.gz
+rm senta.tar.gz
diff --git a/demo/senta/resources/module_info.yml b/demo/senta/resources/module_info.yml
new file mode 100644
index 00000000..df6d4fe8
--- /dev/null
+++ b/demo/senta/resources/module_info.yml
@@ -0,0 +1,5 @@
+name: senta
+type: nlp/sentiment_analysis
+author: paddlepaddle
+author-email: paddle-dev@baidu.com
+version: 1.0.0
diff --git a/demo/senta/resources/test/test.csv b/demo/senta/resources/test/test.csv
new file mode 100644
index 00000000..f5e4263a
--- /dev/null
+++ b/demo/senta/resources/test/test.csv
@@ -0,0 +1,3 @@
+TEXT_INPUT
+这部电影真的很赞
+售后太差!
diff --git a/demo/senta/resources/test/test.yml b/demo/senta/resources/test/test.yml
new file mode 100644
index 00000000..51314996
--- /dev/null
+++ b/demo/senta/resources/test/test.yml
@@ -0,0 +1,4 @@
+input_data:
+  text:
+    type : TEXT
+    key : TEXT_INPUT
--
GitLab
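
Usage note: the patch does not spell out a run order, but based on the scripts it adds, a plausible end-to-end sequence is the one below. It assumes senta.tar.gz unpacks into demo/senta/resources/senta_model (the directory create_module.py loads) and that infer.sh is run from demo/senta so its relative paths resolve.

    sh demo/senta/resources/download.sh    # fetch the pretrained Senta model and train.vocab
    sh demo/senta/create_module.sh         # export the model as a PaddleHub module into demo/senta/hub_module_senta/
    cd demo/senta && sh infer.sh           # run the sentiment_classify signature over resources/test/test.csv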