module.py 10.2 KB
Newer Older
W
wuzewu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import math
import os
import six

import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.common.paddle_helper import add_vars_prefix
from paddlehub.module.module import moduleinfo, serving

from senta_bilstm.net import bilstm_net
from senta_bilstm.processor import load_vocab, preprocess, postprocess


@moduleinfo(
    name="senta_bilstm",
    version="1.2.0",
    summary="Baidu's open-source Sentiment Classification System.",
    author="baidu-nlp",
    author_email="",
    type="nlp/sentiment_analysis")
class SentaBiLSTM(hub.NLPPredictionModule):
    """
    PaddleHub module exposing the senta_bilstm sentiment classifier.

    The class offers two entry points: ``context`` builds a fluid program
    with the pretrained parameters loaded (for fine-tuning / feature
    extraction), and ``sentiment_classify`` runs batched prediction on raw
    texts.

    Attributes such as ``directory``, ``name``, ``cpu_predictor``,
    ``gpu_predictor`` and the helpers ``to_unicode`` / ``texts2tensor`` /
    ``_set_config`` are not defined in this class; they are presumably
    provided by ``hub.NLPPredictionModule`` -- confirm against paddlehub.
    """

    def _initialize(self):
        """
        Initialize the module with the necessary elements: the pretrained
        model path, the vocabulary and the default ``predict`` entry point.
        """
        self.pretrained_model_path = os.path.join(self.directory, "assets",
                                                  "infer_model")
        self.vocab_path = os.path.join(self.directory, "assets/vocab.txt")
        self.word_dict = load_vocab(self.vocab_path)
        # The word segmentation module is created lazily, see word_seg_module.
        self._word_seg_module = None

        # Expose sentiment_classify under the generic name ``predict``.
        self.predict = self.sentiment_classify

        self._set_config()

    @property
    def word_seg_module(self):
        """
        The "lac" hub module, created on first access and cached afterwards.
        """
        if self._word_seg_module is None:
            self._word_seg_module = hub.Module(name="lac")
        return self._word_seg_module

    def context(self, trainable=False, max_seq_len=128, num_slots=1):
        """
        Get the inputs, outputs and program of the pretrained senta_bilstm.

        Args:
             trainable(bool): whether to fine-tune the pretrained parameters of senta_bilstm or not.
             max_seq_len(int): the sequence length used as the second dimension of every text input.
             num_slots(int): the number of text inputs fed to the model, selected from the following options:

                 - 1(default): there is only one text input, e.g. the module is used for a text classification task.
                 - 2: there are two text inputs, e.g. the module is used for a text matching task (point-wise).
                 - 3: there are three text inputs, e.g. the module is used for a text matching task (pair-wise).

        Returns:
             inputs(dict): the input variables of senta_bilstm, keyed by
                 'seq_len', 'text' and, for extra slots, 'text_2' / 'text_3'.
             outputs(dict): the output variables: 'class_probs' and
                 'sentence_feature' computed from the first text input, plus
                 the word embedding of every slot ('emb', 'emb_2', 'emb_3').
             main_program(Program): the main program of Senta with the pretrained parameters loaded.

        Raises:
             AssertionError: if num_slots is not 1, 2 or 3.
        """
        assert 1 <= num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            # NOTE(review): the first slot is declared through
            # fluid.layers.data / fluid.layers.embedding with a trailing
            # dimension of 1, while the extra slots below use fluid.data /
            # fluid.embedding without it. Kept exactly as in the original.
            text_1 = fluid.layers.data(
                name="text",
                shape=[-1, max_seq_len, 1],
                dtype="int64",
                lod_level=0)
            seq_len = fluid.layers.data(
                name="seq_len", shape=[1], dtype='int64', lod_level=0)
            seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])

            # Add embedding layer. Every slot shares this parameter attribute
            # and therefore the same embedding table name.
            w_param_attrs = fluid.ParamAttr(
                name="embedding_0.w_0",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02),
                trainable=trainable)
            dict_dim = 1256607
            emb_1 = fluid.layers.embedding(
                input=text_1,
                size=[dict_dim, 128],
                is_sparse=True,
                padding_idx=dict_dim - 1,
                dtype='float32',
                param_attr=w_param_attrs)
            data_list = [text_1]
            emb_name_list = [emb_1.name]

            # Add lstm layer. Only the first slot goes through the network.
            pred, fc = bilstm_net(emb_1, seq_len_used)
            pred_name = pred.name
            fc_name = fc.name

            # Extra slots (text_2, text_3) only get an embedding lookup.
            for slot in range(2, num_slots + 1):
                extra_text = fluid.data(
                    name='text_%d' % slot,
                    shape=[-1, max_seq_len],
                    dtype='int64',
                    lod_level=0)
                extra_emb = fluid.embedding(
                    input=extra_text,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                data_list.append(extra_text)
                emb_name_list.append(extra_emb.name)

            # Every variable except the feed variables gets the module prefix.
            # A real list is built (not a lazy ``filter`` object) so the value
            # handed to add_vars_prefix is the same on Python 2 and Python 3.
            feed_names = ('text', 'text_2', 'text_3', "seq_len")
            variable_names = [
                var_name
                for var_name in list(main_program.global_block().vars.keys())
                if var_name not in feed_names
            ]
            prefix_name = "@HUB_{}@".format(self.name)
            add_vars_prefix(
                program=main_program, prefix=prefix_name, vars=variable_names)

            for param in main_program.global_block().iter_parameters():
                param.trainable = trainable

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)

            # Load the senta_lstm pretrained model: only variables that have
            # a file of the same name in the pretrained model directory.
            def if_exist(var):
                return os.path.exists(
                    os.path.join(self.pretrained_model_path, var.name))

            fluid.io.load_vars(
                exe, self.pretrained_model_path, predicate=if_exist)

            # Look the outputs up under their prefixed names.
            global_vars = main_program.global_block().vars
            inputs = {'seq_len': seq_len}
            outputs = {
                "class_probs": global_vars[prefix_name + pred_name],
                "sentence_feature": global_vars[prefix_name + fc_name],
            }
            for index, (data, emb_name) in enumerate(
                    zip(data_list, emb_name_list)):
                # The first slot uses the bare keys 'text' / 'emb'; the
                # others are suffixed with their 1-based slot number.
                suffix = "" if index == 0 else "_%s" % (index + 1)
                inputs['text' + suffix] = data
                outputs['emb' + suffix] = global_vars[prefix_name + emb_name]
            return inputs, outputs, main_program

    @serving
    def sentiment_classify(self, texts=[], data={}, use_gpu=False,
                           batch_size=1):
        """
        Get the sentiment prediction results with the texts as input.

        Exactly one of ``texts`` and ``data`` must be supplied. The default
        values are never mutated by this method.

        Args:
             texts(list): the input texts to be predicted; use it instead of data
             data(dict): the key must be 'text' and the value a non-empty list of texts to be predicted; use it instead of texts
             use_gpu(bool): whether to use gpu to predict or not
             batch_size(int): the number of texts processed in one batch

        Returns:
             results(list): the concatenated outputs of ``postprocess`` over all batches

        Raises:
             RuntimeError: if use_gpu is set but the environment variable
                 CUDA_VISIBLE_DEVICES is missing or does not start with a digit.
             ValueError: if the texts/data arguments do not match the expected form.
        """
        if use_gpu:
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
            # KeyError: variable unset; IndexError: empty value;
            # ValueError: first character is not a digit.
            except (KeyError, IndexError, ValueError):
                raise RuntimeError(
                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
                )

        if texts != [] and isinstance(texts, list) and data == {}:
            predicted_data = texts
        elif texts == [] and isinstance(data, dict) and isinstance(
                data.get('text', None), list) and data['text']:
            predicted_data = data["text"]
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        predicted_data = self.to_unicode(predicted_data)
        num_batches = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for batch_index in range(num_batches):
            # Slicing past the end is safe, so the last (possibly shorter)
            # batch needs no special case.
            start_idx = batch_index * batch_size
            batch_data = predicted_data[start_idx:(start_idx + batch_size)]

            processed_results = preprocess(self.word_seg_module, batch_data,
                                           self.word_dict, use_gpu, batch_size)
            tensor_words = self.texts2tensor(processed_results)

            predictor = self.gpu_predictor if use_gpu else self.cpu_predictor
            batch_out = predictor.run([tensor_words])
            batch_result = postprocess(batch_out[0], processed_results)
            results += batch_result
        return results

    def get_labels(self):
        """
        Get the labels which were used when pretraining.

        Returns:
             self.labels(dict): the mapping {"positive": 1, "negative": 0}
        """
        self.labels = {"positive": 1, "negative": 0}
        return self.labels


if __name__ == "__main__":
    # Smoke run: build the three-slot context, then predict the same texts
    # through both supported calling conventions.
    senta = SentaBiLSTM()
    inputs, outputs, main_program = senta.context(num_slots=3)
    print(inputs)
    print(outputs)
    # Data to be predicted
    test_text = ["这家餐厅很好吃", "这部电影真的很差劲"]

    # Execute predict and print the result, first with the ``data`` dict
    # form, then with the plain ``texts`` list form.
    input_dict = {"text": test_text}
    for call_kwargs in ({"data": input_dict, "batch_size": 3},
                        {"texts": test_text}):
        results = senta.sentiment_classify(**call_kwargs)
        for result in results:
            if six.PY2:
                # Python 2 needs json.dumps to print the unicode content.
                print(
                    json.dumps(result, encoding="utf8", ensure_ascii=False))
            else:
                print(result)