# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os

import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.common.paddle_helper import add_vars_prefix
from paddlehub.module.module import moduleinfo, serving

from emotion_detection_textcnn.net import textcnn_net
from emotion_detection_textcnn.processor import load_vocab, preprocess, postprocess


@moduleinfo(
    name="emotion_detection_textcnn",
    version="1.2.0",
    summary="Baidu's open-source Emotion Detection Model(TextCNN).",
    author="baidu-nlp",
    author_email="",
    type="nlp/sentiment_analysis")
class EmotionDetectionTextCNN(hub.NLPPredictionModule):
    """PaddleHub module wrapping Baidu's TextCNN emotion-detection model.

    Provides both a fine-tuning entry point (``context``) that rebuilds the
    network with pretrained parameters, and a serving/prediction entry point
    (``emotion_classify``).
    """

    def _initialize(self):
        """
        Initialize with the necessary elements.

        Resolves asset paths (inference model and vocabulary) relative to the
        module directory, loads the vocabulary, and sets up the predictor
        configuration. The LAC word-segmentation module is loaded lazily via
        the ``word_seg_module`` property, not here.
        """
        self.pretrained_model_path = os.path.join(self.directory, "assets", "infer_model")
        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
        self.vocab = load_vocab(self.vocab_path)
        # Lazily populated by the word_seg_module property on first access.
        self._word_seg_module = None
        # Expose emotion_classify under the generic `predict` name expected by
        # the hub.NLPPredictionModule interface.
        self.predict = self.emotion_classify
        self._set_config()

    @property
    def word_seg_module(self):
        """
        LAC (lexical analysis) module used for word segmentation,
        loaded on first access and cached afterwards.
        """
        if not self._word_seg_module:
            self._word_seg_module = hub.Module(name="lac")
        return self._word_seg_module

    def context(self, trainable=False, max_seq_len=128, num_slots=1):
        """
        Get the input, output and program of the pretrained emotion_detection_textcnn.

        Args:
            trainable(bool): Whether to fine-tune the pretrained parameters of
                emotion_detection_textcnn or not.
            max_seq_len (int): It will limit the total sequence returned so that
                it has a maximum length.
            num_slots(int): Number of data slots fed to the model, selected from
                the following options:
                - 1(default): only one text input, e.g. the module is used for a
                  text classification task.
                - 2: two text inputs, e.g. the module is used for a text
                  matching task (point-wise).
                - 3: three text inputs, e.g. the module is used for a text
                  matching task (pair-wise).

        Returns:
            inputs(dict): the input variables of emotion_detection_textcnn (words)
            outputs(dict): the output variables of the input words (word
                embeddings and label probabilities); the sentence embedding and
                sequence length of the first input text.
            main_program(Program): the main_program of emotion_detection_textcnn
                with pretrained parameters
        """
        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            text_1 = fluid.layers.data(name="text", shape=[-1, max_seq_len, 1], dtype="int64", lod_level=0)
            seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64', lod_level=0)
            seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])

            # Add embedding layer. The embedding table is shared by every
            # text slot through the same ParamAttr name.
            w_param_attrs = fluid.ParamAttr(
                name="embedding_0.w_0",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02),
                trainable=trainable)
            # Vocabulary size of the pretrained model; the last id is the
            # padding index.
            dict_dim = 240466
            emb_1 = fluid.layers.embedding(
                input=text_1,
                size=[dict_dim, 128],
                is_sparse=True,
                padding_idx=dict_dim - 1,
                dtype='float32',
                param_attr=w_param_attrs)
            emb_1_name = emb_1.name
            data_list = [text_1]
            emb_name_list = [emb_1_name]

            # Add the TextCNN network; it yields class probabilities and the
            # sentence-level feature of the first text slot.
            pred, fc = textcnn_net(emb_1, seq_len_used)
            pred_name = pred.name
            fc_name = fc.name

            if num_slots > 1:
                text_2 = fluid.data(name='text_2', shape=[-1, max_seq_len], dtype='int64', lod_level=0)
                emb_2 = fluid.embedding(
                    input=text_2,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_2_name = emb_2.name
                data_list.append(text_2)
                emb_name_list.append(emb_2_name)

            if num_slots > 2:
                text_3 = fluid.data(name='text_3', shape=[-1, max_seq_len], dtype='int64', lod_level=0)
                emb_3 = fluid.embedding(
                    input=text_3,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_3_name = emb_3.name
                data_list.append(text_3)
                emb_name_list.append(emb_3_name)

            # Prefix every internal variable (but not the feed inputs) so this
            # program can be merged with others without name clashes.
            variable_names = filter(lambda v: v not in ['text', 'text_2', 'text_3', "seq_len"],
                                    list(main_program.global_block().vars.keys()))
            prefix_name = "@HUB_{}@".format(self.name)
            add_vars_prefix(program=main_program, prefix=prefix_name, vars=variable_names)

            for param in main_program.global_block().iter_parameters():
                param.trainable = trainable

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)

            # Load the emotion_detection_textcnn pretrained model: only restore
            # variables whose checkpoint file actually exists on disk.
            def if_exist(var):
                return os.path.exists(os.path.join(self.pretrained_model_path, var.name))

            fluid.io.load_vars(exe, self.pretrained_model_path, predicate=if_exist)

            inputs = {'seq_len': seq_len}
            outputs = {
                "class_probs": main_program.global_block().vars[prefix_name + pred_name],
                "sentence_feature": main_program.global_block().vars[prefix_name + fc_name]
            }
            # Slot 0 keeps the legacy names 'text'/'emb'; extra slots are
            # numbered starting from 2 ('text_2'/'emb_2', ...).
            for index, data in enumerate(data_list):
                if index == 0:
                    inputs['text'] = data
                    outputs['emb'] = main_program.global_block().vars[prefix_name + emb_name_list[0]]
                else:
                    inputs['text_%s' % (index + 1)] = data
                    outputs['emb_%s' % (index + 1)] = main_program.global_block().vars[prefix_name + emb_name_list[index]]
            return inputs, outputs, main_program

    @serving
    def emotion_classify(self, texts=[], data={}, use_gpu=False, batch_size=1, use_device=None):
        """
        Get the emotion prediction results with the texts as input.

        Args:
            texts(list): the input texts to be predicted, if texts not data
            data(dict): key must be 'text', value is the texts to be predicted,
                if data not texts
            use_gpu(bool): whether to use gpu to predict or not
            batch_size(int): the program deals once with one batch
            use_device (str): use cpu, gpu, xpu or npu; overwrites the use_gpu
                flag when set

        Returns:
            results(list): the emotion prediction results
        """
        # Pick the real predictor to use: an explicit use_device wins over
        # the boolean use_gpu flag.
        if use_device is not None:
            if use_device == "cpu":
                predictor = self.cpu_predictor
            elif use_device == "xpu":
                predictor = self.xpu_predictor
            elif use_device == "npu":
                predictor = self.npu_predictor
            elif use_device == "gpu":
                predictor = self.gpu_predictor
            else:
                raise Exception("Unsupported device: " + use_device)
        else:
            # use_device is not set, therefore follow use_gpu
            if use_gpu:
                predictor = self.gpu_predictor
            else:
                predictor = self.cpu_predictor

        # Exactly one of `texts` and `data` must carry the input.
        if texts != [] and isinstance(texts, list) and data == {}:
            predicted_data = texts
        elif texts == [] and isinstance(data, dict) and isinstance(data.get('text', None), list) and data['text']:
            predicted_data = data["text"]
        else:
            raise ValueError("The input data is inconsistent with expectations.")

        predicted_data = self.to_unicode(predicted_data)
        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            # The last batch takes whatever remains (may be smaller than
            # batch_size).
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                batch_data = predicted_data[start_idx:]
            start_idx = start_idx + batch_size
            # Segment words with LAC, map them to vocab ids, run the
            # predictor, then convert raw probabilities into labeled results.
            processed_results = preprocess(self.word_seg_module, batch_data, self.vocab, use_gpu, batch_size,
                                           use_device)
            predictor_output = self._internal_predict(predictor, processed_results)
            batch_result = postprocess(predictor_output, processed_results)
            results += batch_result
        return results

    def get_labels(self):
        """
        Get the labels which were used when pretraining.

        Returns:
            self.labels(dict): label name -> class index
        """
        self.labels = {"positive": 2, "negative": 0, "neutral": 1}
        return self.labels


if __name__ == "__main__":
    # Demo: build the 3-slot fine-tuning context, then classify a few
    # Chinese sentences and print per-text results.
    emotion_detection_textcnn = EmotionDetectionTextCNN()
    inputs, outputs, main_program = emotion_detection_textcnn.context(num_slots=3)
    print(inputs)
    print(outputs)
    # Data to be predicted
    test_text = ["今天天气真好", "湿纸巾是干垃圾", "别来吵我"]
    input_dict = {"text": test_text}
    results = emotion_detection_textcnn.emotion_classify(data=input_dict, batch_size=2)
    for result in results:
        print(result['text'])
        print(result['emotion_label'])
        print(result['emotion_key'])
        print(result['positive_probs'])
        print(result['negative_probs'])
        print(result['neutral_probs'])