module.py

# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import ast
import json
import math
import os
import six

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
import paddlehub as hub
from paddlehub.common.paddle_helper import add_vars_prefix, get_variable_info
from paddlehub.common.utils import sys_stdin_encoding
from paddlehub.io.parser import txt_parser
from paddlehub.module.module import serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.module import runnable

from simnet_bow.processor import load_vocab, preprocess, postprocess


class DataFormatError(Exception):
    """
    Raised when the command-line input cannot be turned into prediction data.

    Any positional arguments are kept on ``args``, as for a plain Exception.
    """

    def __init__(self, *args):
        # The base constructor stores the positional arguments on self.args.
        super(DataFormatError, self).__init__(*args)


@moduleinfo(
    name="simnet_bow",
    version="1.2.0",
    summary=
    "Baidu's open-source similarity network model based on bow_pairwise.",
    author="baidu-nlp",
    author_email="",
    type="nlp/sentiment_analysis")
class SimnetBow(hub.Module):
    def _initialize(self):
        """
        Resolve the asset paths, load the vocabulary and build the predictors.

        All assets are looked up below ``<self.directory>/assets``. The word
        segmentation module is not created here; ``word_seg_module`` creates
        it on first access.
        """
        assets_dir = os.path.join(self.directory, "assets")

        self.pretrained_model_path = os.path.join(assets_dir, "infer_model")
        self.vocab_path = os.path.join(assets_dir, "vocab.txt")
        self.vocab = load_vocab(self.vocab_path)
        self.param_file = os.path.join(assets_dir, "params.txt")
        # Filled in lazily by the word_seg_module property.
        self._word_seg_module = None

        self._set_config()

    @property
    def word_seg_module(self):
        """
        lac module
        """
        if not self._word_seg_module:
            self._word_seg_module = hub.Module(name="lac")
        return self._word_seg_module

    def _set_config(self):
        """
        Create the inference predictors for the pretrained model.

        A CPU predictor is always created and stored on ``self.cpu_predictor``.
        A GPU predictor (device 0) is additionally stored on
        ``self.gpu_predictor`` when the environment variable
        ``CUDA_VISIBLE_DEVICES`` is set and its first character parses as an
        integer; otherwise ``self.gpu_predictor`` is not set at all.
        """
        cpu_config = AnalysisConfig(self.pretrained_model_path)
        cpu_config.disable_glog_info()
        cpu_config.disable_gpu()
        cpu_config.switch_ir_optim(False)
        self.cpu_predictor = create_paddle_predictor(cpu_config)

        try:
            _places = os.environ["CUDA_VISIBLE_DEVICES"]
            int(_places[0])
            use_gpu = True
        except (KeyError, IndexError, ValueError):
            # KeyError: variable unset; IndexError: empty value;
            # ValueError: first character is not a digit (e.g. "-1").
            # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
            use_gpu = False
        if use_gpu:
            gpu_config = AnalysisConfig(self.pretrained_model_path)
            gpu_config.disable_glog_info()
            gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
            self.gpu_predictor = create_paddle_predictor(gpu_config)

    def context(self, trainable=False, max_seq_len=128, num_slots=1):
        """
        Build a program that holds the simnet_bow embedding layer, together with
        its input and output variables.

        Args:
             trainable(bool): the value assigned to the ``trainable`` flag of every parameter in the returned program.
             max_seq_len(int): the sequence length used as the second dimension of every text input.
             num_slots(int): the number of text inputs to create, selected from the following options:

                 - 1(default): one text input is fed to the model, e.g. the module is used for a sentence classification task.
                 - 2: two text inputs are fed to the model, e.g. the module is used for a text matching task (point-wise).
                 - 3: three text inputs are fed to the model, e.g. the module is used for a text matching task (pair-wise).

        Returns:
             inputs(dict): the input variables, keyed 'seq_len' and 'text', plus 'text_2' / 'text_3' depending on num_slots
             outputs(dict): the embedding output variables, keyed 'emb', plus 'emb_2' / 'emb_3' depending on num_slots
             main_program(Program): the program that contains the variables above

        Raises:
             AssertionError: if num_slots is not 1, 2 or 3
        """
        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            # The first slot is declared through fluid.layers.* with a trailing
            # dimension of 1; the extra slots below use fluid.data /
            # fluid.embedding and are declared without it.
            text_1 = fluid.layers.data(
                name="text",
                shape=[-1, max_seq_len, 1],
                dtype="int64",
                lod_level=0)
            seq_len = fluid.layers.data(
                name="seq_len", shape=[1], dtype='int64', lod_level=0)
            # NOTE(review): seq_len_used is not referenced again in this method.
            seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])

            # Add embedding layer. Every embedding call below receives this same
            # ParamAttr (parameter name "emb").
            w_param_attrs = fluid.ParamAttr(
                name="emb",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02),
                trainable=trainable)
            dict_dim = 500002
            # The last index of the table (dict_dim - 1) is used as padding_idx.
            emb_1 = fluid.layers.embedding(
                input=text_1,
                size=[dict_dim, 128],
                is_sparse=True,
                padding_idx=dict_dim - 1,
                dtype='float32',
                param_attr=w_param_attrs)
            emb_1_name = emb_1.name
            data_list = [text_1]
            emb_name_list = [emb_1_name]

            # Optional second slot.
            if num_slots > 1:
                text_2 = fluid.data(
                    name='text_2',
                    shape=[-1, max_seq_len],
                    dtype='int64',
                    lod_level=0)
                emb_2 = fluid.embedding(
                    input=text_2,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_2_name = emb_2.name
                data_list.append(text_2)
                emb_name_list.append(emb_2_name)

            # Optional third slot.
            if num_slots > 2:
                text_3 = fluid.data(
                    name='text_3',
                    shape=[-1, max_seq_len],
                    dtype='int64',
                    lod_level=0)
                emb_3 = fluid.embedding(
                    input=text_3,
                    size=[dict_dim, 128],
                    is_sparse=True,
                    padding_idx=dict_dim - 1,
                    dtype='float32',
                    param_attr=w_param_attrs)
                emb_3_name = emb_3.name
                data_list.append(text_3)
                emb_name_list.append(emb_3_name)

            # Prefix every variable except the feed variables with the module
            # name. The embedding outputs are therefore looked up below under
            # prefix_name + <original name>.
            variable_names = filter(
                lambda v: v not in ['text', 'text_2', 'text_3', "seq_len"],
                list(main_program.global_block().vars.keys()))
            prefix_name = "@HUB_{}@".format(self.name)
            add_vars_prefix(
                program=main_program, prefix=prefix_name, vars=variable_names)

            for param in main_program.global_block().iter_parameters():
                param.trainable = trainable

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)

            # Load the pretrained variables: only variables for which a file of
            # the same name exists under pretrained_model_path are loaded.
            def if_exist(var):
                return os.path.exists(
                    os.path.join(self.pretrained_model_path, var.name))

            fluid.io.load_vars(
                exe, self.pretrained_model_path, predicate=if_exist)

            # Slot 0 is exposed as 'text' / 'emb'; later slots are exposed as
            # 'text_N' / 'emb_N' with N starting at 2.
            inputs = {'seq_len': seq_len}
            outputs = {}
            for index, data in enumerate(data_list):
                if index == 0:
                    inputs['text'] = data
                    outputs['emb'] = main_program.global_block().vars[
                        prefix_name + emb_name_list[0]]
                else:
                    inputs['text_%s' % (index + 1)] = data
                    outputs['emb_%s' % (index + 1)] = main_program.global_block(
                    ).vars[prefix_name + emb_name_list[index]]
            return inputs, outputs, main_program

    def texts2tensor(self, texts):
        """
        Pack the processed texts into a single PaddleTensor
        Args:
             texts(list): each element provides its ids under the key 'processed'
        Returns:
             tensor(PaddleTensor): tensor named "words" holding all ids as int64,
                                   with one lod level marking where each text starts and ends
        """
        offsets = [0]
        flat_ids = []
        for item in texts:
            ids = item['processed']
            flat_ids.extend(ids)
            # Each boundary is the previous boundary plus this text's length.
            offsets.append(offsets[-1] + len(ids))

        tensor = PaddleTensor(np.array(flat_ids).astype('int64'))
        tensor.name = "words"
        tensor.lod = [offsets]
        tensor.shape = [offsets[-1], 1]
        return tensor

    def to_unicode(self, texts):
        """
        Convert the byte strings in texts(list) to unicode in python2.7
        Args:
             texts(list): elements may be byte strings (str in python2.7) or any other object
        Returns:
             texts(list): in python2.7 a new list in which every byte string has been
                          decoded with the stdin encoding and every other element is kept
                          as it is; in python3 the input list itself, unchanged
        """
        if not six.PY2:
            return texts

        encoding = sys_stdin_encoding()
        # Decode exactly once, and only byte strings. Calling .decode() on a
        # unicode object makes python2 encode it with the default (ASCII)
        # codec first, which fails for non-ASCII text.
        return [
            text.decode(encoding) if isinstance(text, six.binary_type) else text
            for text in texts
        ]

    def check_data(self, texts=[], data={}):
        """
        check input data
        Args:
             texts(list): the input texts to be predicted which the first element is text_1(list)
                          and the second element is text_2(list), such as [['这道题很难'], ['这道题不简单']]
                          if texts not data.
             data(dict): key must be 'text_1' and 'text_2', value is the texts(list) to be predicted
        Returns:
             results(dict): predicted data
        """
        predicted_data = {'text_1': [], 'text_2': []}
        if texts != [] and isinstance(texts, list) and len(texts) == 2 and (len(
                texts[0]) == len(
                    texts[1])) and texts[0] and texts[1] and data == {}:

            predicted_data['text_1'] = texts[0]
            predicted_data['text_2'] = texts[1]

        elif texts == [] and isinstance(data, dict) and isinstance(
                data.get('text_1', None), list) and isinstance(
                    data.get('text_2', None),
                    list) and (len(data['text_1']) == len(
                        data['text_2'])) and data['text_1'] and data['text_2']:

            predicted_data = data

        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        return predicted_data

    @serving
    def similarity(self, texts=[], data={}, use_gpu=False, batch_size=1):
        """
        Get the sentiment prediction results results with the texts as input
        Args:
             texts(list): the input texts to be predicted which the first element is text_1(list)
                          and the second element is text_2(list), such as [['这道题很难'], ['这道题不简单']]
                          if texts not data.
             data(dict): key must be 'text_1' and 'text_2', value is the texts(list) to be predicted
             use_gpu(bool): whether use gpu to predict or not
             batch_size(int): the program deals once with one batch
        Returns:
             results(list): the word segmentation results
        """
        if use_gpu:
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
            except:
                raise RuntimeError(
                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
                )

        data = self.check_data(texts, data)

        start_idx = 0
        iteration = int(math.ceil(len(data['text_1']) / batch_size))
        results = []
        for i in range(iteration):
            batch_data = {'text_1': [], 'text_2': []}
            if i < (iteration - 1):
                batch_data['text_1'] = data['text_1'][start_idx:(
                    start_idx + batch_size)]
                batch_data['text_2'] = data['text_2'][start_idx:(
                    start_idx + batch_size)]
            else:
                batch_data['text_1'] = data['text_1'][start_idx:(
                    start_idx + batch_size)]
                batch_data['text_2'] = data['text_2'][start_idx:(
                    start_idx + batch_size)]
            start_idx = start_idx + batch_size
            processed_results = preprocess(self.word_seg_module, self.vocab,
                                           batch_data, use_gpu, batch_size)

            tensor_words_1 = self.texts2tensor(processed_results["text_1"])
            tensor_words_2 = self.texts2tensor(processed_results["text_2"])

            if use_gpu:
                batch_out = self.gpu_predictor.run(
                    [tensor_words_1, tensor_words_2])
            else:
                batch_out = self.cpu_predictor.run(
                    [tensor_words_1, tensor_words_2])
            batch_result = postprocess(batch_out[1], processed_results)
            results += batch_result
        return results

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command
        Args:
             argvs(list): the command line arguments that follow 'hub run simnet_bow'
        Returns:
             results(list): the results of similarity(), or None (after printing
                            the help text) when the input arguments are not usable
        """
        self.parser = argparse.ArgumentParser(
            description="Run the simnet_bow module.",
            prog='hub run simnet_bow',
            usage='%(prog)s',
            add_help=True)

        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required")
        self.arg_config_group = self.parser.add_argument_group(
            title="Config options",
            description=
            "Run configuration for controlling module behavior, not required.")

        self.add_module_config_arg()
        self.add_module_input_arg()

        args = self.parser.parse_args(argvs)

        try:
            input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
            # A tuple is required here: `except A and B` evaluates the boolean
            # expression first and would only catch B.
            self.parser.print_help()
            return None

        results = self.similarity(
            data=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size)

        return results

    def add_module_config_arg(self):
        """
        Add the command config options
        """
        self.arg_config_group.add_argument(
            '--use_gpu',
            type=ast.literal_eval,
            default=False,
            help="whether use GPU for prediction")

        self.arg_config_group.add_argument(
            '--batch_size',
            type=int,
            default=1,
            help="batch size for prediction")

    def add_module_input_arg(self):
        """
        Add the command input options
        """
        self.arg_input_group.add_argument(
            '--input_file',
            type=str,
            default=None,
            help="file contain input data")
        self.arg_input_group.add_argument(
            '--text_1', type=str, default=None, help="text to predict")
        self.arg_input_group.add_argument(
            '--text_2', type=str, default=None, help="text to predict")

    def check_input_data(self, args):
        input_data = {}
        if args.input_file:
            if not os.path.exists(args.input_file):
                print("File %s is not exist." % args.input_file)
                raise RuntimeError
            else:
                input_data = txt_parser.parse(args.input_file, use_strip=True)
        elif args.text_1 and args.text_2:
            if args.text_1.strip() != '' and args.text_2.strip() != '':
                if six.PY2:
                    input_data = {
                        "text_1": [
                            args.text_1.strip().decode(
                                sys_stdin_encoding()).decode("utf8")
                        ],
                        "text_2": [
                            args.text_2.strip().decode(
                                sys_stdin_encoding()).decode("utf8")
                        ]
                    }
                else:
                    input_data = {
                        "text_1": [args.text_1],
                        "text_2": [args.text_2]
                    }
            else:
                print(
                    "ERROR: The input data is inconsistent with expectations.")

        if input_data == {}:
            print("ERROR: The input data is inconsistent with expectations.")
            raise DataFormatError

        return input_data

    def get_vocab_path(self):
        """
        Get the path to the vocabulary whih was used to pretrain
        Returns:
             self.vocab_path(str): the path to vocabulary
        """
        return self.vocab_path


if __name__ == "__main__":
    # Manual smoke test: build the three-slot program, then score a few pairs.
    simnet_bow = SimnetBow()
    inputs, outputs, program = simnet_bow.context(num_slots=3)
    print(inputs)
    print(outputs)

    # Data to be predicted: the same query is paired with three candidates.
    test_text_1 = ["这道题太难了"] * 3
    test_text_2 = ["这道题是上一年的考题", "这道题不简单", "这道题很有意思"]

    inputs = dict(text_1=test_text_1, text_2=test_text_2)
    results = simnet_bow.similarity(data=inputs, batch_size=2)
    print(results)

    # Keep the first candidate whose score is strictly above every earlier
    # one; -1 is the starting threshold.
    max_score, result_text = -1, ""
    for candidate in results:
        if candidate['similarity'] > max_score:
            max_score, result_text = candidate['similarity'], candidate[
                'text_2']

    print("The most matching with the %s is %s" % (test_text_1[0], result_text))