Commit c380150e authored by W wuzewu

Add nlp module

Parent 6acb2dd4
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import re
import six
from typing import Any, List, Tuple
import paddle
import numpy as np
from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
from paddlehub.compat import paddle_utils
from paddlehub.compat.task.transformer_emb_task import TransformerEmbeddingTask
from paddlehub.compat.task.config import RunConfig
from paddlehub.compat.task.reader import ClassifyReader
from paddlehub.module.module import runnable, RunModule
from paddlehub.utils.parser import txt_parser
from paddlehub.utils.utils import sys_stdin_encoding
class DataFormatError(Exception):
def __init__(self, *args):
self.args = args
class NLPBaseModule(RunModule):
def get_vocab_path(self):
'''
        Get the path to the vocabulary which was used to pretrain the model.
        Returns:
            self.vocab_path(str): the path to the vocabulary
'''
return self.vocab_path
class NLPPredictionModule(NLPBaseModule):
def _set_config(self):
'''predictor config setting'''
cpu_config = AnalysisConfig(self.pretrained_model_path)
cpu_config.disable_glog_info()
cpu_config.disable_gpu()
self.cpu_predictor = create_paddle_predictor(cpu_config)
        try:
            _places = os.environ['CUDA_VISIBLE_DEVICES']
            int(_places[0])
            use_gpu = True
        except (KeyError, ValueError, IndexError):
            use_gpu = False
if use_gpu:
gpu_config = AnalysisConfig(self.pretrained_model_path)
gpu_config.disable_glog_info()
gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
self.gpu_predictor = create_paddle_predictor(gpu_config)
    def texts2tensor(self, texts: List[dict]) -> PaddleTensor:
        '''
        Transform the input texts to a PaddleTensor.
        Args:
            texts(list): each element is a dict that must contain a 'processed' key whose value is a list of word ids, such as
                         texts = [{'processed': [23, 89, 43, 906]}]
        Returns:
            tensor(PaddleTensor): tensor with the texts data
        '''
lod = [0]
data = []
for i, text in enumerate(texts):
data += text['processed']
lod.append(len(text['processed']) + lod[i])
tensor = PaddleTensor(np.array(data).astype('int64'))
tensor.name = 'words'
tensor.lod = [lod]
tensor.shape = [lod[-1], 1]
return tensor
    def to_unicode(self, texts: List) -> List:
        '''
        Convert each element of texts from str to unicode in Python 2.7.
        Args:
            texts(list): each element's type is str in Python 2.7
        Returns:
            texts(list): each element's type is unicode in Python 2.7
        '''
if six.PY2:
unicode_texts = []
for text in texts:
if isinstance(text, six.string_types):
                    unicode_texts.append(text.decode(sys_stdin_encoding()))
else:
unicode_texts.append(text)
texts = unicode_texts
return texts
@runnable
def run_cmd(self, argvs: List[Any]):
'''Run as a command'''
self.parser = argparse.ArgumentParser(
description='Run the %s module.' % self.name,
prog='hub run %s' % self.name,
usage='%(prog)s',
add_help=True)
self.arg_input_group = self.parser.add_argument_group(title='Input options', description='Input data. Required')
self.arg_config_group = self.parser.add_argument_group(
title='Config options', description='Run configuration for controlling module behavior, not required.')
self.add_module_config_arg()
self.add_module_input_arg()
args = self.parser.parse_args(argvs)
try:
input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
self.parser.print_help()
return None
results = self.predict(texts=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size)
return results
def add_module_config_arg(self):
'''Add the command config options'''
self.arg_config_group.add_argument(
            '--use_gpu', type=ast.literal_eval, default=False, help='whether to use GPU for prediction')
self.arg_config_group.add_argument('--batch_size', type=int, default=1, help='batch size for prediction')
def add_module_input_arg(self):
'''Add the command input options'''
        self.arg_input_group.add_argument('--input_file', type=str, default=None, help='file containing the input data')
self.arg_input_group.add_argument('--input_text', type=str, default=None, help='text to predict')
def check_input_data(self, args):
input_data = []
if args.input_file:
if not os.path.exists(args.input_file):
raise FileNotFoundError('File %s does not exist.' % args.input_file)
else:
input_data = txt_parser.parse(args.input_file, use_strip=True)
elif args.input_text:
input_data = [args.input_text]
return input_data
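# Usage sketch (hedged): drive run_cmd with an argv-style list. The module name
# 'senta_bilstm' below is illustrative and not part of this changeset; any
# NLPPredictionModule subclass installed through PaddleHub should work the same way.
if __name__ == '__main__':
    import paddlehub as hub

    module = hub.Module(name='senta_bilstm')  # illustrative module name
    results = module.run_cmd(['--input_text', 'the weather is nice today', '--use_gpu', 'False'])
    print(results)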
class TransformerModule(NLPBaseModule):
'''
    Transformer module base class, which can be used by BERT, ERNIE, RoBERTa and so on.
'''
def __init__(self,
name: str = None,
directory: str = None,
module_dir: List = None,
version: str = None,
max_seq_len: int = 128,
**kwargs):
if not directory:
return
super(TransformerModule, self).__init__(
name=name, directory=directory, module_dir=module_dir, version=version, **kwargs)
self.max_seq_len = max_seq_len
def init_pretraining_params(self, exe: paddle.static.Executor, pretraining_params_path: str,
main_program: paddle.static.Program):
        assert os.path.exists(pretraining_params_path), '[{}] cannot be found.'.format(pretraining_params_path)
def existed_params(var):
if not isinstance(var, paddle.fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
paddle.io.load(
executor=exe,
model_path=pretraining_params_path,
program=main_program,
var_list=main_program.all_parameters())
def param_prefix(self) -> str:
return '@HUB_%s@' % self.name
def context(
self,
max_seq_len: int = None,
trainable: bool = True,
num_slots: int = 1,
) -> Tuple[dict, dict, paddle.static.Program]:
        '''
        Get the inputs, outputs and program of the pre-trained module.
        Args:
            max_seq_len (int): It will limit the total sequence returned so that it has a maximum length.
            trainable (bool): Whether to fine-tune the pre-trained module parameters or not.
            num_slots(int): The number of data slots fed to the model, selected from the following options:
                - 1(default): one text is fed to the model, e.g. the module is used for a sentence classification task.
                - 2: two texts are fed to the model, e.g. the module is used for a text matching task (point-wise).
                - 3: three texts are fed to the model, e.g. the module is used for a text matching task (pair-wise).
        Returns: inputs, outputs, program.
            inputs is a dict with keys named input_ids, position_ids, segment_ids, input_mask and task_ids.
            outputs is a dict with two keys named pooled_output and sequence_output.
        '''
assert num_slots >= 1 and num_slots <= 3, 'num_slots must be 1, 2, or 3, but the input is %d' % num_slots
if not max_seq_len:
max_seq_len = self.max_seq_len
assert max_seq_len <= self.MAX_SEQ_LEN and max_seq_len >= 1, 'max_seq_len({}) should be in the range of [1, {}]'.format(
max_seq_len, self.MAX_SEQ_LEN)
module_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(module_program, startup_program):
with paddle.fluid.unique_name.guard():
input_ids = paddle.data(name='input_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids = paddle.data(name='position_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids = paddle.data(name='segment_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask = paddle.data(name='input_mask', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output, sequence_output = self.net(input_ids, position_ids, segment_ids, input_mask)
data_list = [(input_ids, position_ids, segment_ids, input_mask)]
output_name_list = [(pooled_output.name, sequence_output.name)]
if num_slots > 1:
input_ids_2 = paddle.data(
name='input_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids_2 = paddle.data(
name='position_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids_2 = paddle.data(
name='segment_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask_2 = paddle.data(
name='input_mask_2', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output_2, sequence_output_2 = self.net(input_ids_2, position_ids_2, segment_ids_2,
input_mask_2)
data_list.append((input_ids_2, position_ids_2, segment_ids_2, input_mask_2))
output_name_list.append((pooled_output_2.name, sequence_output_2.name))
if num_slots > 2:
input_ids_3 = paddle.data(
name='input_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids_3 = paddle.data(
name='position_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids_3 = paddle.data(
name='segment_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask_3 = paddle.data(
name='input_mask_3', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output_3, sequence_output_3 = self.net(input_ids_3, position_ids_3, segment_ids_3,
input_mask_3)
data_list.append((input_ids_3, position_ids_3, segment_ids_3, input_mask_3))
output_name_list.append((pooled_output_3.name, sequence_output_3.name))
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
# To be compatible with the module v1
vars = filter(
lambda var: var not in [
'input_ids', 'position_ids', 'segment_ids', 'input_mask', 'input_ids_2', 'position_ids_2',
'segment_ids_2', 'input_mask_2', 'input_ids_3', 'position_ids_3', 'segment_ids_3', 'input_mask_3'
], list(module_program.global_block().vars.keys()))
paddle_utils.add_vars_prefix(program=module_program, prefix=self.param_prefix(), vars=vars)
self.init_pretraining_params(exe, self.params_path, main_program=module_program)
self.params_layer = {}
for param in module_program.global_block().iter_parameters():
param.trainable = trainable
match = re.match(r'.*layer_(\d+).*', param.name)
if match:
# layer num begins from 0
layer = match.group(1)
self.params_layer[param.name] = int(layer)
inputs = {}
outputs = {}
for index, data in enumerate(data_list):
if index == 0:
inputs['input_ids'] = data[0]
inputs['position_ids'] = data[1]
inputs['segment_ids'] = data[2]
inputs['input_mask'] = data[3]
outputs['pooled_output'] = module_program.global_block().vars[self.param_prefix() +
output_name_list[0][0]]
outputs['sequence_output'] = module_program.global_block().vars[self.param_prefix() +
output_name_list[0][1]]
else:
inputs['input_ids_%s' % (index + 1)] = data[0]
inputs['position_ids_%s' % (index + 1)] = data[1]
inputs['segment_ids_%s' % (index + 1)] = data[2]
inputs['input_mask_%s' % (index + 1)] = data[3]
outputs['pooled_output_%s' % (index + 1)] = module_program.global_block().vars[
self.param_prefix() + output_name_list[index][0]]
outputs['sequence_output_%s' % (index + 1)] = module_program.global_block().vars[
self.param_prefix() + output_name_list[index][1]]
return inputs, outputs, module_program
def get_embedding(self, texts: List[str], max_seq_len: int = 512, use_gpu: bool = False, batch_size: int = 1):
'''
        Get the pooled_output and sequence_output of the input texts.
        Warning: this method depends on the Paddle Inference Library, so it may not work properly in PaddlePaddle <= 1.6.2.
        Args:
            texts (list): each element is a text sample; each sample includes text_a and text_b, where text_b can be omitted,
                for example: [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
            max_seq_len (int): the max sequence length.
            use_gpu (bool): whether to use the GPU, default False.
            batch_size (int): the data batch size, default 1.
        Returns:
            pooled_outputs(list): each element is a numpy array holding the pooled feature of one text sample.
            sequence_outputs(list): each element is a numpy array holding the token-level features of one text sample.
'''
if not hasattr(self,
'emb_job') or self.emb_job['batch_size'] != batch_size or self.emb_job['use_gpu'] != use_gpu:
inputs, outputs, program = self.context(trainable=True, max_seq_len=max_seq_len)
reader = ClassifyReader(
dataset=None,
vocab_path=self.get_vocab_path(),
max_seq_len=max_seq_len,
                sp_model_path=self.get_spm_path(),
                word_dict_path=self.get_word_dict_path())
feed_list = [
inputs['input_ids'].name,
inputs['position_ids'].name,
inputs['segment_ids'].name,
inputs['input_mask'].name,
]
pooled_feature, seq_feature = outputs['pooled_output'], outputs['sequence_output']
config = RunConfig(use_data_parallel=False, use_cuda=use_gpu, batch_size=batch_size)
self.emb_job = {}
self.emb_job['task'] = TransformerEmbeddingTask(
pooled_feature=pooled_feature,
seq_feature=seq_feature,
feed_list=feed_list,
data_reader=reader,
config=config,
)
self.emb_job['batch_size'] = batch_size
self.emb_job['use_gpu'] = use_gpu
return self.emb_job['task'].predict(data=texts, return_result=True, accelerate_mode=True)
def get_spm_path(self) -> str:
if hasattr(self, 'spm_path'):
return self.spm_path
return None
def get_word_dict_path(self) -> str:
if hasattr(self, 'word_dict_path'):
return self.word_dict_path
return None
def get_params_layer(self) -> dict:
if not hasattr(self, 'params_layer'):
raise AttributeError('The module context has not been initialized. '
'Please call context() before using get_params_layer')
return self.params_layer
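# Usage sketch (hedged) for TransformerModule.context() and get_embedding(). It
# assumes a pretrained transformer module such as 'ernie' has been installed via
# PaddleHub; the module name is illustrative, not defined in this changeset.
if __name__ == '__main__':
    import paddlehub as hub

    module = hub.Module(name='ernie')  # illustrative module name
    inputs, outputs, program = module.context(max_seq_len=128, trainable=False)
    print(sorted(inputs.keys()))  # ['input_ids', 'input_mask', 'position_ids', 'segment_ids']
    # Each sample is [text_a] or [text_a, text_b]; per the TransformerEmbeddingTask
    # post-processing, results hold one [pooled, sequence] pair per sample.
    results = module.get_embedding(texts=[['Hello world'], ['text a', 'text b']], use_gpu=False)
    print(len(results))  # 2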
This diff is collapsed.
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Mask, padding and batching.'''
from typing import List, Union
import numpy as np
def pad_batch_data(insts: List,
pad_idx: int = 0,
max_seq_len: int = 128,
return_pos: bool = False,
return_input_mask: bool = False,
return_max_len: bool = False,
return_num_token: bool = False,
return_seq_lens: bool = False) -> Union[List, np.ndarray]:
'''
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
'''
return_list = []
#max_len = max(len(inst) for inst in insts)
max_len = max_seq_len
    # Any token included in the dict can be used for padding, since the padding
    # tokens' loss will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array([list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype('int64').reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_pos.astype('int64').reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype('float32')]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype('int64').reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
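# Minimal runnable example of pad_batch_data: two instances padded to length 6,
# returning the padded ids, the attention input mask and the sequence lengths.
if __name__ == '__main__':
    demo_insts = [[5, 6, 7], [8, 9]]
    padded, mask, seq_lens = pad_batch_data(
        demo_insts, pad_idx=0, max_seq_len=6, return_input_mask=True, return_seq_lens=True)
    print(padded.shape)      # (2, 6, 1)
    print(mask[1].ravel())   # [1. 1. 0. 0. 0. 0.]
    print(seq_lens.ravel())  # [3 2]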
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class RunConfig(object):
''' This class specifies the configurations for PaddleHub to finetune '''
def __init__(self,
log_interval: int = 10,
eval_interval: int = 100,
use_data_parallel: bool = True,
save_ckpt_interval: int = None,
use_cuda: bool = True,
checkpoint_dir: str = None,
num_epoch: int = 1,
batch_size: int = 32):
''' Construct finetune Config '''
self.log_interval = log_interval
self.eval_interval = eval_interval
self.save_ckpt_interval = save_ckpt_interval
self.use_cuda = use_cuda
self.num_epoch = num_epoch
self.batch_size = batch_size
self.use_data_parallel = use_data_parallel
if checkpoint_dir is None:
now = int(time.time())
time_str = time.strftime('%Y%m%d%H%M%S', time.localtime(now))
self.checkpoint_dir = 'ckpt_' + time_str
else:
self.checkpoint_dir = checkpoint_dir
def __repr__(self):
return 'config with num_epoch={}, batch_size={}, use_cuda={}, checkpoint_dir={} '.format(
self.num_epoch, self.batch_size, self.use_cuda, self.checkpoint_dir)
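# Short example of constructing a RunConfig; the values below are illustrative.
if __name__ == '__main__':
    demo_config = RunConfig(use_cuda=False, num_epoch=3, batch_size=16, checkpoint_dir='ckpt_demo')
    print(demo_config)  # config with num_epoch=3, batch_size=16, use_cuda=False, checkpoint_dir=ckpt_demo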
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from collections import OrderedDict
from typing import Callable
class TaskHooks(object):
    '''TaskHooks manages the handler functions that run on specific task events.'''
def __init__(self):
self._registered_hooks = {
'build_env_start_event': OrderedDict(),
'build_env_end_event': OrderedDict(),
'finetune_start_event': OrderedDict(),
'finetune_end_event': OrderedDict(),
'predict_start_event': OrderedDict(),
'predict_end_event': OrderedDict(),
'eval_start_event': OrderedDict(),
'eval_end_event': OrderedDict(),
'log_interval_event': OrderedDict(),
'save_ckpt_interval_event': OrderedDict(),
'eval_interval_event': OrderedDict(),
'run_step_event': OrderedDict(),
}
self._hook_params_num = {
'build_env_start_event': 1,
'build_env_end_event': 1,
'finetune_start_event': 1,
'finetune_end_event': 2,
'predict_start_event': 1,
'predict_end_event': 2,
'eval_start_event': 1,
'eval_end_event': 2,
'log_interval_event': 2,
'save_ckpt_interval_event': 1,
'eval_interval_event': 1,
'run_step_event': 2,
}
def add(self, hook_type: str, name: str = None, func: Callable = None):
'''
        Add a handler function for a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name, default None
            func (func): the handler function, default None
'''
if not func or not callable(func):
            raise TypeError('The hook function is empty or not callable')
        if name is None:
name = 'hook_%s' % id(func)
# check validity
if not isinstance(name, str) or name.strip() == '':
raise TypeError('The hook name must be a non-empty string')
if hook_type not in self._registered_hooks:
raise ValueError('hook_type: %s does not exist' % (hook_type))
if name in self._registered_hooks[hook_type]:
            raise ValueError('name: %s already exists in hook_type: %s, use the modify method to change it' % (name, hook_type))
else:
args_num = len(inspect.getfullargspec(func).args)
if args_num != self._hook_params_num[hook_type]:
                raise ValueError('The hook function for hook_type: %s must take exactly %i parameters' %
                                 (hook_type, self._hook_params_num[hook_type]))
self._registered_hooks[hook_type][name] = func
def delete(self, hook_type: str, name: str):
'''
        Delete the handler function of a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
'''
if self.exist(hook_type, name):
del self._registered_hooks[hook_type][name]
else:
raise ValueError(
'No hook_type: %s exists or name: %s does not exist in hook_type: %s' % (hook_type, name, hook_type))
def modify(self, hook_type: str, name: str, func: Callable):
'''
        Modify the handler function of a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
            func (func): the new handler function
'''
if not (isinstance(name, str) and callable(func)):
raise TypeError('The hook name must be a string, and the hook function must be a function')
if self.exist(hook_type, name):
self._registered_hooks[hook_type][name] = func
else:
raise ValueError(
'No hook_type: %s exists or name: %s does not exist in hook_type: %s' % (hook_type, name, hook_type))
def exist(self, hook_type: str, name: str) -> bool:
'''
        Check whether the handler function of a specific event exists.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
Returns:
bool: True or False
'''
if hook_type not in self._registered_hooks \
or name not in self._registered_hooks[hook_type]:
return False
else:
return True
def info(self, show_default: bool = False) -> str:
'''
        Get the hooks information, including the source code.
        Args:
            show_default (bool): whether to show the information of the PaddleHub default hooks, default False
Returns:
str: the formatted string of the hooks information
'''
# formatted output the source code
ret = ''
for hook_type, hooks in self._registered_hooks.items():
already_print_type = False
for name, func in hooks.items():
if name == 'default' and not show_default:
continue
if not already_print_type:
ret += 'hook_type: %s{\n' % hook_type
already_print_type = True
source = inspect.getsource(func)
ret += ' name: %s{\n' % name
for line in source.split('\n'):
ret += ' %s\n' % line
ret += ' }\n'
if already_print_type:
ret += '}\n'
if not ret:
            ret = 'No customized hooks have been defined; you can set show_default=True to see the default hooks information.'
return ret
def __getitem__(self, hook_type: str) -> OrderedDict:
return self._registered_hooks[hook_type]
def __repr__(self) -> str:
return self.info(show_default=False)
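# Runnable sketch of registering and removing a hook. The handler's parameter
# count must match _hook_params_num: 'run_step_event' expects exactly 2.
if __name__ == '__main__':
    hooks = TaskHooks()

    def log_step(task, run_state):
        print('one step finished')

    hooks.add('run_step_event', name='log_step', func=log_step)
    print(hooks.exist('run_step_event', 'log_step'))  # True
    hooks.delete('run_step_event', 'log_step')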
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
from typing import Callable, Generator, Generic, List
import numpy as np
from paddlehub.utils.log import logger
from paddlehub.compat.task import tokenization
from paddlehub.compat.task.batch import pad_batch_data
class InputExample(object):
'''
    Input data structure for BERT/ERNIE. It can represent single-sequence tasks
    such as text classification and sequence labeling, as well as sequence-pair
    tasks such as dialog tasks.
'''
def __init__(self, guid: int, text_a: str, text_b: str = None, label: str = None):
        '''Constructs an InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Must be specified only for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
'''
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def __str__(self):
if self.text_b is None:
return 'text={}\tlabel={}'.format(self.text_a, self.label)
else:
            return 'text_a={}\ttext_b={}\tlabel={}'.format(self.text_a, self.text_b, self.label)
class BaseReader(object):
def __init__(self, dataset: Generic, random_seed: int = None):
self.dataset = dataset
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
np.random.seed(random_seed)
# generate label map
self.label_map = {}
try:
for index, label in enumerate(self.dataset.get_labels()):
self.label_map[label] = index
logger.info('Dataset label map = {}'.format(self.label_map))
        except (AttributeError, TypeError):
            # some datasets, like SQuAD, have label_list=None
            logger.info('Dataset is None or it does not have any labels, label map = {}'.format(self.label_map))
def get_train_examples(self) -> List:
return self.dataset.get_train_examples()
def get_dev_examples(self) -> List:
return self.dataset.get_dev_examples()
def get_test_examples(self) -> List:
return self.dataset.get_test_examples()
def data_generator(self) -> Generic:
raise NotImplementedError
class BaseNLPReader(BaseReader):
def __init__(self,
vocab_path: str,
dataset: Generic = None,
max_seq_len: int = 512,
do_lower_case: bool = True,
random_seed: int = None,
sp_model_path: str = None,
word_dict_path: str = None,
in_tokens: bool = False):
super(BaseNLPReader, self).__init__(dataset, random_seed)
self.max_seq_len = max_seq_len
if sp_model_path and word_dict_path:
self.tokenizer = tokenization.WSSPTokenizer(vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
else:
self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.pad_id = self.vocab['[PAD]']
self.cls_id = self.vocab['[CLS]']
self.sep_id = self.vocab['[SEP]']
self.mask_id = self.vocab['[MASK]']
self.in_tokens = in_tokens
self.Record_With_Label_Id = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
self.Record_Wo_Label_Id = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids'])
def _truncate_seq_pair(self, tokens_a: List, tokens_b: List, max_length: int):
'''Truncates a sequence pair in place to the maximum length.'''
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def _convert_example_to_record(self,
example: InputExample,
max_seq_length: int,
tokenizer: Generic,
phase: str = None) -> namedtuple:
'''Converts a single `Example` into a single `Record`.'''
text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None
if example.text_b is not None:
#if 'text_b' in example._fields:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with '- 3'
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with '- 2'
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where 'type_ids' are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the 'sentence vector'. Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
text_type_ids = []
tokens.append('[CLS]')
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
tokens.append('[SEP]')
text_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
text_type_ids.append(1)
tokens.append('[SEP]')
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if self.label_map:
if example.label not in self.label_map:
                raise KeyError('example.label = {} is not in the label_map'.format(example.label))
label_id = self.label_map[example.label]
else:
label_id = example.label
if phase != 'predict':
record = self.Record_With_Label_Id(
token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids, label_id=label_id)
else:
record = self.Record_Wo_Label_Id(
token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids)
return record
def _pad_batch_records(self, batch_records: List, phase: str):
raise NotImplementedError
def _prepare_batch_data(self, examples: List, batch_size: int, phase: str = None) -> Generator:
'''generate batch records'''
batch_records, max_len = [], 0
for index, example in enumerate(examples):
if phase == 'train':
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer, phase)
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records, phase)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records, phase)
def data_generator(self,
batch_size: int = 1,
phase: str = 'train',
shuffle: bool = True,
data: List = None,
return_list: bool = True) -> Callable:
if phase != 'predict' and not self.dataset:
            raise ValueError('The dataset is None, which is not allowed.')
if phase == 'train':
shuffle = True
examples = self.get_train_examples()
self.num_examples['train'] = len(examples)
elif phase == 'val' or phase == 'dev':
shuffle = False
examples = self.get_dev_examples()
self.num_examples['dev'] = len(examples)
elif phase == 'test':
shuffle = False
examples = self.get_test_examples()
self.num_examples['test'] = len(examples)
elif phase == 'predict':
shuffle = False
examples = []
seq_id = 0
for item in data:
# set label in order to run the program
if self.dataset:
label = list(self.label_map.keys())[0]
else:
label = 0
if len(item) == 1:
item_i = InputExample(guid=seq_id, text_a=item[0], label=label)
elif len(item) == 2:
item_i = InputExample(guid=seq_id, text_a=item[0], text_b=item[1], label=label)
else:
                    raise ValueError('Each input sample must contain 1 or 2 texts, but got %d.' % len(item))
examples.append(item_i)
seq_id += 1
else:
raise ValueError('Unknown phase, which should be in [\'train\', \'dev\', \'test\', \'predict\'].')
def wrapper():
if shuffle:
np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase):
if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper
class ClassifyReader(BaseNLPReader):
def _pad_batch_records(self, batch_records: List, phase: str = None) -> List:
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(batch_text_type_ids, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(batch_position_ids, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)
return_list = [padded_token_ids, padded_position_ids, padded_text_type_ids, input_mask, batch_seq_lens]
if phase != 'predict':
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype('int64').reshape([-1, 1])
return_list += [batch_labels]
return return_list
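# Prediction-phase sketch (hedged) for ClassifyReader. 'vocab.txt' is a placeholder
# path for a real BERT/ERNIE-style vocabulary file containing [PAD]/[CLS]/[SEP]/[MASK].
if __name__ == '__main__':
    reader = ClassifyReader(vocab_path='vocab.txt', dataset=None, max_seq_len=128)
    generator = reader.data_generator(
        batch_size=2, phase='predict', data=[['text a'], ['text a', 'text b']])
    for batch in generator():
        # with return_list=True, each yield wraps
        # [token_ids, position_ids, text_type_ids, input_mask, seq_lens]
        print(len(batch[0]))  # 5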
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import time
from typing import Any
import paddle
class RunState(object):
'''
RunState is used to save the result of every running step
Args:
        length (int): the number of fetch results
'''
def __init__(self, length: int):
self.run_time_begin = time.time()
self.run_step = 0
self.run_examples = 0
self.run_results = [0] * length
self.run_time_used = 0
self.run_speed = 0.0
def __add__(self, other):
self.run_step += other.run_step
self.run_examples += other.run_examples
for index in range(len(self.run_results)):
self.run_results[index] += other.run_results[index]
return self
def update(self):
self.run_time_used = time.time() - self.run_time_begin
self.run_speed = self.run_step / self.run_time_used
return self
class RunEnv(object):
'''RunEnv saves the running environment of the train/dev/predict phase, including program, reader, metrics and so on.'''
def __init__(self):
self.current_epoch = 0
self.current_step = 0
self.main_program = None
self.start_program = None
self.main_program_compiled = None
self.py_reader = None
self.generator = None
self.loss = None
self.labels = None
self.metrics = None
self.is_inititalized = False
self.UNG = copy.deepcopy(paddle.fluid.unique_name.generator)
def __setattr__(self, key: str, value: Any):
self.__dict__[key] = value
def __getattr__(self, key: str) -> Any:
return self.__dict__[key]
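# Small runnable example of aggregating RunState objects across steps.
if __name__ == '__main__':
    total = RunState(length=1)
    step = RunState(length=1)
    step.run_step, step.run_examples, step.run_results = 1, 32, [0.5]
    total += step    # __add__ accumulates steps, examples and fetch results
    total.update()   # refreshes run_time_used and run_speed
    print(total.run_step, total.run_examples, total.run_results)  # 1 32 [0.5]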
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tokenization classes.'''
import collections
import io
import pickle
import unicodedata
from typing import List, Union
def convert_to_unicode(text: Union[str, bytes]) -> str:
'''Converts `text` to Unicode (if it's not already), assuming utf-8 input.'''
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported type: {}'.format(type(text)))
def load_vocab(vocab_file: str) -> List:
'''Loads a vocabulary file into a dictionary.'''
vocab = collections.OrderedDict()
with io.open(vocab_file, 'r', encoding='UTF-8') as file:
for num, line in enumerate(file):
items = convert_to_unicode(line.strip()).split('\t')
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
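# Runnable example of the vocabulary format accepted by load_vocab: one token per
# line, with an optional tab-separated index (the line number is used otherwise).
if __name__ == '__main__':
    import os
    import tempfile

    with tempfile.NamedTemporaryFile('w', suffix='.vocab', delete=False, encoding='utf-8') as vocab_file:
        vocab_file.write('[PAD]\n[UNK]\nhello\nworld\n')
        vocab_path = vocab_file.name
    demo_vocab = load_vocab(vocab_path)
    print(demo_vocab['hello'])  # 2 (its line number, since no explicit index is given)
    os.remove(vocab_path)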
def convert_by_vocab(vocab: collections.OrderedDict, items: List[str]) -> List:
'''Converts a sequence of [tokens|ids] using the vocab.'''
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab: collections.OrderedDict, tokens: List[str]) -> List:
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text: str) -> List:
    '''Runs basic whitespace cleaning and splitting on a piece of text.'''
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
'''Runs end-to-end tokenziation.'''
def __init__(self, vocab_file: str, do_lower_case: bool = True, use_sentence_piece_vocab: bool = False):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.use_sentence_piece_vocab = use_sentence_piece_vocab
self.wordpiece_tokenizer = WordpieceTokenizer(
vocab=self.vocab, use_sentence_piece_vocab=self.use_sentence_piece_vocab)
def tokenize(self, text: str) -> List:
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens: List) -> List:
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids: List) -> List:
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object):
def __init__(self, vocab_file: str, sp_model_dir: str, word_dict: str, ws: bool = True, lower: bool = True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.ws = ws
self.lower = lower
self.dict = pickle.load(open(word_dict, 'rb'))
import sentencepiece as spm
self.sp_model = spm.SentencePieceProcessor()
self.window_size = 5
self.sp_model.Load(sp_model_dir)
def cut(self, chars: List) -> List:
words = []
idx = 0
while idx < len(chars):
matched = False
for i in range(self.window_size, 0, -1):
cand = chars[idx:idx + i]
if cand in self.dict:
words.append(cand)
matched = True
break
if not matched:
i = 1
words.append(chars[idx])
idx += i
return words
def tokenize(self, text: Union[str, bytes], unk_token: str = '[UNK]') -> List:
text = convert_to_unicode(text)
if self.ws:
text = [s for s in self.cut(text) if s != ' ']
else:
text = text.split(' ')
if self.lower:
text = [s.lower() for s in text]
text = ' '.join(text)
tokens = self.sp_model.EncodeAsPieces(text)
in_vocab_tokens = []
for token in tokens:
if token in self.vocab:
in_vocab_tokens.append(token)
else:
in_vocab_tokens.append(unk_token)
return in_vocab_tokens
def convert_tokens_to_ids(self, tokens: List) -> List:
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids: List) -> List:
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
'''Runs basic tokenization (punctuation splitting, lower casing, etc.).'''
def __init__(self, do_lower_case: bool = True):
'''Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
'''
self.do_lower_case = do_lower_case
def tokenize(self, text: Union[str, bytes]) -> List:
'''Tokenizes a piece of text.'''
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens
def _run_strip_accents(self, text: str) -> str:
'''Strips accents from a piece of text.'''
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)
def _run_split_on_punc(self, text: str) -> List:
'''Splits punctuation on a piece of text.'''
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return [''.join(x) for x in output]
def _tokenize_chinese_chars(self, text: str) -> str:
'''Adds whitespace around any CJK character.'''
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)
def _is_chinese_char(self, cp: int) -> bool:
'''Checks whether CP is the codepoint of a CJK character.'''
# This defines a 'chinese character' as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or (cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text: str) -> str:
'''Performs invalid character removal and whitespace cleanup on text.'''
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)
class WordpieceTokenizer(object):
'''Runs WordPiece tokenziation.'''
def __init__(self,
vocab: collections.OrderedDict,
unk_token: str = '[UNK]',
max_input_chars_per_word: int = 100,
use_sentence_piece_vocab: bool = False):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
self.use_sentence_piece_vocab = use_sentence_piece_vocab
def tokenize(self, text: Union[str, bytes]) -> List:
'''Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = 'unaffable'
output = ['un', '##aff', '##able']
Args:
text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
'''
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start == 0 and self.use_sentence_piece_vocab:
substr = u'\u2581' + substr
if start > 0 and not self.use_sentence_piece_vocab:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
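# Runnable toy example of the greedy longest-match-first algorithm above, using a
# four-entry vocabulary; it reproduces the docstring's 'unaffable' example.
if __name__ == '__main__':
    demo_vocab = collections.OrderedDict([('un', 0), ('##aff', 1), ('##able', 2), ('[UNK]', 3)])
    wordpiece = WordpieceTokenizer(vocab=demo_vocab)
    print(wordpiece.tokenize('unaffable'))  # ['un', '##aff', '##able']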
def _is_whitespace(char: str) -> bool:
'''Checks whether `chars` is a whitespace character.'''
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False
def _is_control(char: str) -> bool:
'''Checks whether `chars` is a control character.'''
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False
def _is_punctuation(char: str) -> bool:
'''Checks whether `chars` is a punctuation character.'''
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as '^', '$', and '`' are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False
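# Runnable example of BasicTokenizer: it lower-cases, strips accents, splits
# punctuation and isolates CJK characters (placed here so the helper functions
# above are already defined when the file runs as a script).
if __name__ == '__main__':
    basic = BasicTokenizer(do_lower_case=True)
    print(basic.tokenize('Hello, 世界!'))  # ['hello', ',', '世', '界', '!']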
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Generic, List
import paddle
import numpy as np
from paddlehub.compat.task.config import RunConfig
from paddlehub.compat.task.base_task import BaseTask
from paddlehub.compat.task.task_utils import RunState
class TransformerEmbeddingTask(BaseTask):
def __init__(self,
pooled_feature: paddle.Variable,
seq_feature: paddle.Variable,
feed_list: List[str],
data_reader: Generic,
config: RunConfig = None):
main_program = pooled_feature.block.program
super(TransformerEmbeddingTask, self).__init__(
main_program=main_program, config=config, feed_list=feed_list, data_reader=data_reader, metrics_choices=[])
self.pooled_feature = pooled_feature
self.seq_feature = seq_feature
def _build_net(self) -> List[paddle.Variable]:
        # ClassifyReader will return the sequence length of an input text
self.seq_len = paddle.data(name='seq_len', shape=[1], dtype='int64', lod_level=0)
return [self.pooled_feature, self.seq_feature]
def _postprocessing(self, run_states: List[RunState]) -> List[List[np.ndarray]]:
results = []
for batch_state in run_states:
batch_result = batch_state.run_results
batch_pooled_features = batch_result[0]
batch_seq_features = batch_result[1]
for i in range(len(batch_pooled_features)):
results.append([batch_pooled_features[i], batch_seq_features[i]])
return results
@property
def feed_list(self) -> List[str]:
feed_list = [varname for varname in self._base_feed_list] + [self.seq_len.name]
return feed_list
@property
def fetch_list(self) -> List[str]:
fetch_list = [output.name for output in self.outputs] + [self.seq_len.name]
return fetch_list