fix save predict model

2efeb39b · xixiaoyao · d44b6381 · 2efeb39b · 2efeb39b · d44b6381
20 changed files
--- a/.gitignore
+++ b/.gitignore
 *.pyc
 __pycache__
 pretrain_model
+pretrain
+output*
 output_model
 build
 dist

--- a/demo/demo3/run.py
+++ b/demo/demo3/run.py
@@ -15,7 +15,6 @@ if __name__ == '__main__':
    config = json.load(open('./pretrain/ernie/ernie_config.json'))
    # ernie = palm.backbone.ERNIE(...)
    ernie = palm.backbone.ERNIE.from_config(config)
-    # pred_ernie = palm.backbone.ERNIE.from_config(config, phase='pred')

    # cls_reader2 = palm.reader.cls(train_file_topic, vocab_path, batch_size, max_seqlen)
    # cls_reader3 = palm.reader.cls(train_file_subj, vocab_path, batch_size, max_seqlen)
@@ -30,7 +29,6 @@ if __name__ == '__main__':
    print(cls_reader.outputs_attr)
    # 创建任务头（task head），如分类、匹配、机器阅读理解等。每个任务头有跟该任务相关的必选/可选参数。注意，任务头与reader是解耦合的，只要任务头依赖的数据集侧的字段能被reader提供，那么就是合法的
    cls_head = palm.head.Classify(4, 1024, 0.1)
-    # cls_pred_head = palm.head.Classify(4, 1024, 0.1, phase='pred')

    # 根据reader和任务头来创建一个训练器trainer，trainer代表了一个训练任务，内部维护着训练进程、和任务的关键信息，并完成合法性校验，该任务的模型保存、载入等相关规则控制
    trainer = palm.Trainer('senti_cls', cls_reader, cls_head)
@@ -64,7 +62,12 @@ if __name__ == '__main__':

    # print(trainer.train_one_step(next(iterator_fn())))
    # trainer.train_one_epoch()
-    trainer.train(iterator_fn, print_steps=1, save_steps=5, save_path='outputs/ckpt')
+    # Build the prediction backbone and head so that the predict model can be saved during training.
+    pred_ernie = palm.backbone.ERNIE.from_config(config, phase='pred')
+    cls_pred_head = palm.head.Classify(4, 1024, phase='pred')
+    trainer.build_predict_head(cls_pred_head, pred_ernie)
+
+    trainer.train(iterator_fn, print_steps=1, save_steps=5, save_path='outputs', save_type='ckpt,predict')
    # trainer.save()



--- a/interface.py
+++ b/interface.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""v1.1"""
-
-class reader(object):
-    """interface of data manager."""
-
-    def __init__(self, config):
-        assert isinstance(config, dict)
-
-    # @property
-    # def inputs_attr(self):
-    #     """描述reader输入对象的属性，包含各个对象的名字、shape以及数据类型。当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1.
-    #     Return:
-    #         dict类型。对各个输入对象的属性描述。例如，
-    #         对于文本分类任务，可能需要包含输入文本和所属标签的id
-    #             {"text": ([], 'str'),
-    #              "label": ([], 'int')}
-    #         对于标注任务，可能需要输入词序列和对应的标签
-    #             {"tokens", ([-1], 'str'),
-    #              "tags", ([-1], 'str')}
-    #         对于机器阅读理解任务，可能需要包含上下文、问题、回答、答案区域的起止位置等
-    #             {"paragraph", ([], 'str'),
-    #              "question", ([], 'str'),
-    #              "start_position", ([], 'int')
-    #         """
-    #     raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """描述reader输出对象（被yield出的对象）的属性，包含各个对象的名字、shape以及数据类型。当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1。
-        注意：当使用mini-batch梯度下降学习策略时，，应为常规的输入对象设置batch_size维度（一般为-1）
-        Return:
-            dict类型。对各个输入对象的属性描述。例如，
-            对于文本分类和匹配任务，yield的输出内容可能包含如下的对象（下游backbone和task可按需访问其中的对象）
-                {"token_ids": ([-1, max_len], 'int64'),
-                 "input_ids": ([-1, max_len], 'int64'),
-                 "segment_ids": ([-1, max_len], 'int64'),
-                 "input_mask": ([-1, max_len], 'float32'),
-                 "label": ([-1], 'int')}
-        """
-        raise NotImplementedError()
-
-    # def parse_line(self):
-    #     """框架内部使用字典描述每个样本，字典的key为inputs_attr，value为每个input对应的符合attr描述的值。
-    #         该函数负责将文本行解析成符合inputs_attr描述的字典类型的样本。默认的parse_line方法会读取json格式的数据集文件，数据集的每一行为json格式描述的样本。
-    #         用户可通过对该方法的继承改写来适配不同格式的数据集，例如csv格式甚至tfrecord文件。
-    #         """
-    #     raise NotImplementedError()
-    # 
-    # def tokenize(self, line):
-    #     """框架中内置了word piece tokenizer等分词器，用户可通过修改tokenizer超参数来制定使用的分词器，若内置的分词器均无法满足需求，用户可通过对该方法的继承改写来自定义分词器。
-    #         Args:
-    #             - line: a unicode string. 
-    #         Return:
-    #             a list of tokens
-    #         """
-    #     raise NotImplementedError()
-    
-    def iterator(self):
-        """数据集遍历接口，注意，当数据集遍历到尾部时该接口应自动完成指针重置，即重新从数据集头部开始新的遍历。
-        Yield:
-            (dict) elements that meet the requirements in output_templete
-        """
-        raise NotImplementedError()
-
-    @property
-    def num_examples(self):
-        """数据集中的样本数量，即每个epoch中iterator所生成的样本数。注意，使用滑动窗口等可能导致数据集样本数发生变化的策略时，该接口应返回runtime阶段的实际样本数。"""
-        raise NotImplementedError()
-
-
-
-class backbone(object):
-    """interface of backbone model."""
-
-    def __init__(self, config, phase):
-        """
-        Args:
-            config: dict类型。描述了 多任务配置文件+预训练模型配置文件 中定义超参数
-            phase: str类型。运行阶段，目前支持train和predict
-            """
-        assert isinstance(config, dict)
-
-    @property
-    def inputs_attr(self):
-        """描述backbone从reader处需要得到的输入对象的属性，包含各个对象的名字、shape以及数据类型。当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1。
-        Return:
-            dict类型。对各个输入对象的属性描述。例如，
-            对于文本分类和匹配任务，bert backbone依赖的reader对象主要包含如下的对象
-                {"token_ids": ([-1, max_len], 'int64'),
-                 "input_ids": ([-1, max_len], 'int64'),
-                 "segment_ids": ([-1, max_len], 'int64'),
-                 "input_mask": ([-1, max_len], 'float32')}"""
-        raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """描述backbone输出对象的属性，包含各个对象的名字、shape以及数据类型。当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1。
-        Return:
-            dict类型。对各个输出对象的属性描述。例如，
-            对于文本分类和匹配任务，bert backbone的输出内容可能包含如下的对象
-                {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
-                 "sentence_emb": ([-1, hidden_size], 'float32'),
-                 "sim_vec": ([-1, hidden_size], 'float32')}""" 
-        raise NotImplementedError()
-
-    def build(self, inputs):
-        """建立backbone的计算图。将符合inputs_attr描述的静态图Variable输入映射成符合outputs_attr描述的静态图Variable输出。
-        Args:
-            inputs: dict类型。字典中包含inputs_attr中的对象名到计算图Variable的映射，inputs中至少会包含inputs_attr中定义的对象
-        Return:
-           需要输出的计算图变量，输出对象会被加入到fetch_list中，从而在每个训练/推理step时得到runtime的计算结果，该计算结果会被传入postprocess方法中供用户处理。
-            """
-        raise NotImplementedError()
-
-
-
-
-class task_paradigm(object):
-
-    def __init__(self, config, phase, backbone_config):
-        """
-            config: dict类型。描述了 任务实例(task instance)+多任务配置文件 中定义超参数
-            phase: str类型。运行阶段，目前支持train和predict
-            """
-
-    @property
-    def inputs_attrs(self):
-        """描述task_layer需要从reader, backbone等输入对象集合所读取到的输入对象的属性，第一级key为对象集和的名字，如backbone，reader等（后续会支持更灵活的输入），第二级key为对象集和中各对象的属性，包括对象的名字，shape和dtype。当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1。
-        Return:
-            dict类型。对各个对象集及其输入对象的属性描述。"""
-        raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """描述task输出对象的属性，包括对象的名字，shape和dtype。输出对象会被加入到fetch_list中，从而在每个训练/推理step时得到runtime的计算结果，该计算结果会被传入postprocess方法中供用户处理。
-        当某个对象为标量数据类型（如str, int, float等）时，shape设置为空列表[]，当某个对象的某个维度长度可变时，shape中的相应维度设置为-1。
-        Return:
-            dict类型。对各个输入对象的属性描述。注意，训练阶段必须包含名为loss的输出对象。
-            """
-
-        raise NotImplementedError()
-
-    @property
-    def epoch_inputs_attrs(self):
-        return {}
-
-    def build(self, inputs, scope_name=""):
-        """建立task_layer的计算图。将符合inputs_attrs描述的来自各个对象集的静态图Variables映射成符合outputs_attr描述的静态图Variable输出。
-        Args:
-            inputs: dict类型。字典中包含inputs_attrs中的对象名到计算图Variable的映射，inputs中至少会包含inputs_attr中定义的对象
-        Return:
-           需要输出的计算图变量，输出对象会被加入到fetch_list中，从而在每个训练/推理step时得到runtime的计算结果，该计算结果会被传入postprocess方法中供用户处理。
-
-        """
-        raise NotImplementedError()
-
-    def postprocess(self, rt_outputs):
-        """每个训练或推理step后针对当前batch的task_layer的runtime计算结果进行相关后处理。注意，rt_outputs除了包含build方法，还自动包含了loss的计算结果。"""
-        pass
-        
-    def epoch_postprocess(self, post_inputs):
-        pass
-
--- a/paddlepalm/trainer.py
+++ b/paddlepalm/trainer.py
@@ -38,7 +38,7 @@ class Trainer(object):
        self._reader = reader
        self._pred_reader = None
        self._task_head = task_head
-        self._pred_head = pred_head
+        self._pred_head = None

        # if save_predict_model:
        #     self._save_predict_model = True
@@ -89,20 +89,24 @@ class Trainer(object):
        self._lock = False
        self._build_forward = False

-    def build_predict_head(self, pred_backbone, pred_prog=None, pred_init_prog=None):
+    def build_predict_head(self, pred_head, pred_backbone, pred_prog=None, pred_init_prog=None):
+        self._pred_head = pred_head
+        # self._pred_reader = self._reader.clone(phase='pred')
        pred_task_attr_from_reader = helper.encode_inputs(self._pred_head.inputs_attrs['reader'], self.name)
        # pred_task_attr_from_reader = self._pred_head.inputs_attrs['reader']

        # _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
        # _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
        # _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')
-        pred_input_names, pred_shape_and_dtypes, _ = reader_helper.merge_input_attrs(backbone.inputs_attr, pred_task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        pred_input_names, pred_shape_and_dtypes, _ = reader_helper.merge_input_attrs(pred_backbone.inputs_attr, pred_task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_input_names, pred_shape_and_dtypes)]
        
        if pred_prog is None:
            pred_prog = fluid.Program()
+        self._pred_prog = pred_prog
        if pred_init_prog is None:
            pred_init_prog = fluid.Program()
+        self._pred_init_prog = pred_init_prog
        with fluid.program_guard(pred_prog, pred_init_prog):
            pred_net_inputs = reader_helper.create_net_inputs(pred_input_attrs)
            # pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
@@ -121,8 +125,6 @@ class Trainer(object):
                self._build_head(pred_task_inputs, phase='pred', scope=scope)


-
-
    def build_forward(self, backbone, pred_backbone=None, train_prog=None, train_init_prog=None, pred_prog=None, pred_init_prog=None):

        # assert self._backbone is not None, "backbone is required for Trainer to build net forward to run with single task mode"
@@ -154,7 +156,6 @@ class Trainer(object):
            print('joint input shape and dtypes:')
            print(joint_shape_and_dtypes)

-
        input_attrs = [[i, j, k] for i, (j,k) in zip(input_names, shape_and_dtypes)]

        if train_prog is None:
@@ -172,6 +173,7 @@ class Trainer(object):
            # bb_output_vars = self._backbone.build(net_inputs, scope_name='__paddlepalm_')
            bb_output_vars = backbone.build(net_inputs)
            assert sorted(bb_output_vars.keys()) == sorted(backbone.outputs_attr.keys())
+        # self._bb_output_vars.keys
        

        # fluid.framework.switch_main_program(train_prog)
@@ -293,10 +295,14 @@ class Trainer(object):
        pass

    def train(self, iterator, save_path=None, save_steps=None, save_type='ckpt', print_steps=5):
+        """
+        Args:
+            save_type: comma-separated kinds of model to save: ckpt, predict, pretrain.
+        """

        save_type = save_type.split(',')
        if 'predict' in save_type:
-            assert self._pred_head is not None, "Predict head not found! You should call set_predict_head first if you want to save predict model."
+            assert self._pred_head is not None, "Predict head not found! You should build_predict_head first if you want to save predict model."
            assert save_path is not None and save_steps is not None, 'save_path and save_steps is required to save model.'
            save_predict = True
            if not os.path.exists(save_path):
@@ -369,11 +375,11 @@ class Trainer(object):
            #     cur_task.save()

            if (save_predict or save_ckpt) and self._cur_train_step % save_steps == 0:
-                if save_predict_model:
-                    self.save(save_path, suffix='pred.step'+str(global_step))
+                if save_predict:
+                    self.save(save_path, suffix='pred.step'+str(self._cur_train_step))
                if save_ckpt:
-                    fluid.io.save_persistables(self.exe, os.path.join(save_path, 'ckpt.step'+str(global_step)), self._train_prog)
-                    print('checkpoint has been saved at '+os.path.join(save_path, 'ckpt.step'+str(global_step)))
+                    fluid.io.save_persistables(self._exe, os.path.join(save_path, 'ckpt.step'+str(self._cur_train_step)), self._train_prog)
+                    print('checkpoint has been saved at '+os.path.join(save_path, 'ckpt.step'+str(self._cur_train_step)))

        # save_path = os.path.join(main_conf['save_path'], 'ckpt',
        #                          "step_" + str(global_step))
@@ -422,7 +428,7 @@ class Trainer(object):
            dirpath = save_path
        self._pred_input_varname_list = [str(i) for i in self._pred_input_varname_list]

-        prog = fluid.default_main_program().clone()
+        prog = self._pred_prog.clone()
        fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe, prog)

        conf = {}

--- a/reader/__init__.py
+++ b/reader/__init__.py
--- a/reader/cls.py
+++ b/reader/cls.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import ClassifyReader
-
-class Reader(reader):
-    
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-            """
-
-        self._is_training = phase == 'train'
-
-        reader = ClassifyReader(config['vocab_path'],
-            max_seq_len=config['max_seq_len'],
-            do_lower_case=config.get('do_lower_case', False),
-            for_cn=config.get('for_cn', False),
-            random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        self._num_classes = config['n_classes']
-
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None # 防止iteartor终止
-            self._shuffle = config.get('shuffle', True)
-            # self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size = 
-        self._print_first_n = config.get('print_first_n', 0)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32']
-                    }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self): 
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
-                'label_ids', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            del outputs['unique_ids']
-            if not self._is_training:
-                del outputs['label_ids']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return {'examples': self._reader.get_examples(self._phase),
-                'features': self._reader.get_features(self._phase)}
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
--- a/reader/match.py
+++ b/reader/match.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import ClassifyReader
-
-def match(vocab_path, max_seq_len, do_lower_case=True, phase, dev_count=1):
-    config={
-        xxx}
-
-    return Reader(config())
-
-class Reader(reader):
-    
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-            """
-
-        self._is_training = phase == 'train'
-
-        reader = ClassifyReader(config['vocab_path'],
-            max_seq_len=config['max_seq_len'],
-            do_lower_case=config.get('do_lower_case', True),
-            for_cn=config.get('for_cn', False),
-            random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None # 防止iteartor终止
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size = 
-        self._print_first_n = config.get('print_first_n', 1)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32']
-                    }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self): 
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
-                'label_ids', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            del outputs['unique_ids']
-            if not self._is_training:
-                del outputs['label_ids']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
--- a/reader/mlm.py
+++ b/reader/mlm.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import MaskLMReader
-import numpy as np
-
-class Reader(reader):
-    
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-            """
-
-        self._is_training = phase == 'train'
-
-        reader = MaskLMReader(config['vocab_path'],
-            max_seq_len=config['max_seq_len'],
-            do_lower_case=config.get('do_lower_case', False),
-            for_cn=config.get('for_cn', False),
-            random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None # 防止iteartor终止
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size = 
-        self._print_first_n = config.get('print_first_n', 1)
-
-
-    @property
-    def outputs_attr(self):
-        return {"token_ids": [[-1, -1, 1], 'int64'],
-                "position_ids": [[-1, -1, 1], 'int64'],
-                "segment_ids": [[-1, -1, 1], 'int64'],
-                "input_mask": [[-1, -1, 1], 'float32'],
-                "task_ids": [[-1, -1, 1], 'int64'],
-                "mask_label": [[-1, 1], 'int64'],
-                "mask_pos": [[-1, 1], 'int64'],
-                }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self): 
-
-        def list_to_dict(x):
-            names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', 
-                'task_ids', 'mask_label', 'mask_pos']
-            outputs = {n: i for n,i in zip(names, x)}
-            # outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
-            return outputs
-
-        for batch in self._data_generator():
-            # print(np.shape(list_to_dict(batch)['token_ids']))
-            # print(list_to_dict(batch)['mask_label'].tolist())
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return {'examples': self._reader.get_examples(self._phase),
-                'features': self._reader.get_features(self._phase)}
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
--- a/reader/mrc.py
+++ b/reader/mrc.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import MRCReader
-
-class Reader(reader):
-    
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-            """
-
-        self._is_training = phase == 'train'
-
-        reader = MRCReader(config['vocab_path'],
-            max_seq_len=config['max_seq_len'],
-            do_lower_case=config.get('do_lower_case', False),
-            tokenizer='FullTokenizer',
-            for_cn=config.get('for_cn', False),
-            doc_stride=config['doc_stride'],
-            max_query_length=config['max_query_len'],
-            random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            # self._num_epochs = config['num_epochs']
-            self._num_epochs = None # 防止iteartor终止
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        if phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size = 
-        self._print_first_n = config.get('print_first_n', 1)
-
-        # TODO: without slide window version
-        self._with_slide_window = config.get('with_slide_window', False)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "start_positions": [[-1, 1], 'int64'],
-                    "end_positions": [[-1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "unique_ids": [[-1, 1], 'int64']
-                    }
-
-    @property
-    def epoch_outputs_attr(self):
-        if not self._is_training:
-            return {"examples": None,
-                    "features": None}
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self): 
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
-                'start_positions', 'end_positions', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            if self._is_training:
-                del outputs['unique_ids']
-            else:
-                del outputs['start_positions']
-                del outputs['end_positions']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return {'examples': self._reader.get_examples(self._phase),
-                'features': self._reader.get_features(self._phase)}
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
--- a/reader/utils/__init__.py
+++ b/reader/utils/__init__.py
--- a/reader/utils/batching4bert.py
+++ b/reader/utils/batching4bert.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Mask, padding and batching."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import numpy as np
-
-
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
-def prepare_batch_data(insts,
-                       total_token_num,
-                       max_len=None,
-                       voc_size=0,
-                       pad_id=None,
-                       cls_id=None,
-                       sep_id=None,
-                       mask_id=None,
-                       return_input_mask=True,
-                       return_max_len=True,
-                       return_num_token=False):
-    """
-    1. generate Tensor of data
-    2. generate Tensor of position
-    3. generate self attention mask, [shape: batch_size *  max_len * max_len]
-    """
-    batch_src_ids = [inst[0] for inst in insts]
-    batch_sent_ids = [inst[1] for inst in insts]
-    batch_pos_ids = [inst[2] for inst in insts]
-    labels_list = []
-    # compatible with mrqa, whose example includes start/end positions, 
-    # or unique id
-    for i in range(3, len(insts[0]), 1):
-        labels = [inst[i] for inst in insts]
-        labels = np.array(labels).astype("int64").reshape([-1, 1])
-        labels_list.append(labels)
-    # First step: do mask without padding
-    if mask_id >= 0:
-        out, mask_label, mask_pos = mask(
-            batch_src_ids,
-            total_token_num,
-            vocab_size=voc_size,
-            CLS=cls_id,
-            SEP=sep_id,
-            MASK=mask_id)
-    else:
-        out = batch_src_ids
-    # Second step: padding
-    src_id, self_input_mask = pad_batch_data(
-        out, 
-        max_len=max_len,
-        pad_idx=pad_id, return_input_mask=True)
-    pos_id = pad_batch_data(
-        batch_pos_ids,
-        max_len=max_len,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    sent_id = pad_batch_data(
-        batch_sent_ids,
-        max_len=max_len,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    if mask_id >= 0:
-        return_list = [
-            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
-        ] + labels_list
-    else:
-        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-def pad_batch_data(insts,
-                   max_len=None,
-                   pad_idx=0,
-                   return_pos=False,
-                   return_input_mask=False,
-                   return_max_len=False,
-                   return_num_token=False):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and input mask.
-    """
-    return_list = []
-    if max_len is None:
-        max_len = max(len(inst) for inst in insts)
-    # Any token included in dict can be used to pad, since the paddings' loss
-    # will be masked out by weights and make no effect on parameter gradients.
-    inst_data = np.array([
-        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
-    ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
-    # position data
-    if return_pos:
-        inst_pos = np.array([
-            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
-            for inst in insts
-        ])
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
-    if return_input_mask:
-        # This is used to avoid attention on paddings.
-        input_mask_data = np.array([[1] * len(inst) + [0] *
-                                    (max_len - len(inst)) for inst in insts])
-        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
-        return_list += [input_mask_data.astype("float32")]
-    if return_max_len:
-        return_list += [max_len]
-    if return_num_token:
-        num_token = 0
-        for inst in insts:
-            num_token += len(inst)
-        return_list += [num_token]
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-if __name__ == "__main__":
-    pass
-
-
--- a/reader/utils/batching4ernie.py
+++ b/reader/utils/batching4ernie.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Mask, padding and batching."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from six.moves import xrange
-
-
-def mask(batch_tokens,
-         seg_labels,
-         mask_word_tags,
-         total_token_num,
-         vocab_size,
-         CLS=1,
-         SEP=2,
-         MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        mask_word = mask_word_tags[sent_index]
-        prob_index += pre_sent_len
-        if mask_word:
-            beg = 0
-            for token_index, token in enumerate(sent):
-                seg_label = seg_labels[sent_index][token_index]
-                if seg_label == 1:
-                    continue
-                if beg == 0:
-                    if seg_label != -1:
-                        beg = token_index
-                    continue
-
-                prob = prob_mask[prob_index + beg]
-                if prob > 0.15:
-                    pass
-                else:
-                    for index in xrange(beg, token_index):
-                        prob = prob_mask[prob_index + index]
-                        base_prob = 1.0
-                        if index == beg:
-                            base_prob = 0.15
-                        if base_prob * 0.2 < prob <= base_prob:
-                            mask_label.append(sent[index])
-                            sent[index] = MASK
-                            mask_flag = True
-                            mask_pos.append(sent_index * max_len + index)
-                        elif base_prob * 0.1 < prob <= base_prob * 0.2:
-                            mask_label.append(sent[index])
-                            sent[index] = replace_ids[prob_index + index]
-                            mask_flag = True
-                            mask_pos.append(sent_index * max_len + index)
-                        else:
-                            mask_label.append(sent[index])
-                            mask_pos.append(sent_index * max_len + index)
-
-                if seg_label == -1:
-                    beg = 0
-                else:
-                    beg = token_index
-        else:
-            for token_index, token in enumerate(sent):
-                prob = prob_mask[prob_index + token_index]
-                if prob > 0.15:
-                    continue
-                elif 0.03 < prob <= 0.15:
-                    # mask
-                    if token != SEP and token != CLS:
-                        mask_label.append(sent[token_index])
-                        sent[token_index] = MASK
-                        mask_flag = True
-                        mask_pos.append(sent_index * max_len + token_index)
-                elif 0.015 < prob <= 0.03:
-                    # random replace
-                    if token != SEP and token != CLS:
-                        mask_label.append(sent[token_index])
-                        sent[token_index] = replace_ids[prob_index +
-                                                        token_index]
-                        mask_flag = True
-                        mask_pos.append(sent_index * max_len + token_index)
-                else:
-                    # keep the original token
-                    if token != SEP and token != CLS:
-                        mask_label.append(sent[token_index])
-                        mask_pos.append(sent_index * max_len + token_index)
-
-        pre_sent_len = len(sent)
-
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
-def pad_batch_data(insts,
-                   pad_idx=0,
-                   return_pos=False,
-                   return_input_mask=False,
-                   return_max_len=False,
-                   return_num_token=False,
-                   return_seq_lens=False):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias.
-    """
-    return_list = []
-    max_len = max(len(inst) for inst in insts)
-    # Any token included in dict can be used to pad, since the paddings' loss
-    # will be masked out by weights and make no effect on parameter gradients.
-
-    inst_data = np.array(
-        [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
-
-    # position data
-    if return_pos:
-        inst_pos = np.array([
-            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
-            for inst in insts
-        ])
-
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
-
-    if return_input_mask:
-        # This is used to avoid attention on paddings.
-        input_mask_data = np.array([[1] * len(inst) + [0] *
-                                    (max_len - len(inst)) for inst in insts])
-        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
-        return_list += [input_mask_data.astype("float32")]
-
-    if return_max_len:
-        return_list += [max_len]
-
-    if return_num_token:
-        num_token = 0
-        for inst in insts:
-            num_token += len(inst)
-        return_list += [num_token]
-
-    if return_seq_lens:
-        seq_lens = np.array([len(inst) for inst in insts])
-        return_list += [seq_lens.astype("int64").reshape([-1, 1])]
-
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-if __name__ == "__main__":
-
-    pass
--- a/reader/utils/mlm_batching.py
+++ b/reader/utils/mlm_batching.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Mask, padding and batching."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import numpy as np
-
-
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
-def prepare_batch_data(insts,
-                       total_token_num,
-                       max_len=None,
-                       voc_size=0,
-                       pad_id=None,
-                       cls_id=None,
-                       sep_id=None,
-                       mask_id=None,
-                       task_id=0,
-                       return_input_mask=True,
-                       return_max_len=True,
-                       return_num_token=False):
-    """
-    1. generate Tensor of data
-    2. generate Tensor of position
-    3. generate self attention mask, [shape: batch_size *  max_len * max_len]
-    """
-    batch_src_ids = [inst[0] for inst in insts]
-    batch_sent_ids = [inst[1] for inst in insts]
-    batch_pos_ids = [inst[2] for inst in insts]
-
-    # 这里是否应该反过来？？？否则在task layer里展开后的word embedding是padding后的，这时候word的index是跟没有padding时的index对不上的？
-    # First step: do mask without padding
-    out, mask_label, mask_pos = mask(
-        batch_src_ids,
-        total_token_num,
-        vocab_size=voc_size,
-        CLS=cls_id,
-        SEP=sep_id,
-        MASK=mask_id)
-    # Second step: padding
-    src_id, self_input_mask = pad_batch_data(
-        out, 
-        max_len=max_len,
-        pad_idx=pad_id, return_input_mask=True)
-
-    pos_id = pad_batch_data(
-        batch_pos_ids,
-        max_len=max_len,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    sent_id = pad_batch_data(
-        batch_sent_ids,
-        max_len=max_len,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    task_ids = np.ones_like(
-        src_id, dtype="int64") * task_id
-    return_list = [
-        src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
-    ]
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-def pad_batch_data(insts,
-                   max_len=None,
-                   pad_idx=0,
-                   return_pos=False,
-                   return_input_mask=False,
-                   return_max_len=False,
-                   return_num_token=False):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and input mask.
-    """
-    return_list = []
-    if max_len is None:
-        max_len = max(len(inst) for inst in insts)
-    # Any token included in dict can be used to pad, since the paddings' loss
-    # will be masked out by weights and make no effect on parameter gradients.
-    inst_data = np.array([
-        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
-    ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
-    # position data
-    if return_pos:
-        inst_pos = np.array([
-            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
-            for inst in insts
-        ])
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
-    if return_input_mask:
-        # This is used to avoid attention on paddings.
-        input_mask_data = np.array([[1] * len(inst) + [0] *
-                                    (max_len - len(inst)) for inst in insts])
-        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
-        return_list += [input_mask_data.astype("float32")]
-    if return_max_len:
-        return_list += [max_len]
-    if return_num_token:
-        num_token = 0
-        for inst in insts:
-            num_token += len(inst)
-        return_list += [num_token]
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-if __name__ == "__main__":
-    pass
-
-
--- a/reader/utils/mrqa_helper.py
+++ b/reader/utils/mrqa_helper.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-class MRQAExample(object):
-    """A single training/test example for simple sequence classification.
-
-     For examples without an answer, the start and end position are -1.
-  """
-
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=False):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
-        s += ", question_text: %s" % (
-            tokenization.printable_text(self.question_text))
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.start_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.start_position:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-
-
-class MRQAFeature(object):
-    """A single set of features of data."""
-
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
--- a/reader/utils/reader4ernie.py
+++ b/reader/utils/reader4ernie.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-from __future__ import absolute_import
-
-import sys
-import os
-import json
-import random
-import logging
-import numpy as np
-import six
-from io import open
-from collections import namedtuple
-
-import paddlepalm.tokenizer.ernie_tokenizer as tokenization
-from paddlepalm.reader.utils.batching4ernie import pad_batch_data
-from paddlepalm.reader.utils.mlm_batching import prepare_batch_data
-
-
-log = logging.getLogger(__name__)
-
-if six.PY3:
-    import io
-    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
-    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
-
-
-def csv_reader(fd, delimiter='\t'):
-    def gen():
-        for i in fd:
-            yield i.rstrip('\n').split(delimiter)
-    return gen()
-
-
-class BaseReader(object):
-    def __init__(self,
-                 vocab_path,
-                 label_map_config=None,
-                 max_seq_len=512,
-                 do_lower_case=True,
-                 in_tokens=False,
-                 is_inference=False,
-                 random_seed=None,
-                 tokenizer="FullTokenizer",
-                 is_classify=True,
-                 is_regression=False,
-                 for_cn=True,
-                 task_id=0):
-        self.max_seq_len = max_seq_len
-        self.tokenizer = tokenization.FullTokenizer(
-            vocab_file=vocab_path, do_lower_case=do_lower_case)
-        self.vocab = self.tokenizer.vocab
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.mask_id = self.vocab["[MASK]"]
-        self.in_tokens = in_tokens
-        self.is_inference = is_inference
-        self.for_cn = for_cn
-        self.task_id = task_id
-
-        np.random.seed(random_seed)
-
-        self.is_classify = is_classify
-        self.is_regression = is_regression
-        self.current_example = 0
-        self.current_epoch = 0
-        self.num_examples = 0
-
-        self.examples = {}
-
-        if label_map_config:
-            with open(label_map_config, encoding='utf8') as f: 
-                self.label_map = json.load(f)
-        else:
-            self.label_map = None
-
-    def get_train_progress(self):
-        """Gets progress for training phase."""
-        return self.current_example, self.current_epoch
-
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, 'r', encoding='utf8') as f:
-            reader = csv_reader(f)
-            headers = next(reader)
-            Example = namedtuple('Example', headers)
-
-            examples = []
-            for line in reader:
-                example = Example(*line)
-                examples.append(example)
-            return examples
-
-    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
-        """Truncates a sequence pair in place to the maximum length."""
-
-        # This is a simple heuristic which will always truncate the longer sequence
-        # one token at a time. This makes more sense than truncating an equal percent
-        # of tokens from each, since if one sequence is very short then each token
-        # that's truncated likely contains more information than a longer sequence.
-        while True:
-            total_length = len(tokens_a) + len(tokens_b)
-            if total_length <= max_length:
-                break
-            if len(tokens_a) > len(tokens_b):
-                tokens_a.pop()
-            else:
-                tokens_b.pop()
-
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        """Converts a single `Example` into a single `Record`."""
-
-        text_a = tokenization.convert_to_unicode(example.text_a)
-        tokens_a = tokenizer.tokenize(text_a)
-        tokens_b = None
-
-        has_text_b = False
-        if isinstance(example, dict):
-            has_text_b = "text_b" in example.keys()
-        else:
-            has_text_b = "text_b" in example._fields
-
-        if has_text_b:
-            text_b = tokenization.convert_to_unicode(example.text_b)
-            tokens_b = tokenizer.tokenize(text_b)
-
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
-
-        # The convention in BERT/ERNIE is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0     0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        text_type_ids = []
-        tokens.append("[CLS]")
-        text_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            text_type_ids.append(0)
-        tokens.append("[SEP]")
-        text_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                text_type_ids.append(1)
-            tokens.append("[SEP]")
-            text_type_ids.append(1)
-
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-
-        if self.is_inference:
-            Record = namedtuple('Record',
-                                ['token_ids', 'text_type_ids', 'position_ids'])
-            record = Record(
-                token_ids=token_ids,
-                text_type_ids=text_type_ids,
-                position_ids=position_ids)
-        else:
-            if self.label_map:
-                label_id = self.label_map[example.label]
-            else:
-                label_id = example.label
-
-            Record = namedtuple('Record', [
-                'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
-            ])
-
-            qid = None
-            if "qid" in example._fields:
-                qid = example.qid
-
-            record = Record(
-                token_ids=token_ids,
-                text_type_ids=text_type_ids,
-                position_ids=position_ids,
-                label_id=label_id,
-                qid=qid)
-        return record
-
-    def _prepare_batch_data(self, examples, batch_size, phase=None):
-        """generate batch records"""
-        batch_records, max_len = [], 0
-        if len(examples) < batch_size:
-            raise Exception('CLS dataset contains too few samples. Expect more than '+str(batch_size))
-        for index, example in enumerate(examples):
-            if phase == "train":
-                self.current_example = index
-            record = self._convert_example_to_record(example, self.max_seq_len,
-                                                     self.tokenizer)
-            max_len = max(max_len, len(record.token_ids))
-            if self.in_tokens:
-                to_append = (len(batch_records) + 1) * max_len <= batch_size
-            else:
-                to_append = len(batch_records) < batch_size
-            if to_append:
-                batch_records.append(record)
-            else:
-                yield self._pad_batch_records(batch_records)
-                batch_records, max_len = [record], len(record.token_ids)
-
-        if phase == 'pred' and batch_records:
-            yield self._pad_batch_records(batch_records)
-
-    def get_num_examples(self, input_file=None, phase=None):
-        if self.examples is not None:
-            if phase is None:
-                phase = 'all'
-            return len(self.examples[phase])
-        else:
-            assert input_file is not None, "Argument input_file should be given or the data_generator should be created when this func is called."
-            examples = self._read_tsv(input_file)
-            return len(examples)
-
-    def data_generator(self,
-                       input_file,
-                       batch_size,
-                       epoch,
-                       dev_count=1,
-                       shuffle=True,
-                       phase=None):
-        examples = self._read_tsv(input_file)
-        if phase is None:
-            phase = 'all'
-        self.examples[phase] = examples
-
-        def wrapper():
-            all_dev_batches = []
-            if epoch is None:
-                num_epochs = 99999999
-            else:
-                num_epochs = epoch
-            for epoch_index in range(num_epochs):
-                if phase == "train":
-                    self.current_example = 0
-                    self.current_epoch = epoch_index
-                if shuffle:
-                    np.random.shuffle(examples)
-
-                for batch_data in self._prepare_batch_data(
-                        examples, batch_size, phase=phase):
-                    if len(all_dev_batches) < dev_count:
-                        all_dev_batches.append(batch_data)
-                    if len(all_dev_batches) == dev_count:
-                        for batch in all_dev_batches:
-                            yield batch
-                        all_dev_batches = []
-        def f():
-            for i in wrapper():
-                yield i
-
-        # def f():
-        #     try:
-        #         for i in wrapper():
-        #             yield i
-        #     except Exception as e:
-        #         import traceback
-        #         traceback.print_exc()
-
-        return f
-
-
-class MaskLMReader(BaseReader):
-
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        """Converts a single `Example` into a single `Record`."""
-
-        text_a = tokenization.convert_to_unicode(example.text_a)
-        tokens_a = tokenizer.tokenize(text_a)
-        tokens_b = None 
-
-        has_text_b = False
-        if isinstance(example, dict):
-            has_text_b = "text_b" in example.keys()
-        else:
-            has_text_b = "text_b" in example._fields
-
-        if has_text_b:
-            text_b = tokenization.convert_to_unicode(example.text_b)
-            tokens_b = tokenizer.tokenize(text_b)
-
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
-
-        # The convention in BERT/ERNIE is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0     0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        text_type_ids = []
-        tokens.append("[CLS]")
-        text_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            text_type_ids.append(0)
-        tokens.append("[SEP]")
-        text_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                text_type_ids.append(1)
-            tokens.append("[SEP]")
-            text_type_ids.append(1)
-
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-
-        # Record = namedtuple('Record',
-        #                     ['token_ids', 'text_type_ids', 'position_ids'])
-        # record = Record(
-        #     token_ids=token_ids,
-        #     text_type_ids=text_type_ids,
-        #     position_ids=position_ids)
-
-        return [token_ids, text_type_ids, position_ids]
-
-    def batch_reader(self, examples, batch_size, in_tokens, phase):
-        batch = []
-        total_token_num = 0
-        if len(examples) < batch_size:
-            raise Exception('MaskLM dataset contains too few samples. Expect more than '+str(batch_size))
-        for e in examples:
-            parsed_line = self._convert_example_to_record(e, self.max_seq_len, self.tokenizer)
-            to_append = len(batch) < batch_size
-            if to_append:
-                batch.append(parsed_line)
-                total_token_num += len(parsed_line[0])
-            else:
-                yield batch, total_token_num
-                batch = [parsed_line]
-                total_token_num = len(parsed_line[0])
-
-        if len(batch) > 0 and phase == 'pred':
-            yield batch, total_token_num
-
-    def data_generator(self,
-                       input_file,
-                       batch_size,
-                       epoch,
-                       dev_count=1,
-                       shuffle=True,
-                       phase=None):
-        examples = self._read_tsv(input_file)
-        if phase is None:
-            phase = 'all'
-        self.examples[phase] = examples
-
-        def wrapper():
-            all_dev_batches = []
-            if epoch is None:
-                num_epochs = 99999999
-            else:
-                num_epochs = epoch
-            for epoch_index in range(num_epochs):
-                if phase == "train":
-                    self.current_example = 0
-                    self.current_epoch = epoch_index
-                if shuffle:
-                    np.random.shuffle(examples)
-
-                all_dev_batches = []
-                for batch_data, num_tokens in self.batch_reader(examples, 
-                                                    batch_size, self.in_tokens, phase=phase):
-                    batch_data = prepare_batch_data(
-                        batch_data,
-                        num_tokens,
-                        voc_size=len(self.vocab),
-                        pad_id=self.pad_id,
-                        cls_id=self.cls_id,
-                        sep_id=self.sep_id,
-                        mask_id=self.mask_id,
-                        # max_len=self.max_seq_len, # 注意，如果padding到最大长度，会导致mask_pos与实际位置不对应。因为mask pos是基于batch内最大长度来计算的。
-                        return_input_mask=True,
-                        return_max_len=False,
-                        return_num_token=False)
-
-                    if len(all_dev_batches) < dev_count:
-                        all_dev_batches.append(batch_data)
-                    if len(all_dev_batches) == dev_count:
-                        for batch in all_dev_batches:
-                            yield batch
-                        all_dev_batches = []
-
-        return wrapper
-
-
-class ClassifyReader(BaseReader):
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, 'r', encoding='utf8') as f:
-            reader = csv_reader(f)
-            headers = next(reader)
-            text_indices = [
-                index for index, h in enumerate(headers) if h != "label"
-            ]
-            Example = namedtuple('Example', headers)
-
-            examples = []
-            for line in reader:
-                for index, text in enumerate(line):
-                    if index in text_indices:
-                        if self.for_cn:
-                            line[index] = text.replace(' ', '')
-                        else:
-                            line[index] = text
-                example = Example(*line)
-                examples.append(example)
-            return examples
-
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-
-        if not self.is_inference:
-            batch_labels = [record.label_id for record in batch_records]
-            if self.is_classify:
-                batch_labels = np.array(batch_labels).astype("int64").reshape(
-                    [-1, 1])
-            elif self.is_regression:
-                batch_labels = np.array(batch_labels).astype("float32").reshape(
-                    [-1, 1])
-
-            if batch_records[0].qid:
-                batch_qids = [record.qid for record in batch_records]
-                batch_qids = np.array(batch_qids).astype("int64").reshape(
-                    [-1, 1])
-            else:
-                batch_qids = np.array([]).astype("int64").reshape([-1, 1])
-
-        # padding
-        padded_token_ids, input_mask = pad_batch_data(
-            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask
-        ]
-        if not self.is_inference:
-            return_list += [batch_labels, batch_qids]
-
-        return return_list
-
-
-class SequenceLabelReader(BaseReader):
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-        batch_label_ids = [record.label_ids for record in batch_records]
-
-        # padding
-        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
-            batch_token_ids,
-            pad_idx=self.pad_id,
-            return_input_mask=True,
-            return_seq_lens=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_label_ids = pad_batch_data(
-            batch_label_ids, pad_idx=len(self.label_map) - 1)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens
-        ]
-        return return_list
-
-    def _reseg_token_label(self, tokens, labels, tokenizer):
-        assert len(tokens) == len(labels)
-        ret_tokens = []
-        ret_labels = []
-        for token, label in zip(tokens, labels):
-            sub_token = tokenizer.tokenize(token)
-            if len(sub_token) == 0:
-                continue
-            ret_tokens.extend(sub_token)
-            if len(sub_token) == 1:
-                ret_labels.append(label)
-                continue
-
-            if label == "O" or label.startswith("I-"):
-                ret_labels.extend([label] * len(sub_token))
-            elif label.startswith("B-"):
-                i_label = "I-" + label[2:]
-                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
-            elif label.startswith("S-"):
-                b_laebl = "B-" + label[2:]
-                e_label = "E-" + label[2:]
-                i_label = "I-" + label[2:]
-                ret_labels.extend([b_laebl] + [i_label] * (len(sub_token) - 2) + [e_label])
-            elif label.startswith("E-"):
-                i_label = "I-" + label[2:]
-                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
-
-        assert len(ret_tokens) == len(ret_labels)
-        return ret_tokens, ret_labels
-
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
-        labels = tokenization.convert_to_unicode(example.label).split(u"")
-        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
-
-        if len(tokens) > max_seq_length - 2:
-            tokens = tokens[0:(max_seq_length - 2)]
-            labels = labels[0:(max_seq_length - 2)]
-
-        tokens = ["[CLS]"] + tokens + ["[SEP]"]
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-        text_type_ids = [0] * len(token_ids)
-        no_entity_id = len(self.label_map) - 1
-        label_ids = [no_entity_id] + [
-            self.label_map[label] for label in labels
-        ] + [no_entity_id]
-
-        Record = namedtuple(
-            'Record',
-            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
-        record = Record(
-            token_ids=token_ids,
-            text_type_ids=text_type_ids,
-            position_ids=position_ids,
-            label_ids=label_ids)
-        return record
-
-
-class ExtractEmbeddingReader(BaseReader):
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-
-        # padding
-        padded_token_ids, input_mask, seq_lens = pad_batch_data(
-            batch_token_ids,
-            pad_idx=self.pad_id,
-            return_input_mask=True,
-            return_seq_lens=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask, seq_lens
-        ]
-
-        return return_list
-
-
-class MRCReader(BaseReader):
-    def __init__(self,
-                 vocab_path,
-                 label_map_config=None,
-                 max_seq_len=512,
-                 do_lower_case=True,
-                 in_tokens=False,
-                 random_seed=None,
-                 tokenizer="FullTokenizer",
-                 is_classify=True,
-                 is_regression=False,
-                 for_cn=True,
-                 task_id=0,
-                 doc_stride=128,
-                 max_query_length=64,
-                 remove_noanswer=True):
-        self.max_seq_len = max_seq_len
-        self.tokenizer = tokenization.FullTokenizer(
-            vocab_file=vocab_path, do_lower_case=do_lower_case)
-        self.vocab = self.tokenizer.vocab
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.in_tokens = in_tokens
-        self.for_cn = for_cn
-        self.task_id = task_id
-        self.doc_stride = doc_stride
-        self.max_query_length = max_query_length
-        self.examples = {}
-        self.features = {}
-        self.remove_noanswer = remove_noanswer
-
-        if random_seed is not None:
-            np.random.seed(random_seed)
-
-        self.current_example = 0
-        self.current_epoch = 0
-        self.num_examples = 0
-
-        self.Example = namedtuple('Example',
-                ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
-                'start_position', 'end_position'])
-        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
-                "tokens", "token_to_orig_map", "token_is_max_context",
-                "token_ids", "position_ids", "text_type_ids",
-                "start_position", "end_position"])
-        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
-
-    def _read_json(self, input_file, is_training):
-        examples = []
-        with open(input_file, "r", encoding='utf8') as f:
-            input_data = json.load(f)["data"]
-            for entry in input_data:
-                for paragraph in entry["paragraphs"]:
-                    paragraph_text = paragraph["context"]
-                    for qa in paragraph["qas"]:
-                        qas_id = qa["id"]
-                        question_text = qa["question"]
-                        start_pos = None
-                        end_pos = None
-                        orig_answer_text = None
-
-                        if is_training:
-                            if len(qa["answers"]) != 1:
-                                raise ValueError(
-                                    "For training, each question should have exactly 1 answer."
-                                )
-
-                            answer = qa["answers"][0]
-                            orig_answer_text = answer["text"]
-                            answer_offset = answer["answer_start"]
-                            answer_length = len(orig_answer_text)
-                            doc_tokens = [
-                                paragraph_text[:answer_offset],
-                                paragraph_text[answer_offset:answer_offset +
-                                               answer_length],
-                                paragraph_text[answer_offset + answer_length:]
-                            ]
-
-                            start_pos = 1
-                            end_pos = 1
-
-                            actual_text = " ".join(doc_tokens[start_pos:(end_pos
-                                                                         + 1)])
-                            if actual_text.find(orig_answer_text) == -1:
-                                log.info("Could not find answer: '%s' vs. '%s'",
-                                      actual_text, orig_answer_text)
-                                continue
-                        else:
-                            doc_tokens = tokenization.tokenize_chinese_chars(
-                                paragraph_text)
-
-                        example = self.Example(
-                            qas_id=qas_id,
-                            question_text=question_text,
-                            doc_tokens=doc_tokens,
-                            orig_answer_text=orig_answer_text,
-                            start_position=start_pos,
-                            end_position=end_pos)
-                        examples.append(example)
-
-        return examples
-
-    def _improve_answer_span(self, doc_tokens, input_start, input_end,
-                             tokenizer, orig_answer_text):
-        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-        for new_start in range(input_start, input_end + 1):
-            for new_end in range(input_end, new_start - 1, -1):
-                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-                if text_span == tok_answer_text:
-                    return (new_start, new_end)
-
-        return (input_start, input_end)
-
-    def _check_is_max_context(self, doc_spans, cur_span_index, position):
-        best_score = None
-        best_span_index = None
-        for (span_index, doc_span) in enumerate(doc_spans):
-            end = doc_span.start + doc_span.length - 1
-            if position < doc_span.start:
-                continue
-            if position > end:
-                continue
-            num_left_context = position - doc_span.start
-            num_right_context = end - position
-            score = min(num_left_context,
-                        num_right_context) + 0.01 * doc_span.length
-            if best_score is None or score > best_score:
-                best_score = score
-                best_span_index = span_index
-
-        return cur_span_index == best_span_index
-
-    def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
-                                    is_training, remove_noanswer=True):
-        features = []
-        unique_id = 1000000000
-
-        print('converting examples to features...')
-        for (example_index, example) in enumerate(examples):
-            if example_index % 1000 == 0:
-                print('processing {}th example...'.format(example_index))
-            query_tokens = tokenizer.tokenize(example.question_text)
-            if len(query_tokens) > self.max_query_length:
-                query_tokens = query_tokens[0:self.max_query_length]
-            tok_to_orig_index = []
-            orig_to_tok_index = []
-            all_doc_tokens = []
-            for (i, token) in enumerate(example.doc_tokens):
-                orig_to_tok_index.append(len(all_doc_tokens))
-                sub_tokens = tokenizer.tokenize(token)
-                for sub_token in sub_tokens:
-                    tok_to_orig_index.append(i)
-                    all_doc_tokens.append(sub_token)
-
-            tok_start_position = None
-            tok_end_position = None
-            if is_training:
-                tok_start_position = orig_to_tok_index[example.start_position]
-                if example.end_position < len(example.doc_tokens) - 1:
-                    tok_end_position = orig_to_tok_index[example.end_position +
-                                                         1] - 1
-                else:
-                    tok_end_position = len(all_doc_tokens) - 1
-                (tok_start_position,
-                 tok_end_position) = self._improve_answer_span(
-                     all_doc_tokens, tok_start_position, tok_end_position,
-                     tokenizer, example.orig_answer_text)
-
-            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-            doc_spans = []
-            start_offset = 0
-            while start_offset < len(all_doc_tokens):
-                length = len(all_doc_tokens) - start_offset
-                if length > max_tokens_for_doc:
-                    length = max_tokens_for_doc
-                doc_spans.append(self.DocSpan(start=start_offset, length=length))
-                if start_offset + length == len(all_doc_tokens):
-                    break
-                start_offset += min(length, self.doc_stride)
-
-            for (doc_span_index, doc_span) in enumerate(doc_spans):
-                tokens = []
-                token_to_orig_map = {}
-                token_is_max_context = {}
-                text_type_ids = []
-                tokens.append("[CLS]")
-                text_type_ids.append(0)
-                for token in query_tokens:
-                    tokens.append(token)
-                    text_type_ids.append(0)
-                tokens.append("[SEP]")
-                text_type_ids.append(0)
-
-                for i in range(doc_span.length):
-                    split_token_index = doc_span.start + i
-                    token_to_orig_map[len(tokens)] = tok_to_orig_index[
-                        split_token_index]
-
-                    is_max_context = self._check_is_max_context(
-                        doc_spans, doc_span_index, split_token_index)
-                    token_is_max_context[len(tokens)] = is_max_context
-                    tokens.append(all_doc_tokens[split_token_index])
-                    text_type_ids.append(1)
-                tokens.append("[SEP]")
-                text_type_ids.append(1)
-
-                token_ids = tokenizer.convert_tokens_to_ids(tokens)
-                position_ids = list(range(len(token_ids)))
-                start_position = None
-                end_position = None
-                if is_training:
-                    doc_start = doc_span.start
-                    doc_end = doc_span.start + doc_span.length - 1
-                    out_of_span = False
-                    if not (tok_start_position >= doc_start and
-                            tok_end_position <= doc_end):
-                        out_of_span = True
-                    if out_of_span:
-                        start_position = 0
-                        end_position = 0
-                        if remove_noanswer:
-                            continue
-                    else:
-                        doc_offset = len(query_tokens) + 2
-                        start_position = tok_start_position - doc_start + doc_offset
-                        end_position = tok_end_position - doc_start + doc_offset
-
-                feature = self.Feature(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    token_ids=token_ids,
-                    position_ids=position_ids,
-                    text_type_ids=text_type_ids,
-                    start_position=start_position,
-                    end_position=end_position)
-                features.append(feature)
-
-                unique_id += 1
-
-        return features
-
-    def _prepare_batch_data(self, records, batch_size, phase=None):
-        """generate batch records"""
-        batch_records, max_len = [], 0
-
-        if len(records) < batch_size:
-            raise Exception('mrc dataset contains too few samples. Expect more than '+str(batch_size))
-
-        for index, record in enumerate(records):
-            if phase == "train":
-                self.current_example = index
-            max_len = max(max_len, len(record.token_ids))
-            if self.in_tokens:
-                to_append = (len(batch_records) + 1) * max_len <= batch_size
-            else:
-                to_append = len(batch_records) < batch_size
-            if to_append:
-                batch_records.append(record)
-            else:
-                yield self._pad_batch_records(batch_records, phase == "train")
-                batch_records, max_len = [record], len(record.token_ids)
-
-        if phase == 'pred' and batch_records:
-            yield self._pad_batch_records(batch_records, phase == "train")
-
-    def _pad_batch_records(self, batch_records, is_training):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-        if is_training:
-            batch_start_position = [
-                record.start_position for record in batch_records
-            ]
-            batch_end_position = [
-                record.end_position for record in batch_records
-            ]
-            batch_start_position = np.array(batch_start_position).astype(
-                "int64").reshape([-1, 1])
-            batch_end_position = np.array(batch_end_position).astype(
-                "int64").reshape([-1, 1])
-
-        else:
-            batch_size = len(batch_token_ids)
-            batch_start_position = np.zeros(
-                shape=[batch_size, 1], dtype="int64")
-            batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64")
-
-        batch_unique_ids = [record.unique_id for record in batch_records]
-        batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape(
-            [-1, 1])
-
-        # padding
-        padded_token_ids, input_mask = pad_batch_data(
-            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask, batch_start_position,
-            batch_end_position, batch_unique_ids
-        ]
-
-        return return_list
-
-    def get_num_examples(self, phase):
-        return len(self.features[phase])
-
-    def get_features(self, phase):
-        return self.features[phase]
-
-    def get_examples(self, phase):
-        return self.examples[phase]
-
-    def data_generator(self,
-                       input_file,
-                       batch_size,
-                       epoch,
-                       dev_count=1,
-                       shuffle=True,
-                       phase=None):
-
-        examples = self.examples.get(phase, None)
-        features = self.features.get(phase, None)
-        if not examples:
-            examples = self._read_json(input_file, phase == "train")
-            features = self._convert_example_to_feature(
-                examples, self.max_seq_len, self.tokenizer, phase == "train", remove_noanswer=self.remove_noanswer)
-            self.examples[phase] = examples
-            self.features[phase] = features
-
-        def wrapper():
-            all_dev_batches = []
-            if epoch is None:
-                num_epochs = 99999999
-            else:
-                num_epochs = epoch
-            for epoch_index in range(num_epochs):
-                if phase == "train":
-                    self.current_example = 0
-                    self.current_epoch = epoch_index
-                if phase == "train" and shuffle:
-                    np.random.shuffle(features)
-
-                for batch_data in self._prepare_batch_data(
-                        features, batch_size, phase=phase):
-                    if len(all_dev_batches) < dev_count:
-                        all_dev_batches.append(batch_data)
-                    if len(all_dev_batches) == dev_count:
-                        for batch in all_dev_batches:
-                            yield batch
-                        all_dev_batches = []
-
-        return wrapper
-
-
-if __name__ == '__main__':
-    pass
--- a/tasktype/__init__.py
+++ b/tasktype/__init__.py
--- a/tasktype/cls.py
+++ b/tasktype/cls.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid import layers
-from paddlepalm.interface import task_paradigm
-import numpy as np
-import os
-
-class TaskParadigm(task_paradigm):
-    '''
-    classification
-    '''
-    def __init__(self, config, phase, backbone_config=None):
-        self._is_training = phase == 'train'
-        self._hidden_size = backbone_config['hidden_size']
-        self.num_classes = config['n_classes']
-    
-        if 'initializer_range' in config:
-            self._param_initializer = config['initializer_range']
-        else:
-            self._param_initializer = fluid.initializer.TruncatedNormal(
-                scale=backbone_config.get('initializer_range', 0.02))
-        if 'dropout_prob' in config:
-            self._dropout_prob = config['dropout_prob']
-        else:
-            self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0)
-        self._pred_output_path = config.get('pred_output_path', None)
-        self._preds = []
-
-    @property
-    def inputs_attrs(self):
-        if self._is_training:
-            reader = {"label_ids": [[-1, 1], 'int64']}
-        else:
-            reader = {}
-        bb = {"sentence_embedding": [[-1, self._hidden_size], 'float32']}
-        return {'reader': reader, 'backbone': bb}
-
-    @property
-    def outputs_attrs(self):
-        if self._is_training:
-            return {'loss': [[1], 'float32']}
-        else:
-            return {'logits': [[-1, self.num_classes], 'float32']}
-
-    def build(self, inputs, scope_name=''):
-        sent_emb = inputs['backbone']['sentence_embedding']
-        if self._is_training:
-            label_ids = inputs['reader']['label_ids']
-            cls_feats = fluid.layers.dropout(
-                x=sent_emb,
-                dropout_prob=self._dropout_prob,
-                dropout_implementation="upscale_in_train")
-
-        logits = fluid.layers.fc(
-            input=sent_emb,
-            size=self.num_classes,
-            param_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_w",
-                initializer=self._param_initializer),
-            bias_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)))
-
-        if self._is_training:
-            loss = fluid.layers.softmax_with_cross_entropy(
-                logits=logits, label=label_ids)
-            loss = layers.mean(loss)
-            return {"loss": loss}
-        else:
-            return {"logits":logits}
-
-    def postprocess(self, rt_outputs):
-        if not self._is_training:
-            logits = rt_outputs['logits']
-            preds = np.argmax(logits, -1)
-            self._preds.extend(preds.tolist())
-
-    def epoch_postprocess(self, post_inputs):
-        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
-        if not self._is_training:
-            if self._pred_output_path is None:
-                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
-            with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer:
-                for p in self._preds:
-                    writer.write(str(p)+'\n')
-            print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json'))
-
-                
--- a/tasktype/match.py
+++ b/tasktype/match.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid import layers
-from paddlepalm.interface import task_paradigm
-import numpy as np
-import os
-
-class TaskParadigm(task_paradigm):
-    '''
-    matching
-    '''
-    def __init__(self, config, phase, backbone_config=None):
-        self._is_training = phase == 'train'
-        self._hidden_size = backbone_config['hidden_size']
-
-        if 'initializer_range' in config:
-            self._param_initializer = config['initializer_range']
-        else:
-            self._param_initializer = fluid.initializer.TruncatedNormal(
-                scale=backbone_config.get('initializer_range', 0.02))
-        if 'dropout_prob' in config:
-            self._dropout_prob = config['dropout_prob']
-        else:
-            self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0)
-
-        self._pred_output_path = config.get('pred_output_path', None)
-        self._preds = []
-
-    
-    @property
-    def inputs_attrs(self):
-        if self._is_training:
-            reader = {"label_ids": [[-1, 1], 'int64']}
-        else:
-            reader = {}
-        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
-        return {'reader': reader, 'backbone': bb}
-
-    @property
-    def outputs_attrs(self):
-        if self._is_training:
-            return {"loss": [[1], 'float32']}
-        else:
-            return {"logits": [[-1, 2], 'float32']}
-
-    def build(self, inputs, scope_name=""):
-        if self._is_training:
-            labels = inputs["reader"]["label_ids"] 
-        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
-
-        if self._is_training:
-            cls_feats = fluid.layers.dropout(
-                x=cls_feats,
-                dropout_prob=self._dropout_prob,
-                dropout_implementation="upscale_in_train")
-
-        logits = fluid.layers.fc(
-            input=cls_feats,
-            size=2,
-            param_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_w",
-                initializer=self._param_initializer),
-            bias_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_b",
-                initializer=fluid.initializer.Constant(0.)))
-
-        if self._is_training:
-            ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
-                logits=logits, label=labels, return_softmax=True)
-            loss = fluid.layers.mean(x=ce_loss)
-            return {'loss': loss}
-        else:
-            return {'logits': logits}
-
-    def postprocess(self, rt_outputs):
-        if not self._is_training:
-            logits = rt_outputs['logits']
-            preds = np.argmax(logits, -1)
-            self._preds.extend(preds.tolist())
-
-    def epoch_postprocess(self, post_inputs):
-        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
-        if not self._is_training:
-            if self._pred_output_path is None:
-                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
-            with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer:
-                for p in self._preds:
-                    writer.write(str(p)+'\n')
-            print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json'))
-
-                
--- a/tasktype/mlm.py
+++ b/tasktype/mlm.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddlepalm.interface import task_paradigm
-from paddle.fluid import layers
-from paddlepalm.backbone.utils.transformer import pre_process_layer
-
-class TaskParadigm(task_paradigm):
-    '''
-    matching
-    '''
-    def __init__(self, config, phase, backbone_config=None):
-        self._is_training = phase == 'train'
-        self._emb_size = backbone_config['hidden_size']
-        self._hidden_size = backbone_config['hidden_size']
-        self._vocab_size = backbone_config['vocab_size']
-        self._hidden_act = backbone_config['hidden_act']
-        self._initializer_range = backbone_config['initializer_range']
-    
-    @property
-    def inputs_attrs(self):
-        reader = {
-            "mask_label": [[-1, 1], 'int64'],
-            "mask_pos": [[-1, 1], 'int64']}
-        if not self._is_training:
-            del reader['mask_label']
-            del reader['batchsize_x_seqlen']
-        bb = {
-            "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
-            "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
-        return {'reader': reader, 'backbone': bb}
-
-    @property
-    def outputs_attrs(self):
-        if self._is_training:
-            return {"loss": [[1], 'float32']}
-        else:
-            return {"logits": [[-1], 'float32']}
-
-    def build(self, inputs, scope_name=""):
-        mask_pos = inputs["reader"]["mask_pos"]
-        if self._is_training:
-            mask_label = inputs["reader"]["mask_label"] 
-            max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
-            mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
-            mask_pos.stop_gradient = True
-
-        word_emb = inputs["backbone"]["embedding_table"]
-        enc_out = inputs["backbone"]["encoder_outputs"]
-
-        emb_size = word_emb.shape[-1]
-
-        _param_initializer = fluid.initializer.TruncatedNormal(
-            scale=self._initializer_range)
-
-        reshaped_emb_out = fluid.layers.reshape(
-            x=enc_out, shape=[-1, emb_size])
-
-        # extract masked tokens' feature
-        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-
-        # transform: fc
-        mask_trans_feat = fluid.layers.fc(
-            input=mask_feat,
-            size=emb_size,
-            act=self._hidden_act,
-            param_attr=fluid.ParamAttr(
-                name=scope_name+'mask_lm_trans_fc.w_0',
-                initializer=_param_initializer),
-            bias_attr=fluid.ParamAttr(name=scope_name+'mask_lm_trans_fc.b_0'))
-        # transform: layer norm
-        mask_trans_feat = pre_process_layer(
-            mask_trans_feat, 'n', name=scope_name+'mask_lm_trans')
-
-        mask_lm_out_bias_attr = fluid.ParamAttr(
-            name=scope_name+"mask_lm_out_fc.b_0",
-            initializer=fluid.initializer.Constant(value=0.0))
-
-        fc_out = fluid.layers.matmul(
-            x=mask_trans_feat,
-            y=word_emb,
-            transpose_y=True)
-        fc_out += fluid.layers.create_parameter(
-            shape=[self._vocab_size],
-            dtype='float32',
-            attr=mask_lm_out_bias_attr,
-            is_bias=True)
-
-        if self._is_training:
-            mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
-                logits=fc_out, label=mask_label)
-            loss = fluid.layers.mean(mask_lm_loss)
-            return {'loss': loss}
-        else:
-            return {'logits': fc_out}
-
-
--- a/tasktype/mrc.py
+++ b/tasktype/mrc.py
-# -*- coding: UTF-8 -*-
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddlepalm.interface import task_paradigm
-import collections
-import numpy as np
-import os
-import math
-import six
-import paddlepalm.tokenizer.ernie_tokenizer as tokenization
-import json
-
-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])
-
-class TaskParadigm(task_paradigm):
-    """"""
-
-    def __init__(self, config, phase, backbone_config=None):
-        
-        self._is_training = phase == 'train'
-        self._max_sequence_length = config['max_seq_len']
-        self._hidden_size = backbone_config['hidden_size']
-        self._pred_results = []
-        
-        if phase == 'pred':
-            self._max_answer_length = config.get('max_answer_len', None)
-            self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0)
-            self._n_best_size = config.get('n_best_size', 20)
-            self._pred_output_path = config.get('pred_output_path', None)
-            self._verbose = config.get('verbose', False)
-            self._with_negative = config.get('with_negative', False)
-            self._do_lower_case = config.get('do_lower_case', False)
-
-
-    @property
-    def inputs_attrs(self):
-        if self._is_training:
-            reader = {"start_positions": [[-1, 1], 'int64'],
-                      "end_positions": [[-1, 1], 'int64'],
-                      }
-        else:
-            reader = {'unique_ids': [[-1, 1], 'int64']}
-        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
-        return {'reader': reader, 'backbone': bb}
-        
-    @property
-    def epoch_inputs_attrs(self):
-        if not self._is_training:
-            from_reader = {'examples': None, 'features': None}
-            return {'reader': from_reader}
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {'loss': [[1], 'float32']}
-        else:
-            return {'start_logits': [[-1, -1, 1], 'float32'],
-                    'end_logits': [[-1, -1, 1], 'float32'],
-                    'unique_ids': [[-1, 1], 'int64']}
-
-
-    def build(self, inputs, scope_name=""):
-        if self._is_training:
-            start_positions = inputs['reader']['start_positions']
-            end_positions = inputs['reader']['end_positions']
-            max_position = inputs["reader"]["seqlen"] - 1
-            start_positions = fluid.layers.elementwise_min(start_positions, max_position)
-            end_positions = fluid.layers.elementwise_min(end_positions, max_position)
-            start_positions.stop_gradient = True
-            end_positions.stop_gradient = True
-        else:
-            unique_id = inputs['reader']['unique_ids']
-
-        enc_out = inputs['backbone']['encoder_outputs']
-        logits = fluid.layers.fc(
-            input=enc_out,
-            size=2,
-            num_flatten_dims=2,
-            param_attr=fluid.ParamAttr(
-                name=scope_name+"cls_squad_out_w",
-                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=fluid.ParamAttr(
-                name=scope_name+"cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
-
-        logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
-        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
-
-        def _compute_single_loss(logits, positions):
-            """Compute start/end loss for mrc model"""
-            loss = fluid.layers.softmax_with_cross_entropy(
-                logits=logits, label=positions)
-            loss = fluid.layers.mean(x=loss)
-            return loss
-
-        if self._is_training:
-            start_loss = _compute_single_loss(start_logits, start_positions)
-            end_loss = _compute_single_loss(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2.0
-            return {'loss': total_loss}
-        else:
-            return {'start_logits': start_logits,
-                    'end_logits': end_logits,
-                    'unique_ids': unique_id}
-
-
-    def postprocess(self, rt_outputs):
-        """this func will be called after each step(batch) of training/evaluating/predicting process."""
-        if not self._is_training:
-            unique_ids = np.squeeze(rt_outputs['unique_ids'], -1)
-            start_logits = rt_outputs['start_logits']
-            end_logits = rt_outputs['end_logits']
-            for idx in range(len(unique_ids)):
-                
-                if unique_ids[idx] < 0:
-                    continue
-                if len(self._pred_results) % 1000 == 0:
-                    print("Predicting example: {}".format(len(self._pred_results)))
-                uid = int(unique_ids[idx])
-
-                s = [float(x) for x in start_logits[idx].flat]
-                e = [float(x) for x in end_logits[idx].flat]
-                self._pred_results.append(
-                    RawResult(
-                        unique_id=uid,
-                        start_logits=s,
-                        end_logits=e))
-
-    def epoch_postprocess(self, post_inputs):
-        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
-
-        if not self._is_training:
-            if self._pred_output_path is None:
-                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
-            examples = post_inputs['reader']['examples']
-            features = post_inputs['reader']['features']
-            if not os.path.exists(self._pred_output_path):
-                os.makedirs(self._pred_output_path)
-            output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
-            output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
-            output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
-            _write_predictions(examples, features, self._pred_results,
-                              self._n_best_size, self._max_answer_length,
-                              self._do_lower_case, output_prediction_file,
-                              output_nbest_file, output_null_log_odds_file,
-                              self._with_negative,
-                              self._null_score_diff_threshold, self._verbose)
-
-
-def _write_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file,
-                      with_negative, null_score_diff_threshold,
-                      verbose):
-    """Write final predictions to the json file and log-odds of null if needed."""
-    print("Writing predictions to: %s" % (output_prediction_file))
-    print("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", [
-            "feature_index", "start_index", "end_index", "start_logit",
-            "end_logit"
-        ])
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min mull score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[
-                    0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
-
-        if with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
-
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
-                                                              )]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
-                                                                 1)]
-                tok_text = " ".join(tok_tokens)
-
-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
-
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-
-                final_text = _get_final_text(tok_text, orig_text, do_lower_case,
-                                            verbose)
-                if final_text in seen_predictions:
-                    continue
-
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
-
-        # if we didn't inlude the empty option in the n-best, inlcude it
-        if with_negative:
-            if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(
-                _NbestPrediction(
-                    text="empty", start_logit=0.0, end_logit=0.0))
-
-        assert len(nbest) >= 1
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-        # debug
-        if best_non_null_entry is None:
-            print("Emmm..., sth wrong")
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-
-        if not with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-
-def _get_final_text(pred_text, orig_text, do_lower_case, verbose):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the MRQA eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heruistic between
-    # `pred_text` and `orig_text` to get a character-to-charcter alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
-
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose:
-            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose:
-            print("Length not equal after stripping spaces: '%s' vs '%s'",
-                  orig_ns_text, tok_ns_text)
-        return orig_text
-
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
-        tok_s_to_ns_map[tok_index] = i
-
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-
-    if orig_start_position is None:
-        if verbose:
-            print("Couldn't map start position")
-        return orig_text
-
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-
-    if orig_end_position is None:
-        if verbose:
-            print("Couldn't map end position")
-        return orig_text
-
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(
-        enumerate(logits), key=lambda x: x[1], reverse=True)
-
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-
-
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
-
-