Commit 174d3fec authored by xixiaoyao

Merge branch 'r0.3-api' of https://github.com/PaddlePaddle/PALM into r0.3-api

@@ -50,7 +50,7 @@ cd PALM && python setup.py install
**Environment dependencies**
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6.1 (please refer to the [installation guide](http://www.paddlepaddle.org/#quick-start) for installation)
...
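A quick sanity check that the PaddlePaddle requirement is satisfied (a minimal sketch, not part of the original README):

```python
import paddle
print(paddle.__version__)  # expect 1.6.1 or later
```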
../../pretrain/
\ No newline at end of file
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddlepalm as palm
import sys
import argparse
# create parser
parser = argparse.ArgumentParser(prog='download_models.py', usage='python %(prog)s -l | -d <model_name> [-h]\n\nFor example,\n\tpython %(prog)s -d bert-en-uncased-large ', description='Download pretrain models for initializing params of backbones.')
parser1 = parser.add_argument_group("required arguments")
parser1.add_argument('-l', '--list', action='store_true', help='show the list of available pretrain models', default=False)
parser1.add_argument('-d', '--download', action='store', help='download pretrain models. The available pretrain models can be listed by running "python download_models.py -l"')
args = parser.parse_args()

if args.list:
    palm.downloader.ls('pretrain')
elif args.download:
    print('downloading %s...' % args.download)
    palm.downloader.download('pretrain', args.download)
else:
    parser.print_help()
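The same downloader can also be driven directly from Python; a minimal sketch (the model name is one of the keys listed in `_items` below):

```python
import paddlepalm as palm

palm.downloader.ls('pretrain')                                 # list available pretrain models
palm.downloader.download('pretrain', 'ernie-ch-uncased-base')  # fetch one of them
```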
@@ -34,13 +34,16 @@ _items = {
    'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
                 'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
                 'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
                 'ernie-ch-uncased-base': 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz',
                 'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz',
                 'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz',
                 'utils': None},
    'reader': {'utils': None},
    'backbone': {'utils': None},
    'tasktype': {'utils': None},
}

def _download(item, scope, path, silent=False, convert=False):
    data_url = _items[item][scope]
    if data_url is None:
        return

@@ -100,6 +103,7 @@ def _download(item, scope, path, silent=False):
        os.removedirs(source_path)
    if not silent:
        print('done!')
    if convert:
        if not silent:
            print('Converting params...', end=" ")
        _convert(data_dir, silent)
...
@@ -29,22 +29,31 @@ from paddlepalm.backbone.base_backbone import Backbone

class BERT(Backbone):

    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                 max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \
                 attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        self._emb_size = hidden_size
        self._n_layer = num_hidden_layers
        self._n_head = num_attention_heads
        self._voc_size = vocab_size
        self._max_position_seq_len = max_position_embeddings
        self._sent_types = type_vocab_size

        self._hidden_act = hidden_act
        self._prepostprocess_dropout = hidden_dropout_prob
        self._attention_dropout = attention_probs_dropout_prob

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._phase = phase
        self._is_pairwise = is_pairwise
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)
    @classmethod
    def from_config(cls, config, phase='train'):

@@ -62,40 +71,57 @@ class BERT(Backbone):
            "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
        assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range')

        hidden_size = config['hidden_size']
        num_hidden_layers = config['num_hidden_layers']
        num_attention_heads = config['num_attention_heads']
        vocab_size = config['vocab_size']
        max_position_embeddings = config['max_position_embeddings']
        if 'sent_type_vocab_size' in config:
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']

        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                   max_position_embeddings, sent_type_vocab_size, \
                   hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               }
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):
        src_ids = inputs['token_ids']

@@ -104,6 +130,21 @@ class BERT(Backbone):
        input_mask = inputs['input_mask']
        self._emb_dtype = 'float32'

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,

@@ -174,12 +215,25 @@ class BERT(Backbone):
                param_attr=fluid.ParamAttr(
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
        return ret

    def postprocess(self, rt_outputs):
        pass
...
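As a usage sketch (not part of this commit), the BERT backbone above can be built from a plain config dict; the keys below mirror the ones read by `from_config`, and the values are illustrative only:

```python
# Assumes BERT is the class defined above; all values are hypothetical examples.
bert_config = {
    'hidden_size': 768, 'num_hidden_layers': 12, 'num_attention_heads': 12,
    'vocab_size': 30522, 'max_position_embeddings': 512, 'type_vocab_size': 2,
    'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1, 'initializer_range': 0.02,
    'is_pairwise': False,
}
backbone = BERT.from_config(bert_config, phase='train')
print(backbone.inputs_attr)  # token_ids, position_ids, segment_ids, input_mask
```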
@@ -31,7 +31,7 @@ class ERNIE(Backbone):

    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                 max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
                 hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        # self._is_training = phase == 'train'  # the backbone usually does not need to care about the phase, since its outputs are basically the same in every phase

@@ -53,7 +53,8 @@ class ERNIE(Backbone):
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._is_pairwise = is_pairwise
        self._phase = phase
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)

@@ -65,7 +66,7 @@ class ERNIE(Backbone):
        assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size')
        assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings')
        assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, "{} is required to initialize ERNIE".format('sent_type_vocab_size')
        # assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size')
        assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act')
        assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob')
        assert 'attention_probs_dropout_prob' in config, "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')

@@ -80,40 +81,76 @@ class ERNIE(Backbone):
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']
        if 'task_type_vocab_size' in config:
            task_type_vocab_size = config['task_type_vocab_size']
        else:
            task_type_vocab_size = config['type_vocab_size']
        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                   max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
                   hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase=phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               "task_ids": [[-1, -1], 'int64']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        "task_ids_neg": [[-1, -1], 'int64']
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):
        src_ids = inputs['token_ids']
        pos_ids = inputs['position_ids']
        sent_ids = inputs['segment_ids']
        input_mask = inputs['input_mask']
        task_ids = inputs['task_ids']

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            task_ids = inputs['task_ids_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,

@@ -183,7 +220,6 @@ class ERNIE(Backbone):
                param_initializer=self._param_initializer,
                name=scope_name+'encoder')

            next_sent_feat = fluid.layers.slice(
                input=enc_out, axes=[1], starts=[0], ends=[1])
            next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])

@@ -195,11 +231,25 @@ class ERNIE(Backbone):
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
        return ret

    def postprocess(self, rt_outputs):
        pass
...
@@ -24,7 +24,6 @@ def yield_pieces(data, distribute_strategy, batch_size):
        assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors."
        data_list = data
        ds_list = distribute_strategy
    stride = batch_size // dev_count
    p = stride
    # while p < len(data_list) + stride:

@@ -57,7 +56,8 @@ def yield_pieces(data, distribute_strategy, batch_size):
            # print(len(temp))
            yield temp


def data_feeder(reader, postprocess_fn=None, prefetch_steps=2, phase='train'):
    if postprocess_fn is None:
        def postprocess_fn(batch):
            return batch
...
from cls import Classify
from match import Match
from ner import SequenceLabel
from mrc import MRC
from mlm import MaskLM
@@ -13,41 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import json


def computeHingeLoss(pos, neg, margin):
    loss_part1 = fluid.layers.elementwise_sub(
        fluid.layers.fill_constant_batch_size_like(
            input=pos, shape=[-1, 1], value=margin, dtype='float32'), pos)
    loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
    loss_part3 = fluid.layers.elementwise_max(
        fluid.layers.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), loss_part2)
    return loss_part3
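For reference, `computeHingeLoss` above evaluates the standard pairwise hinge loss max(0, margin - pos + neg). A minimal NumPy sketch of the same computation (illustrative only, not part of the commit):

```python
import numpy as np

def hinge_loss_np(pos, neg, margin=0.5):
    # max(0, margin - pos + neg), computed per example
    return np.maximum(0.0, margin - pos + neg)

print(hinge_loss_np(np.array([0.9, 0.4]), np.array([0.2, 0.6])))  # [0.  0.7]
```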
class Match(Head):
    '''
    matching
    '''

    def __init__(self, num_classes, input_dim, dropout_prob=0.0, param_initializer_range=0.02, \
                 learning_strategy='pointwise', margin=0.5, phase='train'):
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._num_classes = num_classes

        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._learning_strategy = learning_strategy
        self._margin = margin

        self._preds = []
        self._preds_logits = []
    @property
    def inputs_attrs(self):
        reader = {}
        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
        if self._is_training:
            if self._learning_strategy == 'pointwise':
                reader["label_ids"] = [[-1], 'int64']
            elif self._learning_strategy == 'pairwise':
                bb["sentence_pair_embedding_neg"] = [[-1, self._hidden_size], 'float32']
        return {'reader': reader, 'backbone': bb}

    @property
@@ -55,51 +80,110 @@ class TaskParadigm(task_paradigm):
        if self._is_training:
            return {"loss": [[1], 'float32']}
        else:
            if self._learning_strategy == 'pairwise':
                return {"probs": [[-1, 1], 'float32']}
            else:
                return {"logits": [[-1, 2], 'float32'],
                        "probs": [[-1, 2], 'float32']}
    def build(self, inputs, scope_name=""):

        # inputs
        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
        if self._is_training:
            cls_feats = fluid.layers.dropout(
                x=cls_feats,
                dropout_prob=self._dropout_prob,
                dropout_implementation="upscale_in_train")
            if self._learning_strategy == 'pairwise':
                cls_feats_neg = inputs["backbone"]["sentence_pair_embedding_neg"]
                cls_feats_neg = fluid.layers.dropout(
                    x=cls_feats_neg,
                    dropout_prob=self._dropout_prob,
                    dropout_implementation="upscale_in_train")
            elif self._learning_strategy == 'pointwise':
                labels = inputs["reader"]["label_ids"]

        # loss
        # for pointwise
        if self._learning_strategy == 'pointwise':
            logits = fluid.layers.fc(
                input=cls_feats,
                size=self._num_classes,
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b",
                    initializer=fluid.initializer.Constant(0.)))
            probs = fluid.layers.softmax(logits)
            if self._is_training:
                ce_loss = fluid.layers.cross_entropy(
                    input=probs, label=labels)
                loss = fluid.layers.mean(x=ce_loss)
                return {'loss': loss}
            # for pred
            else:
                return {'logits': logits,
                        'probs': probs}

        # for pairwise
        elif self._learning_strategy == 'pairwise':
            pos_score = fluid.layers.fc(
                input=cls_feats,
                size=1,
                act="sigmoid",
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w_pr",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b_pr",
                    initializer=fluid.initializer.Constant(0.)))
            pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1], inplace=True)

            if self._is_training:
                neg_score = fluid.layers.fc(
                    input=cls_feats_neg,
                    size=1,
                    act="sigmoid",
                    param_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_w_pr",
                        initializer=self._param_initializer),
                    bias_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_b_pr",
                        initializer=fluid.initializer.Constant(0.)))
                neg_score = fluid.layers.reshape(x=neg_score, shape=[-1, 1], inplace=True)

                loss = fluid.layers.mean(computeHingeLoss(pos_score, neg_score, self._margin))
                return {'loss': loss}
            # for pred
            else:
                return {'probs': pos_score}
    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            probs = []
            logits = []
            probs = rt_outputs['probs']
            self._preds.extend(probs.tolist())
            if self._learning_strategy == 'pointwise':
                logits = rt_outputs['logits']
                self._preds_logits.extend(logits.tolist())

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                for i in range(len(self._preds)):
                    if self._learning_strategy == 'pointwise':
                        label = 0 if self._preds[i][0] > self._preds[i][1] else 1
                        result = {'index': i, 'label': label, 'logits': self._preds_logits[i], 'probs': self._preds[i]}
                    elif self._learning_strategy == 'pairwise':
                        label = 0 if self._preds[i][0] < 0.5 else 1
                        result = {'index': i, 'label': label, 'probs': self._preds[i][0]}
                    result = json.dumps(result)
                    writer.write(result+'\n')
            print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
\ No newline at end of file
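For context, a minimal sketch of how the two learning strategies of the Match head above would be instantiated (argument values are illustrative, not prescribed by the commit):

```python
# input_dim should match the backbone's hidden size; values here are examples only.
pointwise_head = Match(num_classes=2, input_dim=768,
                       learning_strategy='pointwise', phase='train')
pairwise_head = Match(num_classes=2, input_dim=768, margin=0.5,
                      learning_strategy='pairwise', phase='train')
```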
@@ -14,30 +14,39 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
from paddle.fluid import layers
import numpy as np
import os
from paddlepalm.backbone.utils.transformer import pre_process_layer


class MaskLM(Head):
    '''
    mlm
    '''
    def __init__(self, input_dim, vocab_size, hidden_act, initializer_range, dropout_prob=0.0, \
                 param_initializer_range=0.02, phase='train'):
        self._is_training = phase == 'train'
        self._emb_size = input_dim
        self._hidden_size = input_dim
        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._preds = []

        self._vocab_size = vocab_size
        self._hidden_act = hidden_act
        self._initializer_range = initializer_range
    @property
    def inputs_attrs(self):
        reader = {
            "token_ids": [[-1, -1], 'int64'],
            "mask_label": [[-1], 'int64'],
            "mask_pos": [[-1], 'int64'],
            }
        if not self._is_training:
            del reader['mask_label']
        bb = {
            "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
            "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
@@ -54,7 +63,13 @@ class TaskParadigm(task_paradigm):
        mask_pos = inputs["reader"]["mask_pos"]
        if self._is_training:
            mask_label = inputs["reader"]["mask_label"]
            l1 = fluid.layers.shape(inputs["reader"]["token_ids"])[0]
            # bxs = inputs["reader"]["token_ids"].shape[2].value
            l2 = fluid.layers.shape(inputs["reader"]["token_ids"][0])[0]
            bxs = (l1*l2).astype(np.int64)
            # max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
            max_position = bxs - 1
            mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
            mask_pos.stop_gradient = True

@@ -100,11 +115,31 @@ class TaskParadigm(task_paradigm):
            is_bias=True)

        if self._is_training:
            inputs = fluid.layers.softmax(fc_out)
            mask_lm_loss = fluid.layers.cross_entropy(
                input=inputs, label=mask_label)
            loss = fluid.layers.mean(mask_lm_loss)
            return {'loss': loss}
        else:
            return {'logits': fc_out}
    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            logits = rt_outputs['logits']
            preds = np.argmax(logits, -1)
            self._preds.extend(preds.tolist())
            return preds

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                for p in self._preds:
                    print(p)
            else:
                with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                    for p in self._preds:
                        writer.write(str(p)+'\n')
                print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
@@ -14,7 +14,7 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
import collections
import numpy as np
import os

@@ -26,34 +26,37 @@ import json
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


class MRC(Head):
    """
    Machine Reading Comprehension
    """

    def __init__(self, max_query_len, input_dim, pred_output_path=None, verbose=False, with_negative=False, do_lower_case=False, max_ans_len=None, null_score_diff_threshold=0.0, n_best_size=20, phase='train'):

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._max_sequence_length = max_query_len
        self._pred_results = []

        output_dir = pred_output_path
        self._max_answer_length = max_ans_len
        self._null_score_diff_threshold = null_score_diff_threshold
        self._n_best_size = n_best_size
        output_dir = pred_output_path
        self._verbose = verbose
        self._with_negative = with_negative
        self._do_lower_case = do_lower_case
    @property
    def inputs_attrs(self):
        if self._is_training:
            reader = {"start_positions": [[-1], 'int64'],
                      "end_positions": [[-1], 'int64'],
                      }
        else:
            reader = {'unique_ids': [[-1], 'int64']}
        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
        return {'reader': reader, 'backbone': bb}

@@ -70,21 +73,26 @@ class TaskParadigm(task_paradigm):
        else:
            return {'start_logits': [[-1, -1, 1], 'float32'],
                    'end_logits': [[-1, -1, 1], 'float32'],
                    'unique_ids': [[-1], 'int64']}
    def build(self, inputs, scope_name=""):
        if self._is_training:
            start_positions = inputs['reader']['start_positions']
            end_positions = inputs['reader']['end_positions']
            # max_position = inputs["reader"]["seqlen"] - 1
            # start_positions = fluid.layers.elementwise_min(start_positions, max_position)
            # end_positions = fluid.layers.elementwise_min(end_positions, max_position)
            start_positions.stop_gradient = True
            end_positions.stop_gradient = True
        else:
            unique_id = inputs['reader']['unique_ids']
            # It's used to help fetch variable 'unique_ids' that will be removed in the future
            helper_constant = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
            fluid.layers.elementwise_mul(unique_id, helper_constant)

        enc_out = inputs['backbone']['encoder_outputs']
        logits = fluid.layers.fc(
            input=enc_out,

@@ -100,9 +108,11 @@ class TaskParadigm(task_paradigm):
        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

        def _compute_single_loss(logits, positions):
            """Compute start/end loss for mrc model"""
            inputs = fluid.layers.softmax(logits)
            loss = fluid.layers.cross_entropy(
                input=inputs, label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

@@ -117,10 +127,10 @@ class TaskParadigm(task_paradigm):
            'unique_ids': unique_id}
    def batch_postprocess(self, rt_outputs):
        """this func will be called after each step(batch) of training/evaluating/predicting process."""
        if not self._is_training:
            unique_ids = rt_outputs['unique_ids']
            start_logits = rt_outputs['start_logits']
            end_logits = rt_outputs['end_logits']
            for idx in range(len(unique_ids)):

@@ -139,19 +149,19 @@ class TaskParadigm(task_paradigm):
                    start_logits=s,
                    end_logits=e))

    def epoch_postprocess(self, post_inputs, output_dir=None):
        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            examples = post_inputs['reader']['examples']
            features = post_inputs['reader']['features']
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_prediction_file = os.path.join(output_dir, "predictions.json")
            output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")
            _write_predictions(examples, features, self._pred_results,
                               self._n_best_size, self._max_answer_length,
                               self._do_lower_case, output_prediction_file,
@@ -194,8 +204,9 @@ def _write_predictions(all_examples, all_features, all_results, n_best_size,
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score

        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
...
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import math
class SequenceLabel(Head):
    '''
    Sequence label
    '''
    def __init__(self, num_classes, input_dim, dropout_prob=0.0, learning_rate=1e-3, \
                 param_initializer_range=0.02, phase='train'):
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
        """
        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self.num_classes = num_classes

        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self.learning_rate = learning_rate

        self._preds = []

    @property
    def inputs_attrs(self):
        reader = {}
        bb = {"encoder_outputs": [[-1, -1, -1], 'float32']}
        if self._is_training:
            reader["label_ids"] = [[-1, -1], 'int64']
            reader["seq_lens"] = [[-1], 'int64']
        return {'reader': reader, 'backbone': bb}

    @property
    def outputs_attrs(self):
        if self._is_training:
            return {'loss': [[1], 'float32']}
        else:
            return {'emission': [[-1, self.num_classes], 'float32']}

    def build(self, inputs, scope_name=''):
        token_emb = inputs['backbone']['encoder_outputs']
        if self._is_training:
            label_ids = inputs['reader']['label_ids']
            seq_lens = inputs['reader']['seq_lens']

        emission = fluid.layers.fc(
            size=self.num_classes,
            input=token_emb,
            param_attr=fluid.ParamAttr(
                initializer=self._param_initializer,
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)),
            bias_attr=fluid.ParamAttr(
                name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)),
            num_flatten_dims=2)

        if self._is_training:
            # compute loss
            crf_cost = fluid.layers.linear_chain_crf(
                input=emission,
                label=label_ids,
                param_attr=fluid.ParamAttr(
                    name=scope_name+'crfw', learning_rate=self.learning_rate),
                length=seq_lens)

            avg_cost = fluid.layers.mean(x=crf_cost)
            crf_decode = fluid.layers.crf_decoding(
                input=emission,
                param_attr=fluid.ParamAttr(name=scope_name+'crfw'),
                length=seq_lens)

            (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
             num_correct_chunks) = fluid.layers.chunk_eval(
                input=crf_decode,
                label=label_ids,
                chunk_scheme="IOB",
                num_chunk_types=int(math.ceil((self.num_classes - 1) / 2.0)),
                seq_length=seq_lens)
            chunk_evaluator = fluid.metrics.ChunkEvaluator()
            chunk_evaluator.reset()

            return {"loss": avg_cost}
        else:
            return {"emission": emission}

    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            emission = rt_outputs['emission']
            preds = np.argmax(emission, -1)
            self._preds.extend(preds.tolist())

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                for p in self._preds:
                    writer.write(str(p)+'\n')
            print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
from cls import ClassifyReader
from match import MatchReader
from ner import SequenceLabelReader
from mrc import MrcReader
from mlm import MaskLMReader
@@ -42,7 +42,6 @@ class Reader(object):
            self._register.add(attr_name)

    def register_with(self, backbone):
        for attr in backbone.inputs_attr:
            self.require_attr(attr)
        self._registered_backbone = backbone
...
@@ -13,87 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader


class MatchReader(Reader):

    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
                 do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''):  # add whatever arguments are needed
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        if phase == 'train':
            if learning_strategy == 'pointwise':
                self._register.add('label_ids')
            if learning_strategy == 'pairwise':
                self._register.add('token_ids_neg')
                self._register.add('position_ids_neg')
                self._register.add('segment_ids_neg')
                self._register.add('input_mask_neg')
                self._register.add('task_ids_neg')

        self._is_training = phase == 'train'
        self._learning_strategy = learning_strategy

        match_reader = CLSReader(vocab_path,
                                 max_seq_len=max_len,
                                 do_lower_case=do_lower_case,
                                 for_cn=for_cn,
                                 random_seed=seed,
                                 learning_strategy=learning_strategy)
        self._reader = match_reader
        self._dev_count = dev_count
        self._phase = phase
    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "task_ids": [[-1, -1], 'int64'],
                 "label_ids": [[-1], 'int64'],
                 "token_ids_neg": [[-1, -1], 'int64'],
                 "position_ids_neg": [[-1, -1], 'int64'],
                 "segment_ids_neg": [[-1, -1], 'int64'],
                 "input_mask_neg": [[-1, -1, 1], 'float32'],
                 "task_ids_neg": [[-1, -1], 'int64']
                 }
        return self._get_registed_attrs(attrs)

    def load_data(self, input_file, batch_size, num_epochs=None, \
                  file_format='tsv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 'label_ids', \
                 'token_ids_neg', 'segment_ids_neg', 'position_ids_neg', 'task_ids_neg', 'input_mask_neg']

        if self._learning_strategy == 'pairwise':
            names.remove('label_ids')

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            yield ret

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
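A minimal usage sketch of the MatchReader defined above; the vocab and data file paths are placeholders, not files from this repository:

```python
# Placeholder paths and illustrative hyper-parameters.
reader = MatchReader('vocab.txt', max_len=128, lang='en',
                     learning_strategy='pairwise', phase='train')
reader.load_data('train.tsv', batch_size=32, num_epochs=2)
print(reader.num_examples)
```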
@@ -13,79 +13,79 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MaskLMReader as MLMReader
import numpy as np


class MaskLMReader(Reader):

    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
                 lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        self._register.add('mask_pos')
        if phase == 'train':
            self._register.add('mask_label')
        self._is_training = phase == 'train'

        mlm_reader = MLMReader(vocab_path,
                               max_seq_len=max_len,
                               do_lower_case=do_lower_case,
                               for_cn=for_cn,
                               random_seed=seed)
        self._reader = mlm_reader

        self._phase = phase
        self._dev_count = dev_count
@property @property
def outputs_attr(self): def outputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'], attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'], "position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'], "segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'], "input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1], 'int64'], "task_ids": [[-1, -1], 'int64'],
"mask_label": [[-1], 'int64'], "mask_label": [[-1], 'int64'],
"mask_pos": [[-1], 'int64'], "mask_pos": [[-1], 'int64']
} }
return self._get_registed_attrs(attrs)
def load_data(self): def load_data(self, input_file, batch_size, num_epochs=None, \
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) file_format='csv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def iterator(self): def _iterator(self):
def list_to_dict(x):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos'] 'task_ids', 'mask_label', 'mask_pos']
outputs = {n: i for n,i in zip(names, x)}
# outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
return outputs
for batch in self._data_generator(): for batch in self._data_generator():
# print(np.shape(list_to_dict(batch)['token_ids'])) outputs = {n: i for n,i in zip(names, batch)}
# print(list_to_dict(batch)['mask_label'].tolist()) ret = {}
yield list_to_dict(batch) # TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self): def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase), return {'examples': self._reader.get_examples(self._phase),
...@@ -95,3 +95,7 @@ class Reader(reader): ...@@ -95,3 +95,7 @@ class Reader(reader):
def num_examples(self): def num_examples(self):
return self._reader.get_num_examples(phase=self._phase) return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
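The refactor above replaces the old config-dict constructor with explicit keyword arguments plus a separate `load_data()` call. A minimal usage sketch under that API follows; the vocab path, data file, and sizes are placeholders, and the import path is assumed from the package layout rather than shown in this commit:

```python
# Hypothetical usage of the refactored reader API; file paths, sizes and the
# module path `paddlepalm.reader` are assumptions for illustration only.
from paddlepalm.reader import MaskLMReader

reader = MaskLMReader(vocab_path='pretrain/ernie-en-uncased-large/vocab.txt',
                      max_len=128, seed=42, phase='train')
# settings that used to live in the config dict now arrive at load time
reader.load_data('data/mlm/train.tsv', batch_size=32, num_epochs=2)

print(reader.outputs_attr)   # shapes/dtypes of the fields this reader emits
print(reader.num_examples)   # counted by the underlying MLMReader
```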
@@ -13,77 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MRCReader
import numpy as np


class MrcReader(Reader):

    def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
                 remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, predict
            lang: en, cn, ...
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        if phase == 'train':
            self._register.add("start_positions")
            self._register.add("end_positions")
        else:
            self._register.add("unique_ids")

        self._is_training = phase == 'train'

        mrc_reader = MRCReader(vocab_path,
                               max_seq_len=max_len,
                               do_lower_case=do_lower_case,
                               tokenizer=tokenizer,
                               doc_stride=doc_stride,
                               remove_noanswer=remove_noanswer,
                               max_query_length=max_query_len,
                               for_cn=for_cn,
                               random_seed=seed)
        self._reader = mrc_reader
        self._phase = phase
        self._dev_count = dev_count

    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "start_positions": [[-1], 'int64'],
                 "end_positions": [[-1], 'int64'],
                 "task_ids": [[-1, -1], 'int64'],
                 "unique_ids": [[-1], 'int64']
                 }
        return self._get_registed_attrs(attrs)

    @property
    def epoch_outputs_attr(self):
@@ -91,26 +80,34 @@ class Reader(reader):
        return {"examples": None,
                "features": None}

    def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
                 'start_positions', 'end_positions', 'unique_ids']
        if self._is_training:
            names.remove('unique_ids')

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            if not self._is_training:
                assert 'unique_ids' in ret, ret
            yield ret

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

@@ -118,3 +115,7 @@ class Reader(reader):
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
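`doc_stride` controls the sliding-window split that the underlying MRCReader applies to passages longer than `max_len`. The self-contained sketch below illustrates only the windowing idea; it is not the library's implementation, and the numbers are arbitrary:

```python
# Standalone illustration of doc_stride-style windowing over a long token list.
# This mimics the idea only; MRCReader's actual feature building differs.
def sliding_windows(doc_tokens, max_tokens_for_doc, doc_stride):
    spans = []
    start = 0
    while start < len(doc_tokens):
        length = min(max_tokens_for_doc, len(doc_tokens) - start)
        spans.append((start, doc_tokens[start:start + length]))
        if start + length >= len(doc_tokens):
            break
        start += doc_stride
    return spans

tokens = ['tok%d' % i for i in range(20)]
for offset, span in sliding_windows(tokens, max_tokens_for_doc=8, doc_stride=4):
    print(offset, span)
```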
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader


class SequenceLabelReader(Reader):

    def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
                 lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, predict
            lang: en, cn, ...
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        self._register.add('seq_lens')
        if phase == 'train':
            self._register.add('label_ids')

        self._is_training = phase == 'train'

        ner_reader = SLReader(vocab_path,
                              max_seq_len=max_len,
                              do_lower_case=do_lower_case,
                              for_cn=for_cn,
                              random_seed=seed,
                              label_map_config=label_map_config)
        self._reader = ner_reader

        self._phase = phase
        self._dev_count = dev_count

    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "task_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "seq_lens": [[-1], 'int64'],
                 "label_ids": [[-1, -1], 'int64']}
        return self._get_registed_attrs(attrs)

    def load_data(self, input_file, batch_size, num_epochs=None, \
                  file_format='tsv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
                 'label_ids', 'seq_lens']

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            yield ret

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
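`label_map_config` points the underlying SLReader at a label-to-id mapping; judging from `self.label_map[label]` and `no_entity_id = len(self.label_map) - 1` later in this commit, it is used as a dict keyed by label string. A hypothetical way to prepare such a file (the on-disk JSON layout is an assumption; only the loaded dict usage is visible in the diff):

```python
# Hypothetical label map for a BIO-style NER task; the JSON layout is assumed,
# only the label->id dict lookups appear in this commit.
import json

label_map = {"B-PER": 0, "I-PER": 1, "B-LOC": 2, "I-LOC": 3, "O": 4}
with open('data/ner/label_map.json', 'w') as f:
    json.dump(label_map, f)

# the last id then plays the role of no_entity_id (len(label_map) - 1 == 4)
```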
@@ -19,12 +19,26 @@ from __future__ import print_function
import numpy as np

def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3, dev_count=1):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos corresponds to the batch_tokens after padding;
"""
max_len = max([len(sent) for sent in batch_tokens])

multidev_batch_tokens = []
multidev_mask_label = []
multidev_mask_pos = []

big_batch_tokens = batch_tokens
stride = len(batch_tokens) // dev_count
if stride == 0:
return None, None, None
p = stride

for i in range(dev_count):
batch_tokens = big_batch_tokens[p-stride:p]
p += stride
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
@@ -69,7 +83,12 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1])

multidev_batch_tokens.extend(batch_tokens)
multidev_mask_label.append(mask_label)
multidev_mask_pos.append(mask_pos)

return multidev_batch_tokens, multidev_mask_label, multidev_mask_pos
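With `dev_count > 1`, `mask()` now cuts the incoming batch into slices of `len(batch_tokens) // dev_count` items and accumulates one `mask_label`/`mask_pos` array per slice, so each device receives positions computed against its own sub-batch. A standalone sketch of just that slicing arithmetic (toy data, no masking logic):

```python
# Toy illustration of the per-device slicing used above; batch items are ints
# instead of token lists, and no masking is performed.
def split_for_devices(batch, dev_count):
    stride = len(batch) // dev_count
    if stride == 0:
        return None          # batch smaller than dev_count, as in mask()
    pieces, p = [], stride
    for _ in range(dev_count):
        pieces.append(batch[p - stride:p])
        p += stride
    return pieces

print(split_for_devices(list(range(10)), 2))  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
print(split_for_devices(list(range(10)), 3))  # trailing remainder is dropped
```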
def prepare_batch_data(insts,
@@ -83,7 +102,8 @@ def prepare_batch_data(insts,
task_id=0,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
dev_count=1):
"""
1. generate Tensor of data
2. generate Tensor of position
@@ -101,7 +121,8 @@ def prepare_batch_data(insts,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id,
dev_count=dev_count)
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out,
@@ -125,7 +146,7 @@ def prepare_batch_data(insts,
return_list = [
src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
]

return return_list

def pad_batch_data(insts,
......
@@ -29,11 +29,15 @@ import six
from io import open
from collections import namedtuple

# from . import gpu_dev_count
gpu_dev_count=1

import paddlepalm as palm
import paddlepalm.tokenizer.ernie_tokenizer as tokenization
from paddlepalm.reader.utils.batching4ernie import pad_batch_data
from paddlepalm.reader.utils.mlm_batching import prepare_batch_data

log = logging.getLogger(__name__)

if six.PY3:
@@ -49,20 +53,23 @@ def csv_reader(fd, delimiter='\t'):
return gen()

class Reader(object):
def __init__(self,
vocab_path,
label_map_config=None,
max_seq_len=512,
do_lower_case=True,
in_tokens=False,
is_inference=False,
learning_strategy='pointwise',
random_seed=None,
tokenizer="FullTokenizer",
phase='train',
is_classify=True,
is_regression=False,
for_cn=True,
task_id=0):

assert phase in ['train', 'predict'], "supported phase: train, predict."

self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
@@ -72,7 +79,9 @@ class BaseReader(object):
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.in_tokens = in_tokens
self.phase = phase
self.is_inference = is_inference
self.learning_strategy = learning_strategy
self.for_cn = for_cn
self.task_id = task_id
@@ -83,7 +92,6 @@ class BaseReader(object):
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
self.examples = {}

if label_map_config:
@@ -125,33 +133,41 @@ class BaseReader(object):
else:
tokens_b.pop()

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""

text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None

has_text_b = False
has_text_b_neg = False
if isinstance(example, dict):
has_text_b = "text_b" in example.keys()
has_text_b_neg = "text_b_neg" in example.keys()
else:
has_text_b = "text_b" in example._fields
has_text_b_neg = "text_b_neg" in example._fields

if has_text_b:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

if has_text_b_neg and self.phase == 'train':
tokens_a_neg = tokenizer.tokenize(text_a)
text_b_neg = tokenization.convert_to_unicode(example.text_b_neg)
tokens_b_neg = tokenizer.tokenize(text_b_neg)
self._truncate_seq_pair(tokens_a_neg, tokens_b_neg, max_seq_length - 3)

else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]

# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
@@ -173,6 +189,7 @@ class BaseReader(object):
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
@@ -190,6 +207,29 @@ class BaseReader(object):
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if has_text_b_neg and self.phase == 'train':
tokens_neg = []
text_type_ids_neg = []
tokens_neg.append("[CLS]")
text_type_ids_neg.append(0)
for token in tokens_a_neg:
tokens_neg.append(token)
text_type_ids_neg.append(0)
tokens_neg.append("[SEP]")
text_type_ids_neg.append(0)
if tokens_b_neg:
for token in tokens_b_neg:
tokens_neg.append(token)
text_type_ids_neg.append(1)
tokens_neg.append("[SEP]")
text_type_ids_neg.append(1)
token_ids_neg = tokenizer.convert_tokens_to_ids(tokens_neg)
position_ids_neg = list(range(len(token_ids_neg)))
if self.is_inference:
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids'])
@@ -197,6 +237,23 @@ class BaseReader(object):
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids)
else:
qid = None
if "qid" in example._fields:
qid = example.qid
if self.learning_strategy == 'pairwise' and self.phase == 'train':
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids', 'token_ids_neg', 'text_type_ids_neg', 'position_ids_neg', 'qid'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
token_ids_neg=token_ids_neg,
text_type_ids_neg=text_type_ids_neg,
position_ids_neg=position_ids_neg,
qid=qid)
else:
if self.label_map:
label_id = self.label_map[example.label]
@@ -207,10 +264,6 @@ class BaseReader(object):
'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
])

record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
@@ -240,7 +293,7 @@ class BaseReader(object):
yield self._pad_batch_records(batch_records)
batch_records, max_len = [record], len(record.token_ids)

if phase == 'predict' and batch_records:
yield self._pad_batch_records(batch_records)

def get_num_examples(self, input_file=None, phase='train'):
@@ -283,6 +336,7 @@ class BaseReader(object):
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch

all_dev_batches = []

def f():
for i in wrapper():
@@ -299,71 +353,7 @@ class BaseReader(object):
return f
class MaskLMReader(Reader):

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""
@@ -430,13 +420,6 @@ class MaskLMReader(BaseReader):
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))

return [token_ids, text_type_ids, position_ids]

def batch_reader(self, examples, batch_size, in_tokens, phase):
@@ -455,7 +438,7 @@ class MaskLMReader(BaseReader):
batch = [parsed_line]
total_token_num = len(parsed_line[0])

if len(batch) > 0 and phase == 'predict':
yield batch, total_token_num

def data_generator(self,
@@ -497,19 +480,102 @@ class MaskLMReader(BaseReader):
# max_len=self.max_seq_len,  # note: padding to the max sequence length would make mask_pos mismatch the actual positions, because mask_pos is computed from the max length within the batch.
return_input_mask=True,
return_max_len=False,
return_num_token=False,
# dev_count=gpu_dev_count)
dev_count=1)

# yield batch
for piece in palm.distribute.yield_pieces(batch_data, ['s', 's', 's', 's', 's', 'u', 'u'], batch_size):
yield piece

return wrapper

class ClassifyReader(Reader):
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, 'r', encoding='utf8') as f:
reader = csv_reader(f)
headers = next(reader)
text_indices = [
index for index, h in enumerate(headers) if h != "label"
]
Example = namedtuple('Example', headers)
examples = []
for line in reader:
for index, text in enumerate(line):
if index in text_indices:
if self.for_cn:
line[index] = text.replace(' ', '')
else:
line[index] = text
example = Example(*line)
examples.append(example)
return examples
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
if self.phase=='train' and self.learning_strategy == 'pairwise':
batch_token_ids_neg = [record.token_ids_neg for record in batch_records]
batch_text_type_ids_neg = [record.text_type_ids_neg for record in batch_records]
batch_position_ids_neg = [record.position_ids_neg for record in batch_records]
if not self.is_inference:
if not self.learning_strategy == 'pairwise':
batch_labels = [record.label_id for record in batch_records]
if self.is_classify:
batch_labels = np.array(batch_labels).astype("int64").reshape(
[-1])
elif self.is_regression:
batch_labels = np.array(batch_labels).astype("float32").reshape(
[-1])
if batch_records[0].qid:
batch_qids = [record.qid for record in batch_records]
batch_qids = np.array(batch_qids).astype("int64").reshape(
[-1])
else:
batch_qids = np.array([]).astype("int64").reshape([-1])
# padding
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
padded_task_ids, input_mask
]
if self.phase=='train':
if self.learning_strategy == 'pairwise':
padded_token_ids_neg, input_mask_neg = pad_batch_data(
batch_token_ids_neg, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids_neg = pad_batch_data(
batch_text_type_ids_neg, pad_idx=self.pad_id)
padded_position_ids_neg = pad_batch_data(
batch_position_ids_neg, pad_idx=self.pad_id)
padded_task_ids_neg = np.ones_like(
padded_token_ids_neg, dtype="int64") * self.task_id
return_list += [padded_token_ids_neg, padded_text_type_ids_neg, \
padded_position_ids_neg, padded_task_ids_neg, input_mask_neg]
elif self.learning_strategy == 'pointwise':
return_list += [batch_labels]
return return_list
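Under the new `learning_strategy` switch, a pairwise training batch carries a positive and a negative candidate per query (the `*_neg` fields), while a pointwise batch carries labels instead. The sketch below spells out the two return layouts implied by the `return_list` construction above; the names are for illustration only, since the reader yields plain positional lists rather than dicts:

```python
# Field layouts implied by ClassifyReader._pad_batch_records; purely illustrative.
POINTWISE_TRAIN_FIELDS = [
    'padded_token_ids', 'padded_text_type_ids', 'padded_position_ids',
    'padded_task_ids', 'input_mask',
    'batch_labels',                      # appended when learning_strategy == 'pointwise'
]

PAIRWISE_TRAIN_FIELDS = [
    'padded_token_ids', 'padded_text_type_ids', 'padded_position_ids',
    'padded_task_ids', 'input_mask',
    # negative candidate, padded the same way as the positive one
    'padded_token_ids_neg', 'padded_text_type_ids_neg',
    'padded_position_ids_neg', 'padded_task_ids_neg', 'input_mask_neg',
]

print(len(POINTWISE_TRAIN_FIELDS), len(PAIRWISE_TRAIN_FIELDS))
```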
class SequenceLabelReader(Reader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -550,19 +616,7 @@ class SequenceLabelReader(BaseReader):
ret_labels.append(label)
continue

ret_labels.extend([label] * len(sub_token))

assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
@@ -581,6 +635,9 @@ class SequenceLabelReader(BaseReader):
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
labels = [
label if label in self.label_map else u"O" for label in labels
]
label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
@@ -596,7 +653,7 @@ class SequenceLabelReader(BaseReader):
return record
class ExtractEmbeddingReader(Reader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -623,7 +680,7 @@ class ExtractEmbeddingReader(BaseReader):
return return_list

class MRCReader(Reader):
def __init__(self,
vocab_path,
label_map_config=None,
@@ -888,11 +945,20 @@ class MRCReader(BaseReader):
if to_append:
batch_records.append(record)
else:
# yield self._pad_batch_records(batch_records, phase == "train")
ds = ['s'] * 8
for piece in palm.distribute.yield_pieces(\
self._pad_batch_records(batch_records, phase == 'train'),
ds, batch_size):
yield piece

batch_records, max_len = [record], len(record.token_ids)

if phase == 'predict' and batch_records:
for piece in palm.distribute.yield_pieces(\
self._pad_batch_records(batch_records, phase == 'train'),
ds, batch_size):
yield piece

def _pad_batch_records(self, batch_records, is_training):
batch_token_ids = [record.token_ids for record in batch_records]
@@ -979,12 +1045,8 @@ class MRCReader(BaseReader):
for batch_data in self._prepare_batch_data(
features, batch_size, phase=phase):

yield batch_data

return wrapper
......
@@ -71,10 +71,10 @@ class TaskInstance(object):
self._train_finish = False

# dataset readers for the different running phases (train, eval, predict); keys are phase names, values are Reader instances
self._reader = {'train': None, 'eval': None, 'predict': None}
self._input_layer = None
self._inputname_to_varname = {}
self._task_layer = {'train': None, 'eval': None, 'predict': None}
self._pred_input_name_list = []
self._pred_input_varname_list = []
self._pred_fetch_name_list = []
@@ -90,7 +90,7 @@ class TaskInstance(object):
def build_task_layer(self, net_inputs, phase, scope=""):
output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope)
if phase == 'predict':
if output_vars is not None:
self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items())
else:
......
@@ -3,6 +3,7 @@ import os
import json
import yaml
from config_helper import PDConfig
import logging
from paddle import fluid

def get_basename(f):
......
@@ -16,6 +16,7 @@
import os
import sys
import random
import logging
import numpy as np
import paddle
from paddle import fluid
......