Commit 6e5a2af4 authored by wangxiao1021

add examples

Parent fac8802f
*.pyc
__pycache__
pretrain_model
pretrain
output*
output_model
build
dist
......
This diff has been collapsed.
This diff has been collapsed.
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""v1.1
BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self, config, phase):
# self._is_training = phase == 'train'  # the backbone generally does not need to care about the running phase, since its outputs stay essentially the same in every phase
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
self._sent_types = config["type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
# Initialize all weights with a truncated normal initializer; all biases
# are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config["initializer_range"])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
self._emb_dtype = 'float32'
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
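# build an additive attention bias from the padding mask: input_mask is
# [batch, seq_len, 1] with 1.0 at real tokens, so matmul(input_mask,
# input_mask^T) is 1 exactly where both positions are real; (x - 1) * 10000
# then maps valid pairs to 0 and pairs involving padding to -10000, pushing
# their softmax attention weights towards 0 inside the encoder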
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self,
config,
phase):
# self._is_training = phase == 'train'  # the backbone generally does not need to care about the running phase, since its outputs stay essentially the same in every phase
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config['sent_type_vocab_size']:
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1,-1, 1], 'int64']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
task_ids = inputs['task_ids']
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper as LayerHelper
from functools import reduce # py3
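# Hand-rolled layer normalization: normalize over the trailing dimensions
# starting at begin_norm_axis (mean subtraction and rsqrt of the variance),
# then apply a learnable per-feature scale and bias created via LayerHelper.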
def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
r_stdev = layers.rsqrt(variance + epsilon)
norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
param_dtype = norm_x.dtype
scale = helper.create_parameter(
attr=param_attr,
shape=param_shape,
dtype=param_dtype,
default_initializer=fluid.initializer.Constant(1.))
bias = helper.create_parameter(
attr=bias_attr,
shape=param_shape,
dtype=param_dtype,
is_bias=True,
default_initializer=fluid.initializer.Constant(0.))
out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
out = layers.elementwise_add(x=out, y=bias, axis=-1)
return out
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logits before
computing the softmax activation to mask out selected positions so that
they are not considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of the input tensor x into two dimensions and
then transpose. Specifically, given an input tensor with shape
[bs, max_sequence_length, n_head * hidden_dim], the output is a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of the input tensor x
into one dimension, which is the inverse of __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
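# The process_cmd string selects which steps run and in which order; the
# backbones above use 'nd' (layer norm, then dropout) on the embeddings before
# the encoder stack and 'dan' (dropout, residual add, layer norm) after every
# encoder sub-layer.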
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self-)attention sub-layer followed by
a position-wise feed-forward network, both wrapped with post_process_layer
to add residual connection, layer normalization and dropout.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
task_instance: "mrqa"
save_path: "output_model/firstrun"
backbone: "bert"
backbone_config_path: "../../pretrain_model/bert/bert_config.json"
batch_size: 4
num_epochs: 2
optimizer: "adam"
learning_rate: 3e-5
warmup_proportion: 0.1
weight_decay: 0.1
print_every_n_steps: 10
train_file: data/mrqa/train.json
reader: mrc
paradigm: mrc
vocab_path: "../../pretrain_model/bert/vocab.txt"
do_lower_case: True
max_seq_len: 512
doc_stride: 128
max_query_len: 64
import paddlepalm as palm
if __name__ == '__main__':
controller = palm.Controller('config.yaml')
controller.load_pretrain('../../pretrain_model/bert/params')
controller.train()
export CUDA_VISIBLE_DEVICES=0
python run.py
This diff has been collapsed.
This diff has been collapsed.
import paddlepalm as palm
if __name__ == '__main__':
max_seqlen = 512
batch_size = 32
match_reader = palm.reader.match(train_file, vocab, \
max_seqlen, file_format='csv', tokenizer='wordpiece', \
lang='en', shuffle_train=True)
mrc_reader = palm.reader.mrc(train_file, phase='train')
mlm_reader = palm.reader.mlm(train_file, phase='train')
palm.reader.
match = palm.tasktype.cls(num_classes=4)
mrc = palm.tasktype.match(learning_strategy='pairwise')
mlm = palm.tasktype.mlm()
mlm.print()
bb_flags = palm.load_json('./pretrain/ernie/ernie_config.json')
bb = palm.backbone.ernie(bb_flags['xx'], xxx)
bb.print()
match4mrqa = palm.Task('match4mrqa', match_reader, match_tt)
mrc4mrqa = palm.Task('mrc4mrqa', mrc_reader, mrc)
# match4mrqa.reuse_with(mrc4mrqa)
controller = palm.Controller([mrqa, match4mrqa, mlm4mrqa])
loss = controller.build_forward(bb, mask_task=[])
n_steps = controller.estimate_train_steps(basetask=mrqa, num_epochs=2, batch_size=8, dev_count=4)
adam = palm.optimizer.Adam(loss)
sched = palm.schedualer.LinearWarmup(learning_rate, max_train_steps=n_steps, warmup_steps=0.1*n_steps)
controller.build_backward(optimizer=adam, schedualer=sched, weight_decay=0.001, use_ema=True, ema_decay=0.999)
controller.random_init_params()
controller.load_pretrain('../../pretrain_model/ernie/params')
controller.train()
# controller = palm.Controller(config='config.yaml', task_dir='tasks', for_train=False)
# controller.pred('mrqa', inference_model_dir='output_model/secondrun/mrqa/infer_model')
train_file: "data/match4mrqa/train.tsv"
reader: match
paradigm: match
train_file: "data/mlm4mrqa/train.tsv"
reader: mlm
paradigm: mlm
train_file: data/mrqa/train.json
pred_file: data/mrqa/dev.json
pred_output_path: 'mrqa_output'
reader: mrc
paradigm: mrc
doc_stride: 128
max_query_len: 64
max_answer_len: 30
n_best_size: 20
../../pretrain/
\ No newline at end of file
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
## Examples 1: Classification
This example is a sentiment analysis task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `chnsenticorp` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two fields, `label` and `text_a`, in TSV format. Here are a few sample rows (a quick sanity check follows the sample):
```
label text_a
0 当当网名不符实,订货多日不见送货,询问客服只会推托,只会要求用户再下订单。如此服务留不住顾客的。去别的网站买书服务更好。
0 XP的驱动不好找!我的17号提的货,现在就降价了100元,而且还送杀毒软件!
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
```
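Before training, it can help to confirm the two-column layout and label balance. The sketch below is illustrative only and assumes the file lives at `./data/train.tsv`, as produced by the download step:
```python
# Count the label distribution of the downloaded chnsenticorp training set.
# The path is an assumption based on the download step above.
from collections import Counter

counts = Counter()
with open('./data/train.tsv', 'r', encoding='utf-8') as f:
    next(f)  # skip the "label\ttext_a" header
    for line in f:
        label, _, _text = line.rstrip('\n').partition('\t')
        counts[label] += 1
print(counts)
```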
### Step 2: Train & Predict
The code used to perform this classification task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/154 (epoch 0), loss: 5.512, speed: 0.51 steps/s
step 2/154 (epoch 0), loss: 2.595, speed: 3.36 steps/s
step 3/154 (epoch 0), loss: 1.798, speed: 3.48 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.2014336884021759, 0.6799028515815735], "probs": [0.29290086030960083, 0.7070990800857544], "label": 1}
{"index": 1, "logits": [0.8593899011611938, -0.29743513464927673], "probs": [0.7607553601264954, 0.23924466967582703], "label": 0}
{"index": 2, "logits": [0.7462944388389587, -0.7083730101585388], "probs": [0.8107157349586487, 0.18928426504135132], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
downlaod_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(downlaod_path, download_url)
tar = tarfile.open(downlaod_path)
tar.extractall(target_dir)
os.remove(downlaod_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[0]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 8
num_epochs = 10
lr = 5e-5
weight_decay = 0.01
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
num_classes = 2
dropout_prob = 0.1
random_seed = 1
task_name = 'chnsenticorp'
save_path = './outputs/'
pred_output = './outputs/predict/'
save_type = 'ckpt'
print_steps = 20
pre_params = './pretrain/ernie-zh-base/params'
# ----------------------- for training -----------------------
# step 1-1: create readers for training
cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed)
# step 1-2: load the training data
cls_reader.load_data(train_file, batch_size, num_epochs=num_epochs)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
cls_reader.register_with(ernie)
# step 4: create the task output head
cls_head = palm.head.Classify(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, cls_head)
# step 6-1*: use warmup
n_steps = cls_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(cls_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
# save_steps = n_steps // gpu_dev_count - batch_size
save_steps = 2396
trainer.set_saver(save_steps=save_steps, save_path=save_path, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
# step 1-2: load the training data
predict_cls_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_cls_reader.register_with(pred_ernie)
# step 4: create the task output head
cls_pred_head = palm.head.Classify(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, cls_pred_head)
# step 6: load pretrained model
# model_path = './outputs/ckpt.step'+str(save_steps)
model_path = './outputs/ckpt.step'+str(11980)
pred_ckpt = trainer.load_ckpt(model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_cls_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 2: Matching
This example is a sentence pair matching task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-en-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `Quora Question Pairs matching` dataset.
Download dataset:
```shell
python download.py
```
After the dataset is downloaded, you should convert the data format for training:
```shell
python process.py quora_duplicate_questions.tsv train.tsv test.tsv
```
If everything goes well, a folder named `data/` will be created with all the converted data in it.
The data has three fields, `text_a`, `text_b` and `label`, in TSV format. Here are a few sample rows (a quick sanity check follows the sample):
```
text_a text_b label
How can the arrangement of corynebacterium xerosis be described? How would you describe waves? 0
How do you fix a Google Play Store account that isn't working? What can cause the Google Play store to not open? How are such probelms fixed? 1
Which is the best earphone under 1000? What are the best earphones under 1k? 1
What are the differences between the Dell Inspiron 3000, 5000, and 7000 series laptops? "Should I buy an Apple MacBook Pro 15"" or a Dell Inspiron 17 5000 series?" 0
```
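To verify the conversion, the sketch below (illustrative only; adjust the paths to wherever `process.py` wrote the files) checks that every row has exactly three tab-separated fields:
```python
# Sanity-check the converted Quora data: every row should contain exactly
# text_a, text_b and label separated by tabs. The paths are assumptions.
for path in ('./data/train.tsv', './data/test.tsv'):
    bad = 0
    with open(path, 'r', encoding='utf-8') as f:
        next(f)  # skip the "text_a\ttext_b\tlabel" header
        for line in f:
            if len(line.rstrip('\n').split('\t')) != 3:
                bad += 1
    print('{}: {} malformed line(s)'.format(path, bad))
```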
### Step 2: Train & Predict
The code used to perform this matching task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 20/49087 (epoch 0), loss: 1.079, speed: 3.48 steps/s
step 40/49087 (epoch 0), loss: 1.251, speed: 5.18 steps/s
step 60/49087 (epoch 0), loss: 1.193, speed: 5.04 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.32688724994659424, -0.8568955063819885], "probs": [0.629485011100769, 0.3705149292945862], "label": 0}
{"index": 1, "logits": [-0.2735646963119507, -0.7983021140098572], "probs": [0.6282548904418945, 0.37174513936042786], "label": 0}
{"index": 2, "logits": [-0.3381381630897522, -0.8614270091056824], "probs": [0.6279165148735046, 0.37208351492881775], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.857906976744, recall: 0.824249846908, f1: 0.81501664653
```
# -*- coding: utf-8 -*-
import os
import requests
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
data_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
os.makedirs(data_dir)
download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv")
download(downlaod_path, download_url)
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[2][:-1]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
file.close()
assert len(labels) == len(preds), "prediction result({}) doesn't match to labels({})".format(len(preds),len(labels))
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# -*- coding: utf-8 -*-
import sys
import os
if len(sys.argv) != 4:
exit(0)
data_dir = sys.argv[1]
if not os.path.exists(data_dir):
print("%s not exists" % data_dir)
exit(0)
train_dir = sys.argv[2]
train_file = open(train_dir, "w")
train_file.write("text_a\ttext_b\tlabel\n")
test_dir = sys.argv[3]
test_file = open(test_dir, "w")
test_file.write("text_a\ttext_b\tlabel\n")
with open(data_dir, "r") as file:
before = ""
cnt = 0
for line in file:
line = line.strip("\n")
line_t = line.split("\t")
flag = 0
if len(line_t) < 6:
if flag:
flag = 0
out_line = "{}{}\n".format(out_line, line)
else:
flag = 1
out_line = "{}".format(line)
continue
else:
out_line = "{}\t{}\t{}\n".format(line_t[3], line_t[4], line_t[5])
cnt += 1
if 2 <= cnt <= 4301:
test_file.write(out_line)
if 4301 <= cnt <= 104301:
train_file.write(out_line)
train_file.close()
test_file.close()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 128
batch_size = 16
num_epochs = 3
lr = 3e-5
weight_decay = 0.0
num_classes = 2
random_seed = 1
dropout_prob = 0.1
save_path = './outputs/'
save_type = 'ckpt'
pred_model_path = './outputs/ckpt.step'+str(18732)
print_steps = 50
pred_output = './outputs/predict/'
pre_params = './pretrain/ernie-en-base/params'
task_name = 'Quora Question Pairs matching'
vocab_path = './pretrain/ernie-en-base/vocab.txt'
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
config = json.load(open('./pretrain/ernie-en-base/ernie_config.json'))
input_dim = config['hidden_size']
# ----------------------- for training -----------------------
# step 1-1: create readers for training
match_reader = palm.reader.MatchReader(vocab_path, max_seqlen, seed=random_seed)
# step 1-2: load the training data
match_reader.load_data(train_file, file_format='tsv', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
match_reader.register_with(ernie)
# step 4: create the task output head
match_head = palm.head.Match(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, match_head)
# step 6-1*: use warmup
n_steps = match_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
print('total_steps: {}'.format(n_steps))
print('warmup_steps: {}'.format(warmup_steps))
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(match_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params, False)
# step 8-2*: set saver to save model
# save_steps = (n_steps-16) // gpu_dev_count
save_steps = 6244
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_match_reader = palm.reader.MatchReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
# step 1-2: load the training data
predict_match_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_match_reader.register_with(pred_ernie)
# step 4: create the task output head
match_pred_head = palm.head.Match(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, match_pred_head)
# step 6: load pretrained model
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_match_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 4: Machine Reading Comprehension
This example is a machine reading comprehension task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `CMRC2018` dataset. `CMRC2018` is a span-extraction machine reading comprehension evaluation organized by the Chinese Information Processing Society of China; the goal is to extract the answer span for each question from the given passage.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
Here is a sample record (a small sketch for walking the JSON follows the sample):
```json
"paragraphs": [
{
"id": "TRAIN_36",
"context": "NGC 6231是一个位于天蝎座的疏散星团,天球座标为赤经16时54分,赤纬-41度48分,视觉观测大小约45角分,亮度约2.6视星等,距地球5900光年。NGC 6231年龄约为三百二十万年,是一个非常年轻的星团,星团内的最亮星是5等的天蝎座 ζ1星。用双筒望远镜或小型望远镜就能看到个别的行星。NGC 6231在1654年被意大利天文学家乔瓦尼·巴蒂斯特·霍迪尔纳(Giovanni Battista Hodierna)以Luminosae的名字首次纪录在星表中,但是未见记载于夏尔·梅西耶的天体列表和威廉·赫歇尔的深空天体目录。这个天体在1678年被爱德蒙·哈雷(I.7)、1745年被夏西亚科斯(Jean-Phillippe Loys de Cheseaux)(9)、1751年被尼可拉·路易·拉卡伊(II.13)分别再次独立发现。",
"qas": [
{
"question": "NGC 6231的经纬度是多少?",
"id": "TRAIN_36_QUERY_0",
"answers": [
{
"text": "赤经16时54分,赤纬-41度48分",
"answer_start": 27
}
]
}
```
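The records are nested in the SQuAD style (`data` → `paragraphs` → `qas` → `answers`), which the MRC reader used below consumes directly. If you want to walk the file yourself, a minimal sketch (assuming the data was downloaded to `./data/` as above):
```python
# Walk the SQuAD-style CMRC2018 JSON and count paragraphs and questions.
# The path is an assumption based on the download step above.
import json

with open('./data/train.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

n_paragraphs = n_questions = 0
for article in dataset['data']:
    for paragraph in article['paragraphs']:
        n_paragraphs += 1
        n_questions += len(paragraph['qas'])
print('paragraphs: {}, questions: {}'.format(n_paragraphs, n_questions))
```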
### Step 2: Train & Predict
The code used to perform this machine reading comprehension task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/1515 (epoch 0), loss: 6.251, speed: 0.31 steps/s
step 2/1515 (epoch 0), loss: 6.206, speed: 0.80 steps/s
step 3/1515 (epoch 0), loss: 6.172, speed: 0.86 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```json
{
"DEV_0_QUERY_0": "光 荣 和 ω-force 开 发",
"DEV_0_QUERY_1": "任 天 堂 游 戏 谜 之 村 雨 城",
"DEV_0_QUERY_2": "战 史 演 武 」&「 争 霸 演 武 」。",
"DEV_1_QUERY_0": "大 陆 传 统 器 乐 及 戏 曲 里 面 常 用 的 打 击 乐 记 谱 方 法 , 以 中 文 字 的 声 音 模 拟 敲 击 乐 的 声 音 , 纪 录 打 击 乐 的 各 种 不 同 的 演 奏 方 法 。",
"DEV_1_QUERY_1": "「 锣 鼓 点",
"DEV_1_QUERY_2": "锣 鼓 的 运 用 有 约 定 俗 成 的 程 式 , 依 照 角 色 行 当 的 身 份 、 性 格 、 情 绪 以 及 环 境 , 配 合 相 应 的 锣 鼓 点",
"DEV_1_QUERY_3": "鼓 、 锣 、 钹 和 板 四 类 型",
"DEV_2_QUERY_0": "364.6 公 里",
}
```
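Note that the predicted answers are stored with spaces between characters and word pieces. To read them as plain text, a minimal sketch (the predictions path is an assumption; point it at the `predictions.json` produced by `run.py`):
```python
# Strip the separator spaces from the predicted answers to recover plain
# Chinese text. The file path is an assumption; adjust it to your output dir.
import json

with open('./outputs/predictions.json', 'r', encoding='utf-8') as f:
    predictions = json.load(f)

for query_id, answer in list(predictions.items())[:5]:
    print(query_id, answer.replace(' ', ''))
```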
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
data_num: 3219
em_sroce: 0.963031997515, f1: 83.9865402973
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
downlaod_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(downlaod_path, download_url)
tar = tarfile.open(downlaod_path)
tar.extractall(target_dir)
os.remove(downlaod_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')):
shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Evaluation script for CMRC 2018
version: v5
Note:
v5 formatted output, add usage description
v4 fixed segmentation issues
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from collections import Counter, OrderedDict
import string
import re
import argparse
import json
import sys
import nltk
import pdb
# split Chinese with English
def mixed_segmentation(in_str, rm_punc=False):
in_str = in_str.lower().strip()
segs_out = []
temp_str = ""
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
for char in in_str:
if rm_punc and char in sp_char:
continue
if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
temp_str = ""
segs_out.append(char)
else:
temp_str += char
#handling last part
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
return segs_out
# remove punctuation
def remove_punctuation(in_str):
in_str = in_str.lower().strip()
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
out_segs = []
for char in in_str:
if char in sp_char:
continue
else:
out_segs.append(char)
return ''.join(out_segs)
# find longest common string
def find_lcs(s1, s2):
m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
mmax = 0
p = 0
for i in range(len(s1)):
for j in range(len(s2)):
if s1[i] == s2[j]:
m[i + 1][j + 1] = m[i][j] + 1
if m[i + 1][j + 1] > mmax:
mmax = m[i + 1][j + 1]
p = i + 1
return s1[p - mmax:p], mmax
#
def evaluate(ground_truth_file, prediction_file):
f1 = 0
em = 0
total_count = 0
skip_count = 0
for instances in ground_truth_file["data"]:
for instance in instances["paragraphs"]:
context_text = instance['context'].strip()
for qas in instance['qas']:
total_count += 1
query_id = qas['id'].strip()
query_text = qas['question'].strip()
answers = [ans["text"] for ans in qas["answers"]]
if query_id not in prediction_file:
print('Unanswered question: {}\n'.format(
query_id))
skip_count += 1
continue
prediction = prediction_file[query_id]
f1 += calc_f1_score(answers, prediction)
em += calc_em_score(answers, prediction)
f1_score = 100.0 * f1 / total_count
em_score = 100.0 * em / total_count
return f1_score, em_score, total_count, skip_count
def calc_f1_score(answers, prediction):
f1_scores = []
for ans in answers:
ans_segs = mixed_segmentation(ans, rm_punc=True)
prediction_segs = mixed_segmentation(prediction, rm_punc=True)
lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
if lcs_len == 0:
f1_scores.append(0)
continue
precision = 1.0 * lcs_len / len(prediction_segs)
recall = 1.0 * lcs_len / len(ans_segs)
f1 = (2 * precision * recall) / (precision + recall)
f1_scores.append(f1)
return max(f1_scores)
def calc_em_score(answers, prediction):
em = 0
for ans in answers:
ans_ = remove_punctuation(ans)
prediction_ = remove_punctuation(prediction)
if ans_ == prediction_:
em = 1
break
return em
def eval_file(dataset_file, prediction_file):
ground_truth_file = json.load(open(dataset_file, 'r'))
prediction_file = json.load(open(prediction_file, 'r'))
F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file)
AVG = (EM + F1) * 0.5
return EM, F1, AVG, TOTAL
if __name__ == '__main__':
EM, F1, AVG, TOTAL = eval_file("task_data/cmrc2018/dev.json", "predictions.json")
print(EM)
print(F1)
print(TOTAL)
\ No newline at end of file
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 512
batch_size = 8
num_epochs = 8
lr = 3e-5
doc_stride = 128
max_query_len = 64
max_ans_len = 128
weight_decay = 0.01
print_steps = 20
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
do_lower_case = True
train_file = './data/train.json'
predict_file = './data/dev.json'
save_path = './outputs/'
pred_output = './outputs/predict/'
save_type = 'ckpt'
task_name = 'cmrc2018'
pre_params = './pretrain/ernie-zh-base/params'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
# ----------------------- for training -----------------------
# step 1-1: create readers for training
mrc_reader = palm.reader.MRCReader(vocab_path, max_seqlen, max_query_len, doc_stride, do_lower_case=do_lower_case)
# step 1-2: load the training data
mrc_reader.load_data(train_file, file_format='json', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
mrc_reader.register_with(ernie)
# step 4: create the task output head
mrc_head = palm.head.MRC(max_query_len, config['hidden_size'], do_lower_case=do_lower_case, max_ans_len=max_ans_len)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, mrc_head)
# step 6-1*: use warmup
n_steps = mrc_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(mrc_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
# save_steps = (n_steps-8) // gpu_dev_count // 4
save_steps = 1520
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
predict_mrc_reader = palm.reader.MRCReader(vocab_path, max_seqlen, max_query_len, doc_stride, do_lower_case=do_lower_case, phase='predict')
# step 1-2: load the training data
predict_mrc_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_mrc_reader.register_with(pred_ernie)
# step 4: create the task output head
mrc_pred_head = palm.head.MRC(max_query_len, config['hidden_size'], do_lower_case=do_lower_case, max_ans_len=max_ans_len, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, mrc_pred_head)
# step 6: load pretrained model
pred_model_path = './outputs/ckpt.step'+str(12160)
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_mrc_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir="outputs/")
## Examples 5: Predict (Classification)
This example runs prediction for a sentiment analysis task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `chnsenticorp` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two fields, `label` and `text_a`, in TSV format. Here are a few sample rows:
```
label text_a
0 当当网名不符实,订货多日不见送货,询问客服只会推托,只会要求用户再下订单。如此服务留不住顾客的。去别的网站买书服务更好。
0 XP的驱动不好找!我的17号提的货,现在就降价了100元,而且还送杀毒软件!
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
```
### Step 2: Predict
The code used to run prediction for the classification task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run prediction on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/154, speed: 0.51 steps/s
step 2/154, speed: 3.36 steps/s
step 3/154, speed: 3.48 steps/s
```
After the run, you can view the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.2014336884021759, 0.6799028515815735], "probs": [0.29290086030960083, 0.7070990800857544], "label": 1}
{"index": 1, "logits": [0.8593899011611938, -0.29743513464927673], "probs": [0.7607553601264954, 0.23924466967582703], "label": 0}
{"index": 2, "logits": [0.7462944388389587, -0.7083730101585388], "probs": [0.8107157349586487, 0.18928426504135132], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows: (need to update)
```
precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
download_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(download_path, download_url)
tar = tarfile.open(download_path)
tar.extractall(target_dir)
tar.close()
os.remove(download_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
    p = tp * 1.0 / (tp + fp + 1e-8)
    r = tp * 1.0 / (tp + fn + 1e-8)
    f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
    re = tp * 1.0 / (tp + fn + 1e-8)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
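    # res_dir: line-delimited predictions.json written by run.py (one JSON object per line);
    # eval_phase: which gold-label tsv to compare against ('test' or 'dev')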
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[0]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
    assert len(labels) == len(preds), "number of predictions doesn't match the number of labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 8
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
predict_file = './data/test.tsv'
random_seed = 1
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
num_classes = 2
task_name = 'chnsenticorp'
pred_output = './outputs/predict/'
print_steps = 20
pre_params = './pretrain/ernie-zh-base/params'
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
    # step 1-2: load the data for prediction
predict_cls_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_cls_reader.register_with(pred_ernie)
# step 4: create the task output head
cls_pred_head = palm.head.Classify(num_classes, input_dim, phase='predict')
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, cls_pred_head)
# step 6: load pretrained model
pred_model = trainer.load_ckpt(pre_params)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_cls_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 3: Tagging
This task is a named entity recognition task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used for this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model into the current folder.
#### Dataset
This task uses the `MSRA-NER(SIGHAN2006)` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two tab-separated fields, `text_a` and `label`, in tsv format. Here are some example rows:
```
text_a label
在 这 里 恕 弟 不 恭 之 罪 , 敢 在 尊 前 一 诤 : 前 人 论 书 , 每 曰 “ 字 字 有 来 历 , 笔 笔 有 出 处 ” , 细 读 公 字 , 何 尝 跳 出 前 人 藩 篱 , 自 隶 变 而 后 , 直 至 明 季 , 兄 有 何 新 出 ? O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
相 比 之 下 , 青 岛 海 牛 队 和 广 州 松 日 队 的 雨 中 之 战 虽 然 也 是 0 ∶ 0 , 但 乏 善 可 陈 。 O O O O O B-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG I-ORG I-ORG I-ORG I-ORG O O O O O O O O O O O O O O O O O O O
理 由 多 多 , 最 无 奈 的 却 是 : 5 月 恰 逢 双 重 考 试 , 她 攻 读 的 博 士 学 位 论 文 要 通 考 ; 她 任 教 的 两 所 学 校 , 也 要 在 这 段 时 日 大 考 。 O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
```
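To check that the file is read the way the evaluation script expects, the sketch below (a minimal example, not part of PALM) pairs each character with its tag; it assumes the separator inside both fields is the `\x02` character used by `evaluate.py`, which renders as a space in the example above:
```python
# peek at the first training example in data/train.tsv
# (the path and the '\x02' separator are assumptions based on download.py and evaluate.py)
with open('./data/train.tsv', encoding='utf-8') as f:
    next(f)  # skip the "text_a\tlabel" header
    line = next(f).rstrip('\n')
    text_a, label = line.split('\t')
    tokens, tags = text_a.split('\x02'), label.split('\x02')
    print(list(zip(tokens, tags))[:5])
```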
### Step 2: Train & Predict
The code used to perform the sequence labeling task is in `run.py`. If you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
If you want to train on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Some logs will be shown below:
```
step 1/652 (epoch 0), loss: 216.002, speed: 0.32 steps/s
step 2/652 (epoch 0), loss: 202.567, speed: 1.28 steps/s
step 3/652 (epoch 0), loss: 170.677, speed: 1.05 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 6, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
```
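Each line above is the predicted label-id sequence for one example. A minimal sketch (not part of PALM) for turning the ids back into tag names, assuming `data/label_map.json` maps tag names to these ids as `evaluate.py` does:
```python
# map predicted ids back to tag strings via the inverted label map
import json

label_map = json.load(open('./data/label_map.json'))  # tag -> id
id2tag = {v: k for k, v in label_map.items()}          # id -> tag

with open('./outputs/predict/predictions.json', encoding='utf-8') as f:
    ids = json.loads(next(f))  # first predicted sequence
    # the 'O' fallback is only a safeguard for ids missing from the map
    print([id2tag.get(i, 'O') for i in ids[:10]])
```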
### Step 3: Evaluate
Once you have the predictions, you can run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.948718989809, recall: 0.944806113784, f1: 0.946758508914
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
download_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(download_path, download_url)
tar = tarfile.open(download_path)
tar.extractall(target_dir)
tar.close()
os.remove(download_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')):
shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
def load_label_map(map_dir="./data/label_map.json"):
"""
    :param map_dir: path to the json file that maps chunk tags to integer ids
    :return: the loaded tag-to-id dict
"""
return json.load(open(map_dir, "r"))
def cal_chunk(total_res, total_label):
    assert len(total_label) == len(total_res), "number of predictions doesn't match the number of labels"
num_labels = 0
num_corr = 0
num_infers = 0
for res, label in zip(total_res, total_label):
        assert len(res) == len(label), "prediction length doesn't match label length"
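        # label id 6 corresponds to the 'O' (non-entity) tag, which is excluded from both counts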
num_labels += sum([0 if i == 6 else 1 for i in label])
num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
num_infers += sum([0 if i == 6 else 1 for i in res])
precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
return precision, recall, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
label_map = load_label_map()
total_label = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
if first_flag:
first_flag = False
continue
line = line.strip("\n")
if len(line) == 0:
continue
line = line.split("\t")
if len(line) < 2:
continue
labels = line[1].split("\x02")
total_label.append(labels)
total_label = [[label_map[j] for j in i] for i in total_label]
total_res = []
with open(res_dir, "r") as file:
cnt = 0
for line in file:
line = line.strip("\n")
if len(line) == 0:
continue
try:
res_arr = json.loads(line)
if len(total_label[cnt]) < len(res_arr):
total_res.append(res_arr[1: 1 + len(total_label[cnt])])
elif len(total_label[cnt]) == len(res_arr):
total_res.append(res_arr)
else:
total_res.append(res_arr)
total_label[cnt] = total_label[cnt][: len(res_arr)]
except:
print("json format error: {}".format(cnt))
print(line)
cnt += 1
precision, recall, f1 = cal_chunk(total_res, total_label)
print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 16
num_epochs = 6
lr = 5e-5
num_classes = 7
weight_decay = 0.01
dropout_prob = 0.1
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
label_map = './data/label_map.json'
random_seed = 1
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
save_path='./outputs/'
save_type='ckpt'
pre_params = './pretrain/ernie-zh-base/params'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
task_name = 'msra_ner'
pred_output = './outputs/predict/'
train_print_steps = 10
pred_print_steps = 20
# ----------------------- for training -----------------------
# step 1-1: create readers for training
ner_reader = palm.reader.SequenceLabelReader(vocab_path, max_seqlen, label_map, seed=random_seed)
# step 1-2: load the training data
ner_reader.load_data(train_file, file_format='tsv', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
ner_reader.register_with(ernie)
# step 4: create the task output head
ner_head = palm.head.SequenceLabel(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, ner_head)
# step 6-1*: use warmup
n_steps = ner_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
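    # linear warmup over the first ~10% of steps, then decay (slanted triangular schedule)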
print('total_steps: {}'.format(n_steps))
print('warmup_steps: {}'.format(warmup_steps))
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
    # step 6-2: create an optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(ner_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
    save_steps = (n_steps - 20) // gpu_dev_count
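    # save a single checkpoint shortly before training ends; dividing by gpu_dev_count accounts for multi-GPU runs taking fewer steps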
print('save_steps: {}'.format(save_steps))
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=train_print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_ner_reader = palm.reader.SequenceLabelReader(vocab_path, max_seqlen, label_map, phase='predict')
    # step 1-2: load the data for prediction
predict_ner_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_ner_reader.register_with(pred_ernie)
# step 4: create the task output head
ner_pred_head = palm.head.SequenceLabel(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, ner_pred_head)
# step 6: load pretrained model
pred_model_path = './outputs/ckpt.step' + str(save_steps)
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_ner_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=pred_print_steps, output_dir=pred_output)
@@ -9,6 +9,7 @@ import head
 from trainer import Trainer
+from multihead_trainer import MultiHeadTrainer
 del interface
 del task_instance
......
from conf_controller import ConfigController
from controller import Controller
@@ -5,5 +5,5 @@ import multiprocessing
 gpu_dev_count = int(fluid.core.get_cuda_device_count())
 cpu_dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-from reader import yield_pieces, data_feeder
+from reader import yield_pieces, data_feeder, decode_fake
from _downloader import *
from slanted_triangular_schedualer import TriangularSchedualer
from warmup_schedualer import WarmupSchedualer
# scheduled_lr = fluid.layers.learning_rate_scheduler\
#            .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
#            warmup_steps)