Merge branch 'r0.3-api' of https://github.com/PaddlePaddle/PALM into r0.3-api

fac8802f · wangxiao1021 · bc504176 · d44b6381 · fac8802f · fac8802f
73 changed files
--- a/paddlepalm/task_paradigm/__init__.py
+++ b/paddlepalm/task_paradigm/__init__.py
--- a/backbone/__init__.py
+++ b/backbone/__init__.py
--- a/backbone/bert.py
+++ b/backbone/bert.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""v1.1 
+BERT model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import fluid
+from paddle.fluid import layers
+
+from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
+from paddlepalm.interface import backbone
+
+    
+class Model(backbone):
+    """BERT backbone.
+
+    Builds the word/position/segment embeddings, the stacked transformer
+    encoder and a tanh-pooled feature of the first sequence position from the
+    id and mask inputs declared in ``inputs_attr``.
+    """
+    
+    def __init__(self, config, phase):
+        """Read the model hyper-parameters from ``config``.
+
+        Args:
+            config: object indexed by string key. The keys read here are
+                hidden_size, num_hidden_layers, num_attention_heads,
+                vocab_size, max_position_embeddings, type_vocab_size,
+                hidden_act, hidden_dropout_prob,
+                attention_probs_dropout_prob and initializer_range.
+            phase: accepted but not used by this constructor.
+        """
+
+        # self._is_training = phase == 'train' # A backbone generally does not need to care about the run phase, because its outputs basically do not change between phases.
+        self._emb_size = config["hidden_size"]
+        self._n_layer = config["num_hidden_layers"]
+        self._n_head = config["num_attention_heads"]
+        self._voc_size = config["vocab_size"]
+        self._max_position_seq_len = config["max_position_embeddings"]
+        self._sent_types = config["type_vocab_size"]
+        self._hidden_act = config["hidden_act"]
+        self._prepostprocess_dropout = config["hidden_dropout_prob"]
+        self._attention_dropout = config["attention_probs_dropout_prob"]
+
+        # Base names of the embedding parameters; build() prepends scope_name.
+        self._word_emb_name = "word_embedding"
+        self._pos_emb_name = "pos_embedding"
+        self._sent_emb_name = "sent_embedding"
+
+        # Initialize all weights by truncated normal initializer, and all biases
+        # will be initialized by constant zero by default.
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=config["initializer_range"])
+
+    @property
+    def inputs_attr(self):
+        """Return a dict mapping each required input name to ``[shape, dtype]``."""
+        return {"token_ids": [[-1, -1, 1], 'int64'],
+                "position_ids": [[-1, -1, 1], 'int64'],
+                "segment_ids": [[-1, -1, 1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32']}
+
+    @property
+    def outputs_attr(self):
+        """Return a dict mapping each output name to ``[shape, dtype]``."""
+        # NOTE(review): "embedding_table" is declared with a leading -1 dim,
+        # while build() creates the word-embedding parameter with
+        # size=[voc_size, emb_size] -- confirm the declared rank is intended.
+        return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
+                "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
+                "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
+                "sentence_embedding": [[-1, self._emb_size], 'float32'],
+                "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
+
+    def build(self, inputs, scope_name=""):
+        """Add the backbone ops to the default main program.
+
+        Args:
+            inputs: dict providing 'token_ids', 'position_ids', 'segment_ids'
+                and 'input_mask'.
+            scope_name: string prepended to the parameter and layer names
+                created here (embeddings, 'pre_encoder', 'encoder',
+                'pooled_fc.*').
+
+        Returns:
+            dict with keys 'embedding_table', 'word_embedding',
+            'encoder_outputs', 'sentence_embedding' and
+            'sentence_pair_embedding'. The last two refer to the same
+            variable.
+        """
+        src_ids = inputs['token_ids']
+        pos_ids = inputs['position_ids']
+        sent_ids = inputs['segment_ids']
+        input_mask = inputs['input_mask']
+
+        self._emb_dtype = 'float32'
+        # padding id in vocabulary must be set to 0
+        emb_out = fluid.layers.embedding(
+            input=src_ids,
+            size=[self._voc_size, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._word_emb_name, initializer=self._param_initializer),
+            is_sparse=False)
+
+        # Look up the word-embedding parameter by name so it can be returned
+        # to callers alongside the activations.
+        # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
+        embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
+        
+        position_emb_out = fluid.layers.embedding(
+            input=pos_ids,
+            size=[self._max_position_seq_len, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
+
+        sent_emb_out = fluid.layers.embedding(
+            sent_ids,
+            size=[self._sent_types, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
+
+        # Sum the word, position and segment embeddings.
+        emb_out = emb_out + position_emb_out
+        emb_out = emb_out + sent_emb_out
+
+        # 'nd' = layer normalization followed by dropout (see
+        # pre_post_process_layer in backbone/utils/transformer.py).
+        emb_out = pre_process_layer(
+            emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
+
+        # Product of the mask with its own transpose: one entry per pair of
+        # sequence positions.
+        self_attn_mask = fluid.layers.matmul(
+            x=input_mask, y=input_mask, transpose_y=True)
+
+        # NOTE(review): with bias_after_scale=False this is presumably
+        # (mask - 1) * 10000, i.e. 0 for valid pairs and -10000 for padded
+        # ones -- confirm against the fluid.layers.scale documentation.
+        self_attn_mask = fluid.layers.scale(
+            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+        # Replicate the bias once per attention head along a new axis 1.
+        n_head_self_attn_mask = fluid.layers.stack(
+            x=[self_attn_mask] * self._n_head, axis=1)
+        n_head_self_attn_mask.stop_gradient = True
+
+        enc_out = encoder(
+            enc_input=emb_out,
+            attn_bias=n_head_self_attn_mask,
+            n_layer=self._n_layer,
+            n_head=self._n_head,
+            d_key=self._emb_size // self._n_head,
+            d_value=self._emb_size // self._n_head,
+            d_model=self._emb_size,
+            d_inner_hid=self._emb_size * 4,
+            prepostprocess_dropout=self._prepostprocess_dropout,
+            attention_dropout=self._attention_dropout,
+            relu_dropout=0,
+            hidden_act=self._hidden_act,
+            preprocess_cmd="",
+            postprocess_cmd="dan",
+            param_initializer=self._param_initializer,
+            name=scope_name+'encoder')
+
+        
+        # Take the encoder output at sequence position 0, flatten it to 2-D
+        # and project it through a tanh-activated fc layer.
+        next_sent_feat = fluid.layers.slice(
+            input=enc_out, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
+        next_sent_feat = fluid.layers.fc(
+            input=next_sent_feat,
+            size=self._emb_size,
+            act="tanh",
+            param_attr=fluid.ParamAttr(
+                name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr=scope_name+"pooled_fc.b_0")
+
+        # 'word_embedding' is the summed, normalized embedding fed to the
+        # encoder (emb_out), not the raw token-embedding lookup.
+        return {'embedding_table': embedding_table,
+                'word_embedding': emb_out,
+                'encoder_outputs': enc_out,
+                'sentence_embedding': next_sent_feat,
+                'sentence_pair_embedding': next_sent_feat}
+
+    def postprocess(self, rt_outputs):
+        """No-op: ``rt_outputs`` is ignored and None is returned."""
+        pass
+
+
--- a/backbone/ernie.py
+++ b/backbone/ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+
+from paddle import fluid
+from paddle.fluid import layers
+
+from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
+from paddlepalm.interface import backbone
+
+
+class Model(backbone):
+    """ERNIE backbone.
+
+    Same structure as the BERT backbone in this commit, with an additional
+    task-type embedding added to the word/position/segment embeddings before
+    the transformer encoder.
+    """
+
+    def __init__(self,
+                 config,
+                 phase):
+        """Read the model hyper-parameters from ``config``.
+
+        Args:
+            config: object indexed by string key. The keys read here are
+                hidden_size, num_hidden_layers, num_attention_heads,
+                vocab_size, max_position_embeddings, sent_type_vocab_size
+                (falling back to type_vocab_size when falsy),
+                task_type_vocab_size, hidden_act, hidden_dropout_prob,
+                attention_probs_dropout_prob and initializer_range.
+            phase: accepted but not used by this constructor.
+        """
+
+        # self._is_training = phase == 'train' # A backbone generally does not need to care about the run phase, because its outputs basically do not change between phases.
+
+        self._emb_size = config['hidden_size']
+        self._n_layer = config['num_hidden_layers']
+        self._n_head = config['num_attention_heads']
+        self._voc_size = config['vocab_size']
+        self._max_position_seq_len = config['max_position_embeddings']
+        # NOTE(review): if config is a plain dict, a missing
+        # 'sent_type_vocab_size' key raises KeyError here instead of falling
+        # through to 'type_vocab_size' -- confirm the config type returns a
+        # falsy value for absent keys.
+        if config['sent_type_vocab_size']:
+            self._sent_types = config['sent_type_vocab_size']
+        else:
+            self._sent_types = config['type_vocab_size']
+
+        self._task_types = config['task_type_vocab_size']
+
+        self._hidden_act = config['hidden_act']
+        self._prepostprocess_dropout = config['hidden_dropout_prob']
+        self._attention_dropout = config['attention_probs_dropout_prob']
+
+        # Base names of the embedding parameters; build() prepends scope_name.
+        self._word_emb_name = "word_embedding"
+        self._pos_emb_name = "pos_embedding"
+        self._sent_emb_name = "sent_embedding"
+        self._task_emb_name = "task_embedding"
+        self._emb_dtype = "float32"
+
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=config['initializer_range'])
+
+    @property
+    def inputs_attr(self):
+        """Return a dict mapping each required input name to ``[shape, dtype]``."""
+        return {"token_ids": [[-1, -1, 1], 'int64'],
+                "position_ids": [[-1, -1, 1], 'int64'],
+                "segment_ids": [[-1, -1, 1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32'],
+                "task_ids": [[-1,-1, 1], 'int64']}
+
+    @property
+    def outputs_attr(self):
+        """Return a dict mapping each output name to ``[shape, dtype]``."""
+        # NOTE(review): "embedding_table" is declared with a leading -1 dim,
+        # while build() creates the word-embedding parameter with
+        # size=[voc_size, emb_size] -- confirm the declared rank is intended.
+        return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
+                "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
+                "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
+                "sentence_embedding": [[-1, self._emb_size], 'float32'],
+                "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
+
+    def build(self, inputs, scope_name=""):
+        """Add the backbone ops to the default main program.
+
+        Args:
+            inputs: dict providing 'token_ids', 'position_ids', 'segment_ids',
+                'input_mask' and 'task_ids'.
+            scope_name: string prepended to the parameter and layer names
+                created here (embeddings, 'pre_encoder', 'encoder',
+                'pooled_fc.*').
+
+        Returns:
+            dict with keys 'embedding_table', 'word_embedding',
+            'encoder_outputs', 'sentence_embedding' and
+            'sentence_pair_embedding'. The last two refer to the same
+            variable.
+        """
+
+        src_ids = inputs['token_ids']
+        pos_ids = inputs['position_ids']
+        sent_ids = inputs['segment_ids']
+        input_mask = inputs['input_mask']
+        task_ids = inputs['task_ids']
+
+        # padding id in vocabulary must be set to 0
+        emb_out = fluid.layers.embedding(
+            input=src_ids,
+            size=[self._voc_size, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._word_emb_name, initializer=self._param_initializer),
+            is_sparse=False)
+
+        # Look up the word-embedding parameter by name so it can be returned
+        # to callers alongside the activations.
+        # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
+        embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
+        
+        position_emb_out = fluid.layers.embedding(
+            input=pos_ids,
+            size=[self._max_position_seq_len, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
+
+        sent_emb_out = fluid.layers.embedding(
+            sent_ids,
+            size=[self._sent_types, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
+
+        # Sum the word, position and segment embeddings.
+        emb_out = emb_out + position_emb_out
+        emb_out = emb_out + sent_emb_out
+
+        task_emb_out = fluid.layers.embedding(
+            task_ids,
+            size=[self._task_types, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+self._task_emb_name,
+                initializer=self._param_initializer))
+
+        # The task-type embedding is added on top of the other three.
+        emb_out = emb_out + task_emb_out
+
+        # 'nd' = layer normalization followed by dropout (see
+        # pre_post_process_layer in backbone/utils/transformer.py).
+        emb_out = pre_process_layer(
+            emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
+
+        # Product of the mask with its own transpose: one entry per pair of
+        # sequence positions.
+        self_attn_mask = fluid.layers.matmul(
+            x=input_mask, y=input_mask, transpose_y=True)
+
+        # NOTE(review): with bias_after_scale=False this is presumably
+        # (mask - 1) * 10000, i.e. 0 for valid pairs and -10000 for padded
+        # ones -- confirm against the fluid.layers.scale documentation.
+        self_attn_mask = fluid.layers.scale(
+            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+        # Replicate the bias once per attention head along a new axis 1.
+        n_head_self_attn_mask = fluid.layers.stack(
+            x=[self_attn_mask] * self._n_head, axis=1)
+        n_head_self_attn_mask.stop_gradient = True
+
+        enc_out = encoder(
+            enc_input=emb_out,
+            attn_bias=n_head_self_attn_mask,
+            n_layer=self._n_layer,
+            n_head=self._n_head,
+            d_key=self._emb_size // self._n_head,
+            d_value=self._emb_size // self._n_head,
+            d_model=self._emb_size,
+            d_inner_hid=self._emb_size * 4,
+            prepostprocess_dropout=self._prepostprocess_dropout,
+            attention_dropout=self._attention_dropout,
+            relu_dropout=0,
+            hidden_act=self._hidden_act,
+            preprocess_cmd="",
+            postprocess_cmd="dan",
+            param_initializer=self._param_initializer,
+            name=scope_name+'encoder')
+
+        
+        # Take the encoder output at sequence position 0, flatten it to 2-D
+        # and project it through a tanh-activated fc layer.
+        next_sent_feat = fluid.layers.slice(
+            input=enc_out, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
+        next_sent_feat = fluid.layers.fc(
+            input=next_sent_feat,
+            size=self._emb_size,
+            act="tanh",
+            param_attr=fluid.ParamAttr(
+                name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr=scope_name+"pooled_fc.b_0")
+
+        # 'word_embedding' is the summed, normalized embedding fed to the
+        # encoder (emb_out), not the raw token-embedding lookup.
+        return {'embedding_table': embedding_table,
+                'word_embedding': emb_out,
+                'encoder_outputs': enc_out,
+                'sentence_embedding': next_sent_feat,
+                'sentence_pair_embedding': next_sent_feat}
+
+    def postprocess(self, rt_outputs):
+        """No-op: ``rt_outputs`` is ignored and None is returned."""
+        pass
--- a/backbone/utils/__init__.py
+++ b/backbone/utils/__init__.py
--- a/backbone/utils/transformer.py
+++ b/backbone/utils/transformer.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer encoder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import partial
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+from paddle.fluid.layer_helper import LayerHelper as LayerHelper
+from functools import reduce # py3
+def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
+    """Layer normalization assembled from elementwise fluid ops.
+
+    Subtracts the mean and divides by the standard deviation, both reduced
+    over ``dim=begin_norm_axis`` with keep_dim=True, then applies a learned
+    scale and bias.
+
+    Args:
+        x: input variable.
+        begin_norm_axis: axis passed as ``dim`` to reduce_mean; it also marks
+            where the scale/bias shape starts (product of
+            ``shape[begin_norm_axis:]``).
+        epsilon: added to the variance before the reciprocal square root.
+        param_attr: attribute for the scale parameter (default init 1).
+        bias_attr: attribute for the bias parameter (default init 0).
+
+    Returns:
+        The normalized, scaled and shifted variable.
+    """
+    # locals() holds exactly the function arguments at this point; do not
+    # bind any local name above this line, or it will be forwarded too.
+    helper = LayerHelper('layer_norm', **locals())
+    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
+    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
+    variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
+    r_stdev = layers.rsqrt(variance + epsilon)
+    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
+
+    # NOTE(review): the statistics above reduce over the single axis
+    # begin_norm_axis, while the parameter shape covers all trailing axes;
+    # the two agree when begin_norm_axis is the last axis, which is how
+    # pre_post_process_layer in this file calls it.
+    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
+    param_dtype = norm_x.dtype
+    scale = helper.create_parameter(
+        attr=param_attr,
+        shape=param_shape,
+        dtype=param_dtype,
+        default_initializer=fluid.initializer.Constant(1.))
+    bias = helper.create_parameter(
+        attr=bias_attr,
+        shape=param_shape,
+        dtype=param_dtype,
+        is_bias=True,
+        default_initializer=fluid.initializer.Constant(0.))
+
+    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
+    out = layers.elementwise_add(x=out, y=bias, axis=-1)
+
+    return out
+
+
+def multi_head_attention(queries,
+                         keys,
+                         values,
+                         attn_bias,
+                         d_key,
+                         d_value,
+                         d_model,
+                         n_head=1,
+                         dropout_rate=0.,
+                         cache=None,
+                         param_initializer=None,
+                         name='multi_head_att'):
+    """
+    Multi-Head Attention. Note that attn_bias is added to the logit before
+    computing softmax activation to mask certain selected positions so that
+    they will not be considered in attention weights.
+
+    Args:
+        queries: 3-D variable.
+        keys: 3-D variable, or None to reuse ``queries`` (self-attention).
+        values: 3-D variable, or None to reuse ``keys``.
+        attn_bias: variable added to the attention logits, or None to skip
+            the addition.
+        d_key: per-head size of the query/key projections.
+        d_value: per-head size of the value projection.
+        d_model: size of the output projection.
+        n_head: number of attention heads.
+        dropout_rate: dropout probability applied to the attention weights;
+            a falsy value disables dropout.
+        cache: optional dict with entries "k" and "v" holding the keys and
+            values of previous time steps; it is updated in place.
+        param_initializer: initializer for the projection weights.
+        name: prefix of the parameter names created here.
+
+    Returns:
+        The attention output projected to ``d_model``.
+
+    Raises:
+        ValueError: if queries, keys and values are not all 3-D.
+    """
+    keys = queries if keys is None else keys
+    values = keys if values is None else values
+
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs: quries, keys and values should all be 3-D tensors.")
+
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+        """
+        Add linear projection to queries, keys, and values.
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_query_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_query_fc.b_0')
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_key_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_key_fc.b_0')
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_value_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_value_fc.b_0')
+        return q, k, v
+
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions and then transpose. Specifically, input a tensor with shape
+        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        """
+        hidden_size = x.shape[-1]
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        reshaped = layers.reshape(
+            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
+
+        # permute the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of input tensor x
+        so that it becomes one dimension, which is reverse to __split_heads.
+        """
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        return layers.reshape(
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=True)
+
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        # Compare against None explicitly: attn_bias is a graph variable, and
+        # its truth value must not decide whether the bias is applied.
+        if attn_bias is not None:
+            product += attn_bias
+        weights = layers.softmax(product)
+        if dropout_rate:
+            # is_test=False is hard-coded here, independent of the run phase.
+            weights = layers.dropout(
+                weights,
+                dropout_prob=dropout_rate,
+                dropout_implementation="upscale_in_train",
+                is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+    if cache is not None:  # use cache and concat time steps
+        # Since the inplace reshape in __split_heads changes the shape of k and
+        # v, which is the cache input for next time step, reshape the cache
+        # input from the previous time step first.
+        k = cache["k"] = layers.concat(
+            [layers.reshape(
+                cache["k"], shape=[0, 0, d_model]), k], axis=1)
+        v = cache["v"] = layers.concat(
+            [layers.reshape(
+                cache["v"], shape=[0, 0, d_model]), v], axis=1)
+
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+                                                  dropout_rate)
+
+    out = __combine_heads(ctx_multiheads)
+
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         num_flatten_dims=2,
+                         param_attr=fluid.ParamAttr(
+                             name=name + '_output_fc.w_0',
+                             initializer=param_initializer),
+                         bias_attr=name + '_output_fc.b_0')
+    return proj_out
+
+
+def positionwise_feed_forward(x,
+                              d_inner_hid,
+                              d_hid,
+                              dropout_rate,
+                              hidden_act,
+                              param_initializer=None,
+                              name='ffn'):
+    """
+    Position-wise Feed-Forward Networks.
+    This module consists of two linear transformations with the activation
+    named by hidden_act in between, applied with num_flatten_dims=2.
+
+    Args:
+        x: input variable.
+        d_inner_hid: output size of the first fc layer.
+        d_hid: output size of the second fc layer.
+        dropout_rate: dropout probability applied after the first layer; a
+            falsy value disables dropout.
+        hidden_act: activation passed as ``act`` to the first fc layer.
+        param_initializer: initializer for both weight matrices.
+        name: prefix of the parameter names ('_fc_0.*', '_fc_1.*').
+
+    Returns:
+        Output of the second fc layer.
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act=hidden_act,
+                       param_attr=fluid.ParamAttr(
+                           name=name + '_fc_0.w_0',
+                           initializer=param_initializer),
+                       bias_attr=name + '_fc_0.b_0')
+    if dropout_rate:
+        # is_test=False is hard-coded here, independent of the run phase.
+        hidden = layers.dropout(
+            hidden,
+            dropout_prob=dropout_rate,
+            dropout_implementation="upscale_in_train",
+            is_test=False)
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=fluid.ParamAttr(
+                        name=name + '_fc_1.w_0', initializer=param_initializer),
+                    bias_attr=name + '_fc_1.b_0')
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
+                           name=''):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+
+    Args:
+        prev_out: variable added to ``out`` by the "a" command, or None to
+            make "a" a no-op.
+        out: variable to process.
+        process_cmd: string of command characters applied left to right:
+            "a" (add residual), "n" (layer normalization), "d" (dropout).
+            Other characters are ignored.
+        dropout_rate: dropout probability for "d"; a falsy value disables it.
+        name: prefix of the layer-norm parameter names.
+
+    Returns:
+        The processed variable (``out`` itself when process_cmd is empty).
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            # Compare against None explicitly: prev_out is a graph variable,
+            # and its truth value must not decide whether the residual is
+            # added.
+            if prev_out is not None:
+                out = out + prev_out
+        elif cmd == "n":  # add layer normalization
+            out_dtype = out.dtype
+            # Normalize in float32 and cast back when the input is float16.
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float32")
+            out = layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_scale',
+                    initializer=fluid.initializer.Constant(1.)),
+                bias_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_bias',
+                    initializer=fluid.initializer.Constant(0.)))
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float16")
+        elif cmd == "d":  # add dropout
+            if dropout_rate:
+                # is_test=False is hard-coded here, independent of the run
+                # phase.
+                out = layers.dropout(
+                    out,
+                    dropout_prob=dropout_rate,
+                    dropout_implementation="upscale_in_train",
+                    is_test=False)
+    return out
+
+
+# pre_process_layer fixes prev_out=None, so it is called as
+# pre_process_layer(out, process_cmd, dropout_rate, name=...).
+pre_process_layer = partial(pre_post_process_layer, None)
+# post_process_layer is the unbound function and takes prev_out explicitly.
+post_process_layer = pre_post_process_layer
+
+
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  prepostprocess_dropout,
+                  attention_dropout,
+                  relu_dropout,
+                  hidden_act,
+                  preprocess_cmd="n",
+                  postprocess_cmd="da",
+                  param_initializer=None,
+                  name=''):
+    """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention followed by
+    position-wise feed-forward networks, and both components are accompanied
+    by the post_process_layer to add residual connection, layer normalization
+    and dropout.
+
+    Args:
+        enc_input: input variable of the layer.
+        attn_bias: bias forwarded to multi_head_attention.
+        n_head, d_key, d_value, d_model: attention sizes forwarded to
+            multi_head_attention.
+        d_inner_hid: hidden size of the feed-forward sub-layer.
+        prepostprocess_dropout: dropout rate used by the pre/post-process
+            steps.
+        attention_dropout: dropout rate for the attention weights.
+        relu_dropout: dropout rate inside the feed-forward sub-layer.
+        hidden_act: activation of the feed-forward sub-layer.
+        preprocess_cmd: command string applied before each sub-layer.
+        postprocess_cmd: command string applied after each sub-layer.
+        param_initializer: initializer for the weights created here.
+        name: prefix of every parameter name created by this layer.
+
+    Returns:
+        Output variable of the layer.
+    """
+    # Self-attention: keys and values are None, so they default to the
+    # pre-processed queries inside multi_head_attention.
+    attn_output = multi_head_attention(
+        pre_process_layer(
+            enc_input,
+            preprocess_cmd,
+            prepostprocess_dropout,
+            name=name + '_pre_att'),
+        None,
+        None,
+        attn_bias,
+        d_key,
+        d_value,
+        d_model,
+        n_head,
+        attention_dropout,
+        param_initializer=param_initializer,
+        name=name + '_multi_head_att')
+    # enc_input is the residual (prev_out) for the attention sub-layer.
+    attn_output = post_process_layer(
+        enc_input,
+        attn_output,
+        postprocess_cmd,
+        prepostprocess_dropout,
+        name=name + '_post_att')
+    ffd_output = positionwise_feed_forward(
+        pre_process_layer(
+            attn_output,
+            preprocess_cmd,
+            prepostprocess_dropout,
+            name=name + '_pre_ffn'),
+        d_inner_hid,
+        d_model,
+        relu_dropout,
+        hidden_act,
+        param_initializer=param_initializer,
+        name=name + '_ffn')
+    # attn_output is the residual (prev_out) for the feed-forward sub-layer.
+    return post_process_layer(
+        attn_output,
+        ffd_output,
+        postprocess_cmd,
+        prepostprocess_dropout,
+        name=name + '_post_ffn')
+
+
+def encoder(enc_input,
+            attn_bias,
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            prepostprocess_dropout,
+            attention_dropout,
+            relu_dropout,
+            hidden_act,
+            preprocess_cmd="n",
+            postprocess_cmd="da",
+            param_initializer=None,
+            name=''):
+    """
+    The encoder is composed of a stack of identical layers returned by calling
+    encoder_layer.
+
+    Args:
+        enc_input: input variable of the first layer.
+        attn_bias: bias forwarded to every layer's attention.
+        n_layer: number of stacked layers; with 0 only the final
+            pre_process_layer step is applied to enc_input.
+        n_head, d_key, d_value, d_model, d_inner_hid: sizes forwarded to
+            encoder_layer.
+        prepostprocess_dropout, attention_dropout, relu_dropout: dropout
+            rates forwarded to encoder_layer.
+        hidden_act: activation of the feed-forward sub-layers.
+        preprocess_cmd: command string applied before each sub-layer and once
+            more to the output of the last layer.
+        postprocess_cmd: command string applied after each sub-layer.
+        param_initializer: initializer for the weights created here.
+        name: prefix of the per-layer parameter names
+            (name + '_layer_' + index).
+
+    Returns:
+        Output variable of the last layer after the final pre-process step.
+    """
+    # Seed the running output with the input so that n_layer == 0 no longer
+    # leaves enc_output unbound.
+    enc_output = enc_input
+    for i in range(n_layer):
+        enc_output = encoder_layer(
+            enc_output,
+            attn_bias,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            prepostprocess_dropout,
+            attention_dropout,
+            relu_dropout,
+            hidden_act,
+            preprocess_cmd,
+            postprocess_cmd,
+            param_initializer=param_initializer,
+            name=name + '_layer_' + str(i))
+    # NOTE(review): this name is not prefixed with `name`, unlike the
+    # per-layer names above; it is kept as-is because it determines parameter
+    # names whenever preprocess_cmd contains "n".
+    enc_output = pre_process_layer(
+        enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
+
+    return enc_output
--- a/demo/demo2/config.yaml
+++ b/demo/demo2/config.yaml
-task_instance: "mrqa, mlm4mrqa, match4mrqa"
+task_instance: "mrqa, mlm4mrqa, match4mrqa"
 target_tag: 1, 0, 0
 mix_ratio: 1.0, 0.5, 0.5


--- a/demo/demo2/paddlepalm
+++ b/demo/demo2/paddlepalm
+../../paddlepalm/
\ No newline at end of file
--- a/demo/demo2/run.py
+++ b/demo/demo2/run.py
 import paddlepalm as palm

 if __name__ == '__main__':
-    controller = palm.Controller('config.yaml', task_dir='tasks')
+
+    # NOTE(review): the statements below read as an API design sketch rather
+    # than a runnable script: several names are used without being bound in
+    # the visible lines, and one statement is syntactically incomplete.
+    max_seqlen = 512
+    batch_size = 32
+
+    # NOTE(review): train_file and vocab are not bound in the visible lines.
+    match_reader = palm.reader.match(train_file, vocab, \
+        max_seqlen, file_format='csv', tokenizer='wordpiece', \
+        lang='en', shuffle_train=True)
+    mrc_reader = palm.reader.mrc(train_file, phase='train')
+    mlm_reader = palm.reader.mlm(train_file, phase='train')
+    # NOTE(review): incomplete attribute access -- this line is a SyntaxError.
+    palm.reader.
+
+    # NOTE(review): `match` is built with tasktype.cls and `mrc` with
+    # tasktype.match -- confirm the pairing is intended.
+    match = palm.tasktype.cls(num_classes=4)
+    mrc = palm.tasktype.match(learning_strategy='pairwise')
+    mlm = palm.tasktype.mlm()
+    mlm.print()
+
+    
+    bb_flags = palm.load_json('./pretrain/ernie/ernie_config.json')
+    # NOTE(review): 'xx' and xxx look like placeholders; xxx is not bound.
+    bb = palm.backbone.ernie(bb_flags['xx'], xxx)
+    bb.print()
+
+    # NOTE(review): match_tt is not bound in the visible lines, and both
+    # tasks below are created with identical arguments, including the name
+    # 'match4mrqa'.
+    match4mrqa = palm.Task('match4mrqa', match_reader, match_tt)
+    mrc4mrqa = palm.Task('match4mrqa', match_reader, match_tt)
+
+    # match4mrqa.reuse_with(mrc4mrqa)
+
+
+    # NOTE(review): mrqa and mlm4mrqa are not bound in the visible lines.
+    controller = palm.Controller([mrqa, match4mrqa, mlm4mrqa])
+
+    loss = controller.build_forward(bb, mask_task=[])
+
+    n_steps = controller.estimate_train_steps(basetask=mrqa, num_epochs=2, batch_size=8, dev_count=4)
+    adam = palm.optimizer.Adam(loss)
+    # NOTE(review): learning_rate is not bound in the visible lines.
+    sched = palm.schedualer.LinearWarmup(learning_rate, max_train_steps=n_steps, warmup_steps=0.1*n_steps)
+    
+    controller.build_backward(optimizer=adam, schedualer=sched, weight_decay=0.001, use_ema=True, ema_decay=0.999)
+
+    controller.random_init_params()
    controller.load_pretrain('../../pretrain_model/ernie/params')
    controller.train()

-    controller = palm.Controller(config='config.yaml', task_dir='tasks', for_train=False)
-    controller.pred('mrqa', inference_model_dir='output_model/secondrun/mrqa/infer_model')
+
+
+
+
+    # controller = palm.Controller(config='config.yaml', task_dir='tasks', for_train=False)
+    # controller.pred('mrqa', inference_model_dir='output_model/secondrun/mrqa/infer_model')


--- a/demo/demo3/paddlepalm
+++ b/demo/demo3/paddlepalm
+../../paddlepalm/
\ No newline at end of file
--- a/demo/demo3/pretrain
+++ b/demo/demo3/pretrain
+../../pretrain/
\ No newline at end of file
--- a/demo/demo3/run.py
+++ b/demo/demo3/run.py
+# coding=utf-8
 import paddlepalm as palm
+import json

 if __name__ == '__main__':
-    controller = palm.Controller('config.yaml', task_dir='tasks')
-    controller.load_pretrain('../../pretrain_model/ernie/params')
-    controller.train()
+
+    max_seqlen = 512
+    batch_size = 4
+    num_epochs = 2
+    lr = 1e-3
+    vocab_path = './pretrain/ernie/vocab.txt'
+
+    train_file = './data/cls4mrqa/train.tsv'
+
+    config = json.load(open('./pretrain/ernie/ernie_config.json'))
+    # ernie = palm.backbone.ERNIE(...)
+    ernie = palm.backbone.ERNIE.from_config(config)
+    # pred_ernie = palm.backbone.ERNIE.from_config(config, phase='pred')
+
+    # cls_reader2 = palm.reader.cls(train_file_topic, vocab_path, batch_size, max_seqlen)
+    # cls_reader3 = palm.reader.cls(train_file_subj, vocab_path, batch_size, max_seqlen)
+    # topic_trainer = palm.Trainer('topic_cls', cls_reader2, cls)
+    # subj_trainer = palm.Trainer('subj_cls', cls_reader3, cls)
+
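+    # Create the reader for this classification task; its parameters control the dataset input format, the number of files, the preprocessing rules, etc.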
+    cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen)
+    print(cls_reader.outputs_attr)
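+    # Different backbones require different features from a task reader. For a classification task, for example, the basic input
+    # features are token_ids and label_ids, but BERT additionally requires features such as position, segment and input_mask to be
+    # extracted from the input. After register_with, the reader automatically supplies the fields required by the backbone.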
+    cls_reader.register_with(ernie)
+    print(cls_reader.outputs_attr)
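+    # Create the task head, e.g. classification, matching or machine reading comprehension. Each task head has required/optional
+    # arguments specific to its task. Note that task heads are decoupled from readers: a head is valid as long as the dataset-side
+    # fields it depends on can be provided by the reader.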
+    cls_head = palm.head.Classify(4, 1024, 0.1)
+    # cls_pred_head = palm.head.Classify(4, 1024, 0.1, phase='pred')
+
+    # 根据reader和任务头来创建一个训练器trainer，trainer代表了一个训练任务，内部维护着训练进程、和任务的关键信息，并完成合法性校验，该任务的模型保存、载入等相关规则控制
+    trainer = palm.Trainer('senti_cls', cls_reader, cls_head)
+
+    # match4mrqa.reuse_head_with(mrc4mrqa)
+
+    # data_vars = cls_reader.build()
+    # output_vars = ernie.build(data_vars)
+    # cls_head.build({'backbone': output_vars, 'reader': data_vars})
+
+    loss_var = trainer.build_forward(ernie)
+
+    # controller.build_forward()
+    # Error! a head/backbone can be only build once! Try NOT to call build_forward method for any Trainer!
+
+    print(trainer.num_examples)
+    iterator_fn = trainer.load_data(train_file, 'csv', num_epochs=num_epochs, batch_size=batch_size)
+    print(trainer.num_examples)
+
+    n_steps = trainer.num_examples * num_epochs // batch_size
+    warmup_steps = int(0.1 * n_steps)
+    print(warmup_steps)
+    sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
+
+    adam = palm.optimizer.Adam(loss_var, lr, sched)
+
+    trainer.build_backward(optimizer=adam, weight_decay=0.001)
+
+    trainer.random_init_params()
+    trainer.load_pretrain('pretrain/ernie/params')
+
+    # print(trainer.train_one_step(next(iterator_fn())))
+    # trainer.train_one_epoch()
+    trainer.train(iterator_fn, print_steps=1, save_steps=5, save_path='outputs/ckpt')
+    # trainer.save()
+
+
+
+
+
+
+
+
+
+    # controller = palm.Controller([mrqa, match4mrqa, mlm4mrqa])
+
+    # loss = controller.build_forward(bb, mask_task=[])
+
+    # n_steps = controller.estimate_train_steps(basetask=mrqa, num_epochs=2, batch_size=8, dev_count=4)
+    # adam = palm.optimizer.Adam(loss)
+    # sched = palm.schedualer.LinearWarmup(learning_rate, max_train_steps=n_steps, warmup_steps=0.1*n_steps)
+    # 
+    # controller.build_backward(optimizer=adam, schedualer=sched, weight_decay=0.001, use_ema=True, ema_decay=0.999)
+
+    # controller.random_init_params()
+    # controller.load_pretrain('../../pretrain_model/ernie/params')
+    # controller.train()
+
+
+
+
+
+    # controller = palm.Controller(config='config.yaml', task_dir='tasks', for_train=False)
+    # controller.pred('mrqa', inference_model_dir='output_model/secondrun/mrqa/infer_model')
+

--- a/demo/demo3/run.sh
+++ b/demo/demo3/run.sh
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=3

 python run.py

--- a/interface.py
+++ b/interface.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""v1.1"""
+
+class reader(object):
+    """Interface of the data manager (dataset reader)."""
+
+    def __init__(self, config):
+        assert isinstance(config, dict)
+
+    # @property
+    # def inputs_attr(self):
+    #     """Describe the attributes of the reader's input objects: the name, shape and dtype of each object. For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of variable length is written as -1.
+    #     Return:
+    #         dict. Attribute description of each input object. For example,
+    #         a text classification task may need the input text and the id of its label
+    #             {"text": ([], 'str'),
+    #              "label": ([], 'int')}
+    #         a tagging task may need the token sequence and the corresponding tags
+    #             {"tokens", ([-1], 'str'),
+    #              "tags", ([-1], 'str')}
+    #         a machine reading comprehension task may need the context, the question, the answer, the start/end position of the answer span, etc.
+    #             {"paragraph", ([], 'str'),
+    #              "question", ([], 'str'),
+    #              "start_position", ([], 'int')
+    #         """
+    #     raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the objects yielded by the reader: the name, shape and dtype of each object.
+
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+        Note: when mini-batch gradient descent is used, regular input objects should carry a
+        batch_size dimension (usually -1).
+
+        Return:
+            dict. Attribute description of each yielded object. For example,
+            for text classification and matching tasks the yielded content may contain the following
+            objects (downstream backbones and tasks access them as needed)
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "input_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32'),
+                 "label": ([-1], 'int')}
+        """
+        raise NotImplementedError()
+
+    # def parse_line(self):
+    #     """The framework describes every example with a dict whose keys come from inputs_attr and whose values satisfy the corresponding attr description.
+    #         This function parses a text line into such a dict-typed example. The default parse_line reads a dataset file in json format, where each line is one example described in json.
+    #         Users may override this method to adapt to other dataset formats, e.g. csv or even tfrecord files.
+    #         """
+    #     raise NotImplementedError()
+    # 
+    # def tokenize(self, line):
+    #     """The framework ships built-in tokenizers such as the word piece tokenizer; users select one through the tokenizer hyper-parameter. If none of the built-in tokenizers is sufficient, users may override this method to provide a custom tokenizer.
+    #         Args:
+    #             - line: a unicode string. 
+    #         Return:
+    #             a list of tokens
+    #         """
+    #     raise NotImplementedError()
+    
+    def iterator(self):
+        """Dataset iteration interface.
+
+        Note: when the end of the dataset is reached, the implementation should reset its position
+        automatically, i.e. start a new pass from the head of the dataset.
+
+        Yield:
+            (dict) elements that meet the requirements of the output template
+            (presumably the description returned by outputs_attr - TODO confirm)
+        """
+        raise NotImplementedError()
+
+    @property
+    def num_examples(self):
+        """Number of examples in the dataset, i.e. the number of examples the iterator produces per epoch.
+
+        Note: when a strategy that may change the number of examples is used (e.g. sliding windows),
+        this property should return the actual number of examples at runtime.
+        """
+        raise NotImplementedError()
+
+
+
+class backbone(object):
+    """interface of backbone model."""
+
+    def __init__(self, config, phase):
+        """
+        Args:
+            config: dict. Hyper-parameters defined in the multi-task config file plus the pretrained-model config file.
+            phase: str. Running phase; train and predict are currently supported.
+            """
+        assert isinstance(config, dict)
+
+    @property
+    def inputs_attr(self):
+        """Describe the attributes of the input objects the backbone needs from the reader: the name, shape and dtype of each object.
+
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each input object. For example,
+            for text classification and matching tasks, a bert backbone mainly depends on the
+            following reader objects
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "input_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32')}"""
+        raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the backbone's output objects: the name, shape and dtype of each object.
+
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each output object. For example,
+            for text classification and matching tasks, the output of a bert backbone may contain
+            the following objects
+                {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
+                 "sentence_emb": ([-1, hidden_size], 'float32'),
+                 "sim_vec": ([-1, hidden_size], 'float32')}""" 
+        raise NotImplementedError()
+
+    def build(self, inputs):
+        """Build the computation graph of the backbone.
+
+        Maps static-graph Variable inputs that match inputs_attr to static-graph Variable outputs
+        that match outputs_attr.
+
+        Args:
+            inputs: dict. Maps the object names in inputs_attr to computation-graph Variables; it contains at least the objects defined in inputs_attr.
+        Return:
+           The computation-graph variables to output. They are added to the fetch_list so that their runtime values are obtained at every training/inference step; those values are passed to the postprocess method for user processing.
+            """
+        raise NotImplementedError()
+
+
+
+
+class task_paradigm(object):
+    """Interface of a task paradigm (task layer / head) that consumes reader and backbone outputs."""
+
+    def __init__(self, config, phase, backbone_config):
+        """
+            config: dict. Hyper-parameters defined in the task instance plus the multi-task config file.
+            phase: str. Running phase; train and predict are currently supported.
+            backbone_config: not described by the original documentation - presumably the backbone's hyper-parameters; TODO confirm.
+            """
+
+    @property
+    def inputs_attrs(self):
+        """Describe the attributes of the input objects the task layer reads from input object sets such as the reader and the backbone.
+
+        The first-level key is the name of the object set (backbone, reader, ...; more flexible
+        inputs will be supported later). The second level describes each object in that set: its
+        name, shape and dtype. For a scalar object (str, int, float, ...) the shape is the empty
+        list []; a dimension of variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each object set and its input objects."""
+        raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the task's output objects: name, shape and dtype.
+
+        The output objects are added to the fetch_list so that their runtime values are obtained
+        at every training/inference step; those values are passed to the postprocess method for
+        user processing.
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each output object. Note: in the training phase an output object named loss must be included.
+            """
+
+        raise NotImplementedError()
+
+    @property
+    def epoch_inputs_attrs(self):
+        # Returns an empty dict by default.
+        return {}
+
+    def build(self, inputs, scope_name=""):
+        """Build the computation graph of the task layer.
+
+        Maps the static-graph Variables coming from the object sets described by inputs_attrs to
+        static-graph Variable outputs that match outputs_attr.
+
+        Args:
+            inputs: dict. Maps the object names in inputs_attrs to computation-graph Variables; it contains at least the objects defined in inputs_attrs.
+        Return:
+           The computation-graph variables to output. They are added to the fetch_list so that their runtime values are obtained at every training/inference step; those values are passed to the postprocess method for user processing.
+
+        """
+        raise NotImplementedError()
+
+    def postprocess(self, rt_outputs):
+        """Post-process the task layer's runtime results for the current batch after each training or inference step. Note: besides the outputs of the build method, rt_outputs automatically includes the computed loss."""
+        pass
+        
+    def epoch_postprocess(self, post_inputs):
+        # No-op by default.
+        pass
+
--- a/paddlepalm/__init__.py
+++ b/paddlepalm/__init__.py
+import downloader
+# from mtl_controller import Controller 
+import controller
+import optimizer
+import lr_sched
+import backbone
+import reader
+import head

-import sys
-from paddlepalm.mtl_controller import Controller
-sys.path.append('paddlepalm')

+from trainer import Trainer
+
+del interface
+del task_instance
+del default_settings
+del utils
--- a/paddlepalm/_downloader.py
+++ b/paddlepalm/_downloader.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import tarfile
+import shutil
+try:
+    from urllib.request import urlopen # Python 3
+except ImportError:
+    from urllib2 import urlopen # Python 2
+
+
+import ssl
+
+__all__ = ["download", "ls"]
+
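+# for https: NOTE this replaces the default HTTPS context with an unverified one, i.e. TLS certificates are NOT verified for requests that use the default context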
+ssl._create_default_https_context = ssl._create_unverified_context
+
+_items = {
+    'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
+                 'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
+                 'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
+                 'utils': None},
+    'reader': {'utils': None},
+    'backbone': {'utils': None},
+    'tasktype': {'utils': None},
+}
+
+def _download(item, scope, path, silent=False):
+    """Download one registered resource; pretrained models are also unpacked and converted.
+
+    Args:
+        item: top-level key of `_items` (e.g. 'pretrain').
+        scope: key inside `_items[item]` naming the resource to fetch.
+        path: root directory; files are stored under `<path>/<item>/<scope>`.
+        silent: when True, no progress output is printed.
+
+    Returns:
+        None. Returns immediately when no URL is registered for (item, scope).
+    """
+    data_url = _items[item][scope]
+    if data_url is None:
+        return
+    if not silent:
+        print('Downloading {}: {} from {}...'.format(item, scope, data_url))
+    data_dir = path + '/' + item + '/' + scope
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    data_name = data_url.split('/')[-1]
+    filename = data_dir + '/' + data_name
+
+    # print progress
+    def _chunk_report(bytes_so_far, total_size):
+        if silent:
+            return
+        if total_size:
+            percent = min(float(bytes_so_far) / float(total_size), 1.0)
+            print('\r>> Downloading... {:.1%}'.format(percent), end="")
+        else:
+            # The server did not announce a size, so only the byte count can be shown.
+            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
+
+    # copy to local
+    def _chunk_read(response, url, chunk_size=16 * 1024, report_hook=None):
+        # `.get` is available on the header object of both Python 2 and Python 3,
+        # whereas `.getheader` only exists on Python 2.
+        content_length = response.info().get('Content-Length')
+        total_size = int(content_length.strip()) if content_length else None
+        bytes_so_far = 0
+        with open(filename, "wb") as f:
+            while True:
+                chunk = response.read(chunk_size)
+                if not chunk:
+                    break
+                f.write(chunk)
+                bytes_so_far += len(chunk)
+                if report_hook:
+                    report_hook(bytes_so_far, total_size)
+        return bytes_so_far
+
+    response = urlopen(data_url)
+    try:
+        _chunk_read(response, data_url, report_hook=_chunk_report)
+    finally:
+        # The Python 2 response object is not a context manager, so close it explicitly.
+        response.close()
+
+    if not silent:
+        print(' done!')
+
+    if item == 'pretrain':
+        if not silent:
+            print('Extracting {}...'.format(data_name), end=" ")
+        if os.path.exists(filename):
+            # NOTE(security): extractall trusts the member paths stored in the archive;
+            # only archives from trusted URLs should reach this point.
+            with tarfile.open(filename, 'r') as tar:
+                tar.extractall(path=data_dir)
+            os.remove(filename)
+        if scope.startswith('bert'):
+            # The bert archives unpack into a sub-directory named after the archive;
+            # move its content up into data_dir and drop the emptied directory.
+            source_path = data_dir + '/' + data_name.split('.')[0]
+            for entry in os.listdir(source_path):
+                shutil.move(os.path.join(source_path, entry), data_dir)
+            os.removedirs(source_path)
+        if not silent:
+            print('done!')
+        if not silent:
+            print('Converting params...', end=" ")
+        _convert(data_dir, silent)
+        if not silent:
+            print('done!')
+
+
+def _convert(path, silent=False):
+    """Pack the parameter files under `<path>/params/` into a tar archive plus an info file.
+
+    After conversion `<path>/params/` contains `__palmmodel__` (a tar archive in which
+    every original file is stored as `__paddlepalm_<name>`) and `__palminfo__` (the
+    archived names). Nothing is done when `__palminfo__` already exists or when there is
+    no `params` directory.
+
+    Args:
+        path: directory that contains the `params` sub-directory.
+        silent: when True, the 'already converted.' message is not printed.
+    """
+    if os.path.isfile(path + '/params/__palminfo__'):
+        if not silent:
+            print('already converted.')
+        return
+    if not os.path.exists(path + '/params/'):
+        return
+
+    # Move the raw files aside so that a fresh `params` directory can hold the results.
+    os.rename(path + '/params/', path + '/params1/')
+    os.mkdir(path + '/params/')
+    # Context managers guarantee both outputs are closed even if archiving fails midway.
+    with tarfile.open(path + '/params/' + '__palmmodel__', 'w') as tar_model, \
+            open(path + '/params/' + '__palminfo__', 'w') as tar_info:
+        for root, dirs, files in os.walk(path + '/params1/'):
+            for fname in files:
+                src_file = os.path.join(root, fname)
+                tar_model.add(src_file, '__paddlepalm_' + fname)
+                # NOTE(review): names are written back-to-back without a separator,
+                # exactly as before - confirm the consumer of __palminfo__ expects this.
+                tar_info.write('__paddlepalm_' + fname)
+                os.remove(src_file)
+    os.removedirs(path + '/params1/')
+
+def download(item, scope='all', path='.'):
+    """Download the resources registered under `item`.
+
+    Args:
+        item: key of `_items` (case-insensitive).
+        scope: name of a single resource of that item, or 'all' for every registered one
+            (case-insensitive).
+        path: root directory handed to `_download`.
+    """
+    item, scope = item.lower(), scope.lower()
+    assert item in _items, '{} is not found. Support list: {}'.format(item, list(_items.keys()))
+    scopes = _items[item]
+
+    # The 'utils' entry, when it has a URL, is always fetched first and silently.
+    if scopes['utils'] is not None:
+        _download(item, 'utils', path, silent=True)
+
+    if scope == 'all':
+        for name in scopes.keys():
+            _download(item, name, path)
+        return
+
+    assert scope in scopes, '{} is not found. Support scopes: {}'.format(scope, list(scopes.keys()))
+    _download(item, scope, path)
+
+
+def _ls(item, scope, l = 10):
+    """Print the scopes registered for `item`.
+
+    With scope == 'all' every scope except 'utils' is listed; otherwise the given scope is
+    validated and echoed. The `l` argument is accepted but not used.
+    """
+    if scope == 'all':
+        for name in _items[item].keys():
+            if name != 'utils':
+                print ('  => '+name)
+        return
+    assert scope in _items[item], '{} is not found. Support scopes: {}'.format(scope, list(_items[item].keys()))
+    print ('{}'.format(scope))
+
+def ls(item='all', scope='all'):
+    """Print the downloadable resources of one item, or of every item when item == 'all'.
+
+    Nothing is printed when scope == 'utils'.
+    """
+    if scope == 'utils':
+        return
+
+    if item == 'all':
+        # Longest item name; forwarded to _ls as its third argument.
+        width = max(len(key) for key in _items.keys())
+        for key in _items.keys():
+            print ('Available {} items: '.format(key))
+            _ls(key, scope, width)
+        return
+
+    assert item in _items, '{} is not found. Support scopes: {}'.format(item, list(_items.keys()))
+    print ('Available {} items:'.format(item))
+    _ls(item, scope)
+
+
+    
--- a/paddlepalm/backbone/__init__.py
+++ b/paddlepalm/backbone/__init__.py
+
+from ernie import ERNIE
+from bert import BERT
+
--- a/paddlepalm/backbone/base_backbone.py
+++ b/paddlepalm/backbone/base_backbone.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""v1.1"""
+
+
+class BaseBackbone(object):
+    """interface of backbone model."""
+
+    def __init__(self, config, phase):
+        """
+        Args:
+            config: dict. Hyper-parameters defined in the multi-task config file plus the pretrained-model config file.
+            phase: str. Running phase; train and predict are currently supported.
+            """
+        assert isinstance(config, dict)
+
+    @property
+    def inputs_attr(self):
+        """Describe the attributes of the input objects the backbone needs from the reader: the name, shape and dtype of each object.
+
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each input object. For example,
+            for text classification and matching tasks, a bert backbone mainly depends on the
+            following reader objects
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "input_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32')}"""
+        raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the backbone's output objects: the name, shape and dtype of each object.
+
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each output object. For example,
+            for text classification and matching tasks, the output of a bert backbone may contain
+            the following objects
+                {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
+                 "sentence_emb": ([-1, hidden_size], 'float32'),
+                 "sim_vec": ([-1, hidden_size], 'float32')}""" 
+        raise NotImplementedError()
+
+    def build(self, inputs):
+        """Build the computation graph of the backbone.
+
+        Maps static-graph Variable inputs that match inputs_attr to static-graph Variable outputs
+        that match outputs_attr.
+
+        Args:
+            inputs: dict. Maps the object names in inputs_attr to computation-graph Variables; it contains at least the objects defined in inputs_attr.
+        Return:
+           The computation-graph variables to output. They are added to the fetch_list so that their runtime values are obtained at every training/inference step; those values are passed to the postprocess method for user processing.
+            """
+        raise NotImplementedError()
+
+
+
+
+class task_paradigm(object):
+    """Interface of a task paradigm (task layer / head) that consumes reader and backbone outputs."""
+
+    def __init__(self, config, phase, backbone_config):
+        """
+            config: dict. Hyper-parameters defined in the task instance plus the multi-task config file.
+            phase: str. Running phase; train and predict are currently supported.
+            backbone_config: not described by the original documentation - presumably the backbone's hyper-parameters; TODO confirm.
+            """
+
+    @property
+    def inputs_attrs(self):
+        """Describe the attributes of the input objects the task layer reads from input object sets such as the reader and the backbone.
+
+        The first-level key is the name of the object set (backbone, reader, ...; more flexible
+        inputs will be supported later). The second level describes each object in that set: its
+        name, shape and dtype. For a scalar object (str, int, float, ...) the shape is the empty
+        list []; a dimension of variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each object set and its input objects."""
+        raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the task's output objects: name, shape and dtype.
+
+        The output objects are added to the fetch_list so that their runtime values are obtained
+        at every training/inference step; those values are passed to the postprocess method for
+        user processing.
+        For a scalar object (str, int, float, ...) the shape is the empty list []; a dimension of
+        variable length is written as -1.
+
+        Return:
+            dict. Attribute description of each output object. Note: in the training phase an output object named loss must be included.
+            """
+
+        raise NotImplementedError()
+
+    @property
+    def epoch_inputs_attrs(self):
+        # Returns an empty dict by default.
+        return {}
+
+    def build(self, inputs, scope_name=""):
+        """Build the computation graph of the task layer.
+
+        Maps the static-graph Variables coming from the object sets described by inputs_attrs to
+        static-graph Variable outputs that match outputs_attr.
+
+        Args:
+            inputs: dict. Maps the object names in inputs_attrs to computation-graph Variables; it contains at least the objects defined in inputs_attrs.
+        Return:
+           The computation-graph variables to output. They are added to the fetch_list so that their runtime values are obtained at every training/inference step; those values are passed to the postprocess method for user processing.
+
+        """
+        raise NotImplementedError()
+
+    def postprocess(self, rt_outputs):
+        """Post-process the task layer's runtime results for the current batch after each training or inference step. Note: besides the outputs of the build method, rt_outputs automatically includes the computed loss."""
+        pass
+        
+    def epoch_postprocess(self, post_inputs):
+        # No-op by default.
+        pass
+
--- a/paddlepalm/backbone/bert.py
+++ b/paddlepalm/backbone/bert.py
@@ -23,12 +23,44 @@ from paddle import fluid
 from paddle.fluid import layers

 from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
-from paddlepalm.interface import backbone
+from paddlepalm.backbone.base_backbone import BaseBackbone

-    
-class Model(backbone):
-    
-    def __init__(self, config, phase):
+
+class BERT(BaseBackbone):
+
+
+    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
+          max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \
+          attention_probs_dropout_prob, initializer_range, phase='train'):
+        """Create a BERT backbone from explicit hyper-parameters.
+
+        The arguments are collected into a config dict, keyed by their own names, and
+        handed to `from_config`, which validates the keys and stores the values.
+
+        Args:
+            hidden_size, num_hidden_layers, num_attention_heads, vocab_size,
+            max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob,
+            attention_probs_dropout_prob, initializer_range: model hyper-parameters.
+            phase: running phase forwarded to `from_config` (default 'train').
+        """
+        config = {
+            'hidden_size': hidden_size,
+            'num_hidden_layers': num_hidden_layers,
+            'num_attention_heads': num_attention_heads,
+            'vocab_size': vocab_size,
+            'max_position_embeddings': max_position_embeddings,
+            # The parameter is named type_vocab_size; the previous code read an
+            # undefined name (sent_type_vocab_size) here.
+            'type_vocab_size': type_vocab_size,
+            'hidden_act': hidden_act,
+            'hidden_dropout_prob': hidden_dropout_prob,
+            'attention_probs_dropout_prob': attention_probs_dropout_prob,
+            'initializer_range': initializer_range,
+        }
+
+        # NOTE(review): from_config is declared as a classmethod, so the attributes it
+        # assigns land on the class rather than on this instance - confirm this is intended.
+        self.from_config(config, phase=phase)
+
+    @classmethod
+    def from_config(self, config, phase='train'):
+        
+        assert 'hidden_size' in config, "{} is required to initialize ERNIE".format('')
+        assert 'num_hidden_layers' in config, "{} is required to initialize ERNIE".format('num_hidden_layers')
+        assert 'num_attention_heads' in config, "{} is required to initialize ERNIE".format('num_attention_heads')
+        assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size')
+        assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings')
+        assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, \
+            "{} is required to initialize ERNIE".format('type_vocab_size')
+        assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act')
+        assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob')
+        assert 'attention_probs_dropout_prob' in config, \
+            "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
+        assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range')

        # self._is_training = phase == 'train' # backbone一般不用关心运行阶段，因为outputs在任何阶段基本不会变
        self._emb_size = config["hidden_size"]
@@ -52,9 +84,9 @@ class Model(backbone):

    @property
    def inputs_attr(self):
-        return {"token_ids": [[-1, -1, 1], 'int64'],
-                "position_ids": [[-1, -1, 1], 'int64'],
-                "segment_ids": [[-1, -1, 1], 'int64'],
+        return {"token_ids": [[-1, -1], 'int64'],
+                "position_ids": [[-1, -1], 'int64'],
+                "segment_ids": [[-1, -1], 'int64'],
                "input_mask": [[-1, -1, 1], 'float32']}

    @property
@@ -73,7 +105,7 @@ class Model(backbone):

        self._emb_dtype = 'float32'
        # padding id in vocabulary must be set to 0
-        emb_out = fluid.layers.embedding(
+        emb_out = fluid.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
@@ -84,14 +116,14 @@ class Model(backbone):
        # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
        embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
        
-        position_emb_out = fluid.layers.embedding(
+        position_emb_out = fluid.embedding(
            input=pos_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=scope_name+self._pos_emb_name, initializer=self._param_initializer))

-        sent_emb_out = fluid.layers.embedding(
+        sent_emb_out = fluid.embedding(
            sent_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
@@ -153,3 +185,9 @@ class Model(backbone):
        pass


+class Model(BERT):
+    """BERT wrapper for ConfigController.
+
+    Takes a (config, phase) pair instead of individual hyper-parameters and forwards
+    both to BERT.from_config.
+    """
+    def __init__(self, config, phase):
+        # NOTE(review): the return value of from_config is not used here; initialisation
+        # relies entirely on the side effects of that call - confirm this is intended.
+        BERT.from_config(config, phase=phase)
+
+
--- a/paddlepalm/backbone/ernie.py
+++ b/paddlepalm/backbone/ernie.py
@@ -24,32 +24,29 @@ from paddle import fluid
 from paddle.fluid import layers

 from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
-from paddlepalm.interface import backbone
+from paddlepalm.backbone.base_backbone import BaseBackbone


-class Model(backbone):
-
-    def __init__(self,
-                 config,
-                 phase):
+class ERNIE(BaseBackbone):
+    
+    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
+          max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
+          hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, phase='train'):

        # self._is_training = phase == 'train' # backbone一般不用关心运行阶段，因为outputs在任何阶段基本不会变

-        self._emb_size = config['hidden_size']
-        self._n_layer = config['num_hidden_layers']
-        self._n_head = config['num_attention_heads']
-        self._voc_size = config['vocab_size']
-        self._max_position_seq_len = config['max_position_embeddings']
-        if config['sent_type_vocab_size']:
-            self._sent_types = config['sent_type_vocab_size']
-        else:
-            self._sent_types = config['type_vocab_size']
+        self._emb_size = hidden_size
+        self._n_layer = num_hidden_layers
+        self._n_head = num_attention_heads
+        self._voc_size = vocab_size
+        self._max_position_seq_len = max_position_embeddings
+        self._sent_types = sent_type_vocab_size

-        self._task_types = config['task_type_vocab_size']
+        self._task_types = task_type_vocab_size

-        self._hidden_act = config['hidden_act']
-        self._prepostprocess_dropout = config['hidden_dropout_prob']
-        self._attention_dropout = config['attention_probs_dropout_prob']
+        self._hidden_act = hidden_act
+        self._prepostprocess_dropout = hidden_dropout_prob
+        self._attention_dropout = attention_probs_dropout_prob

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
@@ -58,15 +55,48 @@ class Model(backbone):
        self._emb_dtype = "float32"

        self._param_initializer = fluid.initializer.TruncatedNormal(
-            scale=config['initializer_range'])
+            scale=initializer_range)
+
+    @classmethod
+    def from_config(cls, config, phase='train'):
+        """Alternate constructor: build the backbone from a config mapping.
+
+        Every hyper-parameter key must be present in `config`; the sentence-type
+        vocabulary size may be given either as 'sent_type_vocab_size' or, as a fallback,
+        as 'type_vocab_size'.
+
+        Args:
+            config: mapping that holds the hyper-parameters.
+            phase: running phase passed through to the constructor (default 'train').
+
+        Returns:
+            A new instance of `cls`.
+        """
+        required_msg = "{} is required to initialize ERNIE"
+
+        # The checks run in the same order as the constructor arguments.
+        for key in ('hidden_size', 'num_hidden_layers', 'num_attention_heads',
+                    'vocab_size', 'max_position_embeddings'):
+            assert key in config, required_msg.format(key)
+        assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, \
+            required_msg.format('sent_type_vocab_size')
+        for key in ('task_type_vocab_size', 'hidden_act', 'hidden_dropout_prob',
+                    'attention_probs_dropout_prob', 'initializer_range'):
+            assert key in config, required_msg.format(key)
+
+        # Prefer the ERNIE-specific key and fall back to the generic one.
+        sent_type_key = 'sent_type_vocab_size' if 'sent_type_vocab_size' in config else 'type_vocab_size'
+
+        return cls(
+            config['hidden_size'],
+            config['num_hidden_layers'],
+            config['num_attention_heads'],
+            config['vocab_size'],
+            config['max_position_embeddings'],
+            config[sent_type_key],
+            config['task_type_vocab_size'],
+            config['hidden_act'],
+            config['hidden_dropout_prob'],
+            config['attention_probs_dropout_prob'],
+            config['initializer_range'],
+            phase=phase)

    @property
    def inputs_attr(self):
-        return {"token_ids": [[-1, -1, 1], 'int64'],
-                "position_ids": [[-1, -1, 1], 'int64'],
-                "segment_ids": [[-1, -1, 1], 'int64'],
+        return {"token_ids": [[-1, -1], 'int64'],
+                "position_ids": [[-1, -1], 'int64'],
+                "segment_ids": [[-1, -1], 'int64'],
                "input_mask": [[-1, -1, 1], 'float32'],
-                "task_ids": [[-1,-1, 1], 'int64']}
+                "task_ids": [[-1,-1], 'int64']}

    @property
    def outputs_attr(self):
@@ -85,7 +115,7 @@ class Model(backbone):
        task_ids = inputs['task_ids']

        # padding id in vocabulary must be set to 0
-        emb_out = fluid.layers.embedding(
+        emb_out = fluid.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
            dtype=self._emb_dtype,
@@ -96,14 +126,14 @@ class Model(backbone):
        # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
        embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
        
-        position_emb_out = fluid.layers.embedding(
+        position_emb_out = fluid.embedding(
            input=pos_ids,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(
                name=scope_name+self._pos_emb_name, initializer=self._param_initializer))

-        sent_emb_out = fluid.layers.embedding(
+        sent_emb_out = fluid.embedding(
            sent_ids,
            size=[self._sent_types, self._emb_size],
            dtype=self._emb_dtype,
@@ -113,7 +143,7 @@ class Model(backbone):
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

-        task_emb_out = fluid.layers.embedding(
+        task_emb_out = fluid.embedding(
            task_ids,
            size=[self._task_types, self._emb_size],
            dtype=self._emb_dtype,
@@ -173,3 +203,12 @@ class Model(backbone):

    def postprocess(self, rt_outputs):
        pass
+
+
+
+class Model(ERNIE):
+    # Backbone entry class exposed under the name `Model`, derived from ERNIE.
+    # NOTE(review): ERNIE is neither defined nor imported in the visible part
+    # of this file -- confirm that the name is in scope here.
+
+    def __init__(self, config, phase):
+        # NOTE(review): the value returned by ERNIE.from_config is discarded
+        # and no base-class __init__ is called; confirm that from_config is
+        # meant to work purely through side effects.
+        ERNIE.from_config(config, phase=phase)
+
+
--- a/paddlepalm/controller/__init__.py
+++ b/paddlepalm/controller/__init__.py
+
+from conf_controller import ConfigController
+from controller import Controller
--- a/paddlepalm/controller/conf_controller.py
+++ b/paddlepalm/controller/conf_controller.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import importlib
+import multiprocessing
+from paddle import fluid
+from paddle.fluid import layers
+import yaml
+import json
+import logging
+import time
+import numpy as np
+
+from paddlepalm.utils.saver import init_pretraining_params, init_checkpoint
+from paddlepalm.utils.config_helper import PDConfig
+from paddlepalm.utils.print_helper import print_dict
+from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn, create_joint_iterator_fn, merge_input_attrs 
+
+from paddlepalm.default_settings import *
+from paddlepalm.task_instance import TaskInstance, check_instances
+
+import Queue
+from threading import Thread
+
+DEBUG=False
+VERBOSE=0
+
+def _get_basename(f):
+    """Return path `f` with its last extension stripped."""
+    stem, _ext = os.path.splitext(f)
+    return stem
+
+
+def _get_suffix(f):
+    """Return the last extension of path `f` (dot included, '' when absent)."""
+    _stem, ext = os.path.splitext(f)
+    return ext
+
+
+def _parse_yaml(f, asdict=True, support_cmd_line=False):
+    """Load the YAML configuration stored at path `f`.
+
+    When `support_cmd_line` is set the file is read through
+    PDConfig(fuse_args=True); the built PDConfig object is returned, or its
+    dict form when `asdict` is true. Otherwise the file is parsed with the
+    safe YAML loader and the result is returned.
+
+    Raises:
+        AssertionError: if `f` does not exist.
+        NotImplementedError: for `asdict=False` without command-line support.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if support_cmd_line:
+        args = PDConfig(yaml_file=f, fuse_args=True)
+        args.build()
+        if asdict:
+            return args.asdict()
+        return args
+
+    if not asdict:
+        raise NotImplementedError()
+    with open(f, "r") as fin:
+        return yaml.load(fin, Loader=yaml.SafeLoader)
+
+
+def _parse_json(f, asdict=True, support_cmd_line=False):
+    """Load the JSON configuration stored at path `f`.
+
+    When `support_cmd_line` is set the file is read through PDConfig; the
+    built PDConfig object is returned, or its dict form when `asdict` is
+    true. Otherwise the file is decoded with json.load and returned.
+
+    Raises:
+        AssertionError: if `f` does not exist.
+        NotImplementedError: for `asdict=False` without command-line support.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if support_cmd_line:
+        args = PDConfig(json_file=f, fuse_args=support_cmd_line)
+        args.build()
+        if asdict:
+            return args.asdict()
+        return args
+
+    if not asdict:
+        raise NotImplementedError()
+    with open(f, "r") as fin:
+        return json.load(fin)
+            
+
+def _parse_list(string, astype=str):
+    """Turn a comma-separated string into a list of `astype` values.
+
+    A string without any comma is converted as a whole (it is not split on
+    whitespace); otherwise commas are treated like whitespace separators.
+    """
+    assert isinstance(string, str), "{} is not a string.".format(string)
+    if ',' in string:
+        tokens = string.replace(',', ' ').split()
+        return [astype(tok) for tok in tokens]
+    return [astype(string)]
+
+
+def _try_float(s):
+    """Return `s` converted to float, or `s` unchanged when it is not convertible.
+
+    Only conversion failures are absorbed; unrelated exceptions such as
+    KeyboardInterrupt are no longer swallowed.
+    """
+    try:
+        return float(s)
+    except (TypeError, ValueError, OverflowError):
+        return s
+
+
+def _check_conf(conf, checklist=None):
+    """Normalize a config dict and optionally validate required entries.
+
+    Args:
+        conf: the configuration dict; it is not modified.
+        checklist: optional iterable of (key, type) pairs. Every key must be
+            present in the normalized config with a value of the given type.
+
+    Returns:
+        A new dict in which every string value has gone through _try_float
+        (numeric-looking strings become floats, the rest stay strings).
+
+    Raises:
+        AssertionError: if `conf` is not a dict or a checklist entry fails.
+    """
+    assert isinstance(conf, dict), "{} is not a dict.".format(conf)
+    ret = {}
+    for k, v in conf.items():
+        if isinstance(v, str):
+            v = _try_float(v)
+        ret[k] = v
+    if checklist is not None:
+        for k, t in checklist:
+            assert k in ret, "required argument {} is NOT exist in config file.".format(k)
+            assert isinstance(ret[k], t), "value type of argument {} should be {}".format(k, t)
+    return ret
+
+
+# TODO: add a None mechanism so that hidden size, batch size and seqlen may be set to None
+def _check_io(in_attr, out_attr, strict=False, in_name="left", out_name="right"):
+    """Check that every entry required by `in_attr` is provided by `out_attr`.
+
+    A name missing from `out_attr` fails an assertion. A name whose attribute
+    differs raises ValueError when `strict` is set and is only logged as a
+    warning otherwise. `in_name` / `out_name` label the two sides in messages.
+    """
+    for name, attr in in_attr.items():
+        assert name in out_attr, in_name+': '+name+' not found in '+out_name
+        provided = out_attr[name]
+        if not (attr != provided):
+            continue
+        if strict:
+            raise ValueError(name+': shape or dtype not consistent!')
+        logging.warning('{}: shape or dtype not consistent!\n{}:\n{}\n{}:\n{}'.format(name, in_name, attr, out_name, provided))
+
+
+def _merge_conf(conf1, conf2, conf1_first=True, strict=False):
+    """Merge two config dicts into a new dict.
+
+    The lower-priority dict (conf2 when `conf1_first`, conf1 otherwise) is
+    copied and the entries of the other dict are written on top of it. With
+    `strict`, keys that are absent from the lower-priority dict are skipped.
+
+    Raises:
+        AssertionError: if either argument is not a dict.
+        Warning: if a key exists in both dicts with different values.
+
+    NOTE(review): the message reads like a notice ("has been updated"), yet
+    raising aborts the merge -- confirm whether a non-fatal warning was meant.
+    """
+    assert isinstance(conf1, dict), "{} is not a dict.".format(conf1)
+    assert isinstance(conf2, dict), "{} is not a dict.".format(conf2)
+    if conf1_first:
+        low_prio, new_conf = conf2, conf1
+    else:
+        low_prio, new_conf = conf1, conf2
+    base_conf = low_prio.copy()
+
+    for k, v in new_conf.items():
+        if k not in base_conf:
+            if strict:
+                continue
+        elif base_conf[k] != v:
+            raise Warning("value of argument {} has been updated to {}.".format(k, v))
+        base_conf[k] = v
+    return base_conf
+
+
+def _encode_inputs(inputs, scope_name, sep='/', cand_set=None):
+    """Prefix the keys of `inputs` with `scope_name` + `sep`.
+
+    Without `cand_set` every key is prefixed. With `cand_set`, an entry is
+    kept under its plain key and/or its prefixed key, whichever of the two
+    is contained in `cand_set`; entries matching neither are dropped.
+    """
+    outputs = {}
+    for k, v in inputs.items():
+        scoped = scope_name+sep+k
+        if cand_set is None:
+            outputs[scoped] = v
+            continue
+        if k in cand_set:
+            outputs[k] = v
+        if scoped in cand_set:
+            outputs[scoped] = v
+    return outputs
+
+
+def _decode_inputs(inputs, scope_name, sep='/', keep_unk_keys=True):
+    """Select the entries of `inputs` that are visible to scope `scope_name`.
+
+    Args:
+        inputs: dict keyed by plain names or by `<scope><sep><name>`.
+        scope_name: scope whose entries are extracted (prefix is stripped).
+        sep: separator between scope and name; it is now honoured for the
+            prefix match as well instead of a hard-coded '/'.
+        keep_unk_keys: when true, keys without any separator (unscoped
+            entries) are kept under their own name.
+
+    Returns:
+        A new dict with the selected entries.
+    """
+    prefix = scope_name + sep
+    outputs = {}
+    for name, value in inputs.items():
+        # unscoped vars (e.g. those for the backbone) are also available to tasks
+        if keep_unk_keys and sep not in name:
+            outputs[name] = value
+        # vars belonging to this scope
+        if name.startswith(prefix):
+            outputs[name[len(prefix):]] = value
+    return outputs
+
+
+def _init_env(use_gpu):
+    """Create the executor and report the device count.
+
+    On GPU the executor is placed on CUDA device 0 and the count comes from
+    fluid.core.get_cuda_device_count(); on CPU the count is the CPU_NUM
+    environment variable, falling back to multiprocessing.cpu_count().
+
+    Returns:
+        (fluid.Executor, int)
+    """
+    if not use_gpu:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    else:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    executor = fluid.Executor(place)
+    return executor, dev_count
+
+
+def _fit_attr(conf, fit_attr, strict=False):
+    """Apply the converters in `fit_attr` to the matching entries of `conf`.
+
+    `fit_attr` maps an argument name to a callable; `conf` is updated in
+    place and returned. A name missing from `conf` raises Exception when
+    `strict` is set and is ignored otherwise.
+    """
+    for key, caster in fit_attr.items():
+        if key in conf:
+            conf[key] = caster(conf[key])
+        elif strict:
+            raise Exception('Argument {} is required to create a controller.'.format(key))
+    return conf
+
+
+class ConfigController(object):
+
+    def __init__(self, config, task_dir='.', for_train=True):
+        """Build a controller from a global multi-task configuration.
+
+        Args:
+            config: (str|dict) a config dict or, when a string, the path of
+                a YAML config file.
+            task_dir: directory that holds one `<instance name>.yaml` file
+                for every name listed in `task_instance`.
+            for_train: when true, `_init_train` is run at the end of
+                construction.
+        """
+
+        self._for_train = for_train
+        assert isinstance(config, str) or isinstance(config, dict), "a config dict or config file path is required to create a Controller."
+
+        if isinstance(config, str):
+            mtl_conf = _parse_yaml(config, support_cmd_line=True)
+        else:
+            mtl_conf = config
+
+        mtl_conf = _check_conf(mtl_conf)
+        mtl_conf = _fit_attr(mtl_conf, REQUIRED_ARGS, strict=True)
+        mtl_conf = _fit_attr(mtl_conf, OPTIONAL_ARGS, strict=False)
+        # keep the global config: pred() looks up 'save_path' through self.mtl_conf
+        self.mtl_conf = mtl_conf
+
+        exe, dev_count = _init_env(use_gpu=mtl_conf.get('use_gpu', True))
+        self.exe = exe
+        self.dev_count = dev_count
+
+        print_dict(mtl_conf, title='global configuration')
+
+        # parse task instances and target tags
+        instnames = _parse_list(mtl_conf['task_instance'])
+        assert len(instnames) == len(set(instnames)), "repeated task_instance is NOT supported."
+        num_instances = len(instnames)
+        self.num_instances = num_instances
+
+        instname_to_conf = {}
+        instname_to_id = {}
+        for inst_id, instname in enumerate(instnames):
+            instpath = os.path.join(task_dir, instname+'.yaml')
+            conf = _parse_yaml(instpath, support_cmd_line=False)
+            conf = _check_conf(conf)
+            # the strict merge is only used for display
+            temp_conf = _merge_conf(mtl_conf, conf, strict=True)
+            print_dict(temp_conf, title='{} configuration'.format(instname))
+            conf = _merge_conf(mtl_conf, conf)
+
+            instname_to_conf[instname] = conf
+            instname_to_id[instname] = inst_id
+
+        # prepare backbone
+        if 'backbone_config_path' in mtl_conf:
+            bb_conf = _parse_json(mtl_conf['backbone_config_path'])
+            bb_conf = _merge_conf(mtl_conf, bb_conf)
+        else:
+            bb_conf = mtl_conf
+        print_dict(bb_conf, title='backbone configuration')
+
+        bb_name = mtl_conf['backbone']
+        bb_mod = importlib.import_module(BACKBONE_DIR + '.' + bb_name)
+        Backbone = getattr(bb_mod, 'Model')
+
+        # create task instances
+        instances = []
+        for name in instnames:
+            instances.append(TaskInstance(name, instname_to_id[name], instname_to_conf[name]))
+
+        check_instances(instances)
+
+        # parse target_tag
+        if 'target_tag' in mtl_conf:
+            target_tag = str(mtl_conf['target_tag'])
+            tags = _parse_list(target_tag, astype=int)
+            assert len(tags) == len(instnames), "number of target_tag is NOT consistent with that in task_instance."
+            for tag, inst in zip(tags, instances):
+                inst.is_target = tag
+        else:
+            tags = [i.is_target for i in instances]
+        num_targets = sum(tags)
+        num_auxes = num_instances - num_targets
+
+        # parse mix ratios
+        if 'mix_ratio' in mtl_conf:
+            mix_ratio = str(mtl_conf['mix_ratio'])
+            mrs = _parse_list(mix_ratio, astype=float)
+            assert len(mrs) == num_instances, "number of mix_ratios is NOT consistent with num_instances."
+        else:
+            mrs = [1.0] * num_instances
+
+        for mr, inst in zip(mrs, instances):
+            inst.mix_ratio = mr
+
+        # parse task layer reuse tags
+        if 'task_reuse_tag' in mtl_conf:
+            # str() keeps this consistent with target_tag / mix_ratio, which
+            # may also arrive as a non-string scalar from the config file
+            tags = _parse_list(str(mtl_conf['task_reuse_tag']), astype=int)
+            # one tag per instance is needed: tags is indexed for every
+            # instance in the pairing loop below
+            assert len(tags) == num_instances, 'number of reuse_tags is NOT consistent with number of instances.'
+        else:
+            # derive the tags by following each instance's task_reuse_scope
+            # chain until it loops back or reaches an already tagged instance
+            name_to_instance = {inst.name: inst for inst in instances}
+            tags = []
+            mapper = {}
+            for inst in instances:
+                history = set()
+                history.add(inst.name)
+                cur_inst = inst
+                while True:
+                    if cur_inst.task_reuse_scope in history:
+                        mapper[inst.name] = len(tags)
+                        break
+                    elif cur_inst.task_reuse_scope in mapper:
+                        mapper[inst.name] = mapper[cur_inst.task_reuse_scope]
+                        break
+                    else:
+                        cur_inst = name_to_instance[cur_inst.task_reuse_scope]
+                        history.add(cur_inst.name)
+
+                tags.append(mapper[inst.name])
+
+        # instances sharing a tag reuse the scope of the earliest one
+        for i in range(1, num_instances):
+            for j in range(i):
+                if tags[i] == tags[j]:
+                    assert instances[i].Paradigm == \
+                            instances[j].Paradigm, \
+                            "paradigm of reuse tasks should be consistent"
+                    instances[i].task_reuse_scope = instances[j].name
+                    break
+
+        self.instances = instances
+        self.mrs = mrs
+        self.Backbone = Backbone
+        self.bb_conf = bb_conf
+        self.bb_name = bb_name
+
+        self.has_init_train = False
+        self.has_init_pred = False
+
+        if self._for_train:
+            print("initialing for training...")
+            self._init_train()
+            self.has_init_train = True
+            
+    def _init_train(self):
+        """Build everything train() needs.
+
+        Picks the first target instance as the main task, creates readers and
+        paradigms for every instance and checks their I/O against the
+        backbone, builds the backbone and task layers in the default main
+        program (plus a separate prediction program for target instances),
+        defines the joint loss, optionally attaches the optimizer and EMA,
+        and finally runs the default startup program.
+        """
+        
+        instances = self.instances
+        Backbone = self.Backbone
+        bb_conf = self.bb_conf
+        bb_name = self.bb_name
+        dev_count = self.dev_count
+        num_instances = len(instances)
+        mrs = self.mrs
+
+        # set first_target/main task instance
+        main_inst = None
+        for inst in instances:
+            if inst.is_target:
+                main_inst = inst
+                inst.is_first_target = True
+                break
+        # NOTE(review): main_inst stays None when no instance is a target,
+        # in which case the attribute access below fails.
+        main_conf = main_inst.config
+        if not os.path.exists(main_conf['save_path']):
+            os.makedirs(main_conf['save_path'])
+            os.makedirs(os.path.join(main_conf['save_path'], 'ckpt'))
+        
+        # prepare backbone
+        train_backbone = Backbone(bb_conf, phase='train')
+        pred_backbone = Backbone(bb_conf, phase='pred')
+
+        # create reader, task
+        # then check i/o across reader, backbone and task_layer
+        task_attrs = []
+        pred_task_attrs = []
+        for inst in instances:
+            train_reader = inst.Reader(inst.config, phase='train')
+            inst.reader['train'] = train_reader
+            train_parad = inst.Paradigm(inst.config, phase='train', backbone_config=bb_conf)
+            inst.task_layer['train'] = train_parad
+            task_attr_from_reader = _encode_inputs(train_parad.inputs_attrs['reader'], inst.name)
+            task_attrs.append(task_attr_from_reader)
+
+            _check_io(train_backbone.inputs_attr, train_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.train')
+            _check_io(train_parad.inputs_attrs['reader'], train_reader.outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train')
+            _check_io(train_parad.inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name+'_backbone')
+
+            # target instances additionally get a pred-phase reader/paradigm
+            if inst.is_target:
+                if 'pred_file' not in inst.config:
+                    inst.config['pred_file'] = ''
+                pred_reader = inst.Reader(inst.config, phase='pred')
+                pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf)
+                inst.task_layer['pred'] = pred_parad
+                task_attr_from_reader = _encode_inputs(pred_parad.inputs_attrs['reader'], inst.name)
+                pred_task_attrs.append(task_attr_from_reader)
+                _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')
+
+        # merge reader input attrs from backbone and task_instances
+        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(train_backbone.inputs_attr, task_attrs)
+        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]
+
+        if DEBUG:
+            print('----- for debug -----')
+            print('joint input names:')
+            print(joint_input_names)
+            print('joint input shape and dtypes:')
+            print(joint_shape_and_dtypes)
+
+        # load data
+        for inst in instances:
+            print(inst.name+": preparing data...", end='')
+            inst.reader['train'].load_data()
+            print('ok!')
+
+        # merge dataset iterators and create net input vars
+        iterators = []
+        prefixes = []
+        # mrs is rebuilt here from the instances, replacing the copy of
+        # self.mrs taken at the top of the method
+        mrs = []
+        for inst in instances:
+            iterators.append(inst.reader['train'].iterator())
+            prefixes.append(inst.name)
+            mrs.append(inst.mix_ratio)
+
+        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE, return_type='dict')
+        self._joint_iterator_fn = joint_iterator_fn
+
+        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
+        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
+        # net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
+        # NOTE(review): `async` is a reserved word from Python 3.7 on, so this
+        # keyword argument only parses on older interpreters.
+        net_inputs = create_net_inputs(input_attrs, async=False)
+        self._net_inputs = net_inputs
+
+        # build backbone and task layers
+        train_prog = fluid.default_main_program()
+        train_init_prog = fluid.default_startup_program()
+        bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_')
+        assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())
+        
+        pred_prog = fluid.Program()
+        pred_init_prog = fluid.Program()
+
+        # the prediction backbone is built inside its own program pair
+        with fluid.program_guard(main_program = pred_prog, startup_program = pred_init_prog):
+            pred_net_inputs = create_net_inputs(pred_input_attrs)
+            pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
+
+        fluid.framework.switch_main_program(train_prog)
+        fluid.framework.switch_startup_program(train_init_prog)
+
+        task_output_vars = {}
+        for inst in instances:
+            task_inputs = {'backbone': bb_output_vars}
+            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
+            task_inputs['reader'] = task_inputs_from_reader
+
+            # the unique-name scope follows task_reuse_scope, not inst.name
+            scope = inst.task_reuse_scope + '/'
+            with fluid.unique_name.guard(scope):
+                output_vars = inst.build_task_layer(task_inputs, phase='train', scope=scope)
+                output_vars = {inst.name+'/'+key: val for key, val in output_vars.items()}
+                old = len(task_output_vars) # for debug
+                task_output_vars.update(output_vars)
+                assert len(task_output_vars) - old == len(output_vars) # for debug
+
+            # prepare predict vars for saving inference model
+            if inst.is_target:
+
+                with fluid.program_guard(pred_prog, pred_init_prog):
+                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
+                    inst.pred_input = cur_inputs
+                    pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
+                    scope = inst.task_reuse_scope + '/'
+                    with fluid.unique_name.guard(scope):
+                        inst.build_task_layer(pred_task_inputs, phase='pred', scope=scope)
+
+
+        bb_fetches = {k: v.name for k,v in bb_output_vars.items()}
+        task_fetches = {k: v.name for k,v in task_output_vars.items()}
+        # only task outputs (plus the task id) are fetched; bb_fetches is
+        # not used further in this method
+        fetches = task_fetches
+        fetches['__task_id'] = net_inputs['__task_id'].name
+
+        # compute loss
+        # presumably the one-hot task id keeps only the loss of the task that
+        # produced the current batch -- confirm the tensor shapes
+        task_id_var = net_inputs['__task_id']
+        task_id_vec = fluid.one_hot(task_id_var, num_instances)
+        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
+        loss = layers.reduce_sum(task_id_vec * losses)
+
+        main_reader = main_inst.reader['train']
+
+        # step budgets are all derived from the main task's example count
+        num_examples = main_reader.num_examples
+        for inst in instances:
+            max_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * (num_examples // main_conf['batch_size']  // dev_count))
+            if inst.is_target:
+                print('{}: expected train steps {}.'.format(inst.name, max_train_steps))
+            inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size']  // dev_count
+            inst.expected_train_steps = max_train_steps
+
+        global_max_train_steps = int(main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size']  // dev_count))
+        print('Estimated overall train steps {}.'.format(global_max_train_steps))
+
+        if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
+            warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion'])
+            print('Warmup steps: '+str(warmup_steps))
+        else:
+            warmup_steps = 0
+
+        # build optimizer
+        if 'optimizer' in main_conf:
+            optim_mod = importlib.import_module(OPTIMIZER_DIR + '.' + main_conf['optimizer'])
+            optimize = getattr(optim_mod, OPTIMIZE_METHOD)
+            # NOTE(review): max_train_steps is the value left over from the
+            # loop above (the last instance), not global_max_train_steps --
+            # confirm this is intended.
+            optimize(loss, main_conf, max_train_steps, warmup_steps, fluid.default_main_program())
+
+            loss.persistable = True
+            if main_conf.get('use_ema', False):
+                assert 'ema_decay' in main_conf, "ema_decay should be set when use_ema is enabled."
+                ema = fluid.optimizer.ExponentialMovingAverage(main_conf['ema_decay'])
+                ema.update()
+
+        # prepare for train
+        self.train_backbone = train_backbone
+        self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+        self.saver_program = fluid.default_main_program()
+
+        self.main_inst = main_inst
+        self.fetches = fetches
+        self.has_init_train = True
+        # NOTE(review): has_init_pred is set as well although this is the
+        # training setup -- confirm.
+        self.has_init_pred = True
+
+        self.exe.run(fluid.default_startup_program())
+        print("\nRandomly initialize parameters...\n")
+
+    def _init_pred(self, instance, infer_model_path):
+        """Prepare `instance` for prediction and return its loaded program.
+
+        Makes sure the instance has an existing `pred_output_path` directory
+        (default: `<save_path or '.'>/<instance name>`), creates its
+        pred-phase paradigm, loads the inference model found at
+        `infer_model_path` through `instance.load`, and creates a pred-phase
+        reader when the instance does not have one yet.
+        """
+        inst = instance
+        conf = inst.config
+        if 'pred_output_path' not in conf:
+            default_dir = os.path.join(conf.get('save_path', '.'), inst.name)
+            conf['pred_output_path'] = default_dir
+
+        output_dir = inst.config['pred_output_path']
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        pred_backbone = self.Backbone(self.bb_conf, phase='pred')
+        inst.task_layer['pred'] = inst.Paradigm(inst.config, phase='pred', backbone_config=self.bb_conf)
+        # the merged attributes are computed as before; they are not used
+        # further inside this method
+        pred_joint_input_names, pred_joint_shape_and_dtypes, name_to_position = merge_input_attrs(
+            pred_backbone.inputs_attr, inst.task_layer['pred'].inputs_attrs['reader'],
+            insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+
+        pred_prog = inst.load(infer_model_path)
+        if inst.reader['pred'] is None:
+            inst.reader['pred'] = inst.Reader(inst.config, phase='pred')
+        return pred_prog
+
+    def load_pretrain(self, pretrain_path=None):
+        """Load pretrained parameters (or a checkpoint) into the executor.
+
+        Args:
+            pretrain_path: directory of the pretrained model. When omitted,
+                the `pretrain_path` entry of the main task's config is used.
+
+        Raises:
+            AssertionError: if no path is given and none is configured.
+        """
+        if pretrain_path is None:
+            # `main_conf` is never assigned on this class; fall back to the
+            # main instance's config, which _init_train() stores as
+            # self.main_inst.
+            main_conf = getattr(self, 'main_conf', None)
+            if main_conf is None:
+                main_conf = self.main_inst.config
+            assert 'pretrain_path' in main_conf, "pretrain_path NOT set."
+            pretrain_path = main_conf['pretrain_path']
+
+        init_pretraining_params(
+            self.exe,
+            pretrain_path,
+            main_program=fluid.default_startup_program())
+
+
+    def train(self):
+        """Run the joint multi-task training loop.
+
+        Batches are pulled from the joint iterator and fed to the compiled
+        training program until every target instance reports `train_finish`.
+        Inference models and checkpoints are saved along the way according
+        to the configuration, and a final checkpoint is written at the end.
+        """
+
+        if not self.has_init_train:
+            self._init_train()
+            self.has_init_train = True
+
+        instances = self.instances
+        num_instances = self.num_instances
+        main_inst = self.main_inst
+        main_conf = main_inst.config
+
+        backbone = self.train_backbone
+        train_program = self.train_program
+        saver_program = self.saver_program
+        fetches = self.fetches
+
+        # target instances that expect no training steps are saved right away
+        # (the `finish` list itself is not read later in this method)
+        finish = []
+        for inst in instances:
+            if inst.is_target:
+                if inst.expected_train_steps > 0:
+                    finish.append(False)
+                else:
+                    finish.append(True)
+                    print(inst.name+': train finished!')
+                    inst.save()
+        
+        def train_finish():
+            # True once every target instance reports train_finish
+            for inst in instances:
+                if inst.is_target:
+                    if not inst.train_finish:
+                        return False
+            return True
+
+        def pack_multicard_feed(iterator, net_inputs, dev_count):
+            # take one batch per device and key each batch by the variable
+            # names of net_inputs; mask marks batches that are not padding
+            ret = []
+            mask = []
+            for i in range(dev_count):
+                temp = {}
+                content, flag = next(iterator)
+                for q, var in net_inputs.items():
+                    temp[var.name] = content[q]
+                ret.append(temp)
+                mask.append(1 if flag else 0)
+            return ret, mask
+
+        # do training
+        fetch_names, fetch_list = zip(*fetches.items())
+
+        main_step = 0 # only count for main task
+        global_step = 0 # count for all tasks
+        epoch = 0
+        time_begin = time.time()
+        backbone_buffer = []
+
+        def multi_dev_reader(reader, dev_count):
+            # generator yielding (batch, is_real) pairs; batches are grouped
+            # per dev_count by a background thread and passed through a queue
+            def worker(reader, dev_count, queue):
+                dev_batches = []
+                for index, data in enumerate(reader()):
+                    if len(dev_batches) < dev_count:
+                        dev_batches.append(data)
+                    if len(dev_batches) == dev_count:
+                        queue.put((dev_batches, 0))
+                        dev_batches = []
+                # For the prediction of the remained batches, pad more batches to
+                # the number of devices and the padded samples would be removed in
+                # prediction outputs.
+                if len(dev_batches) > 0:
+                    num_pad = dev_count - len(dev_batches)
+                    for i in range(len(dev_batches), dev_count):
+                        dev_batches.append(dev_batches[-1])
+                    queue.put((dev_batches, num_pad))
+                # None tells the consumer that the reader is exhausted
+                queue.put(None)
+
+            queue = Queue.Queue(dev_count*2)
+            p = Thread(
+                target=worker, args=(reader, dev_count, queue))
+            p.daemon = True
+            p.start()
+            while True:
+                ret = queue.get()
+                if ret is not None:
+                    batches, num_pad = ret
+                    queue.task_done()
+                    for batch in batches:
+                        flag = num_pad == 0
+                        if num_pad > 0:
+                            num_pad -= 1
+                        yield batch, flag
+                else:
+                    break
+            # NOTE(review): task_done() is never called for the None sentinel,
+            # so this join would presumably block if the reader ever ended --
+            # confirm.
+            queue.join()
+        
+        joint_iterator = multi_dev_reader(self._joint_iterator_fn, self.dev_count)
+        
+        while not train_finish():
+            # mask is returned by pack_multicard_feed but not used below
+            feed, mask = pack_multicard_feed(joint_iterator, self._net_inputs, self.dev_count)
+            rt_outputs = self.exe.run(train_program, feed=feed, fetch_list=fetch_list)
+            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
+            # the fetched task id tells which instance this step belonged to;
+            # with several values only the first one is used
+            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
+            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
+            cur_task = instances[rt_task_id]
+
+            # outputs without a '/' in their name are handed to the backbone
+            backbone_rt_outputs = {k:v for k,v in rt_outputs.items() if '/' not in k}
+            backbone_buffer.append(backbone.postprocess(backbone_rt_outputs))
+            
+            # outputs prefixed with the current task's name go to its task layer
+            task_rt_outputs = {k[len(cur_task.name+'/'):]: v for k,v in rt_outputs.items() if k.startswith(cur_task.name+'/')}
+            instances[rt_task_id].task_layer['train'].postprocess(task_rt_outputs)
+
+            global_step += 1
+            cur_task.cur_train_step += 1
+
+            cur_task_global_step = cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch
+            if cur_task.is_target and cur_task.save_infermodel_every_n_steps > 0 and cur_task_global_step % cur_task.save_infermodel_every_n_steps == 0:
+                cur_task.save(suffix='.step'+str(cur_task_global_step))
+
+            if global_step % main_conf.get('print_every_n_steps', 5) == 0:
+                loss = rt_outputs[cur_task.name+'/loss']
+                loss = np.mean(np.squeeze(loss)).tolist()
+
+                time_end = time.time()
+                time_cost = time_end - time_begin
+
+                print("Global step: {}. Task: {}, step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
+                       global_step, cur_task.name, cur_task.cur_train_step, cur_task.steps_pur_epoch, cur_task.cur_train_epoch,
+                       loss, main_conf.get('print_every_n_steps', 5) / time_cost))
+                time_begin = time.time()
+
+            # save the task once, at the step where it reaches its budget
+            if cur_task.train_finish and cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch == cur_task.expected_train_steps:
+                print(cur_task.name+': train finished!')
+                cur_task.save()
+
+            if 'save_ckpt_every_n_steps' in main_conf and global_step % main_conf['save_ckpt_every_n_steps'] == 0:
+                save_path = os.path.join(main_conf['save_path'], 'ckpt', 
+                                         "step_" + str(global_step))
+                fluid.io.save_persistables(self.exe, save_path, saver_program)
+                print('checkpoint has been saved at '+save_path)
+
+        # final checkpoint after the loop ends
+        save_path = os.path.join(main_conf['save_path'], 'ckpt',
+                                 "step_" + str(global_step))
+        fluid.io.save_persistables(self.exe, save_path, saver_program)
+        print('checkpoint has been saved at '+save_path)
+
+        print("ALL tasks train finished, exiting...")
+            
+    def pred(self, task_instance, inference_model_dir=None):
+        """Run prediction for the task instance named `task_instance`.
+
+        Args:
+            task_instance: name of one of the controller's instances.
+            inference_model_dir: directory of the inference model; when
+                omitted, `<save_path>/<task_instance>/infer_model` from the
+                global config is used.
+
+        Raises:
+            Exception: if the controller was built with for_train=True.
+            ValueError: if `task_instance` names no known instance.
+        """
+        if self._for_train:
+            raise Exception('This controller is a trainer. Please build a new controller with for_train=False for predicting.')
+
+        assert isinstance(task_instance, str)
+        if isinstance(inference_model_dir, str):
+            assert os.path.exists(inference_model_dir), inference_model_dir+" not found."
+        if inference_model_dir is None:
+            assert 'save_path' in self.mtl_conf, "one of the `inference_model_dir` and 'save_path' should be set to load inference model."
+            inference_model_dir = os.path.join(self.mtl_conf['save_path'], task_instance, 'infer_model')
+
+        # first instance carrying the requested name, if any
+        instance = next((cand for cand in self.instances if cand.name == task_instance), None)
+        if instance is None:
+            raise ValueError(task_instance + ' is not a valid task_instance.')
+
+        pred_prog = self._init_pred(instance, inference_model_dir)
+
+        inst = instance
+        print(inst.name+": loading data...")
+        pred_reader = inst.reader['pred']
+        pred_reader.load_data()
+        fetch_names, fetch_vars = inst.pred_fetch_list
+
+        print('predicting...')
+        mapper = {k:v for k,v in inst.pred_input}
+        buf = []
+        for batch in inst.reader['pred'].iterator():
+            scoped = _encode_inputs(batch, inst.name, cand_set=mapper)
+            feed = {mapper[k]: v for k,v in scoped.items()}
+
+            fetched = self.exe.run(pred_prog, feed, fetch_vars)
+            rt_outputs = dict(zip(fetch_names, fetched))
+            inst.postprocess(rt_outputs, phase='pred')
+
+        # epoch-level reader outputs are only collected when the paradigm asks for them
+        reader_outputs = inst.reader['pred'].get_epoch_outputs() if inst.task_layer['pred'].epoch_inputs_attrs else None
+        inst.epoch_postprocess({'reader':reader_outputs}, phase='pred')
+
+
+if __name__ == '__main__':
+    # Script entry point: build a controller from the config file given on
+    # the command line and train when the configuration asks for it.
+    assert len(sys.argv) == 2, "Usage: python mtl_controller.py <mtl_conf_path>"
+    conf_path = sys.argv[1]
+    # presumably removed so that later command-line parsing (PDConfig) does
+    # not see the positional argument -- confirm
+    del sys.argv[1]
+    # this module defines ConfigController; there is no `Controller` here
+    controller = ConfigController(conf_path)
+    # the controller has no `main_conf` attribute; the main task's config is
+    # reachable through main_inst, which is set while the controller is built
+    # for training
+    if controller.main_inst.config['do_train']:
+        controller.train()
+
+
+
+__all__ = ["ConfigController"]
+
+            
+
--- a/paddlepalm/controller/controller.py
+++ b/paddlepalm/controller/controller.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import importlib
+import multiprocessing
+from paddle import fluid
+from paddle.fluid import layers
+import yaml
+import json
+import logging
+import time
+import numpy as np
+
+from paddlepalm.utils.saver import init_pretraining_params, init_checkpoint
+from paddlepalm.utils.config_helper import PDConfig
+from paddlepalm.utils.print_helper import print_dict
+from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn, create_joint_iterator_fn, merge_input_attrs 
+
+from paddlepalm.default_settings import *
+from paddlepalm.task_instance import TaskInstance, check_instances
+
+# Module-level switches: DEBUG gates the extra printing in Controller.build_forward,
+# VERBOSE is forwarded as `verbose` to create_joint_iterator_fn.
+DEBUG=False
+VERBOSE=0
+
+def _get_basename(f):
+    """Return `f` with its last extension stripped (the root part of os.path.splitext)."""
+    root, _ext = os.path.splitext(f)
+    return root
+
+
+def _get_suffix(f):
+    """Return the last extension of `f`, including the leading dot ('' when there is none)."""
+    _root, ext = os.path.splitext(f)
+    return ext
+
+
+def _parse_yaml(f, asdict=True, support_cmd_line=False):
+    """Load the YAML config file `f`.
+
+    Args:
+        f: path of the YAML file; it must exist.
+        asdict: return a plain dict when True. Without command-line support
+            only the dict form is implemented.
+        support_cmd_line: when truthy, the file is loaded through PDConfig
+            (built with fuse_args=True) instead of the plain YAML loader.
+
+    Returns:
+        The parsed config as a dict, or the PDConfig object when
+        `support_cmd_line` is truthy and `asdict` is False.
+
+    Raises:
+        AssertionError: `f` does not exist.
+        NotImplementedError: `asdict` is False without command-line support.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if not support_cmd_line:
+        if not asdict:
+            raise NotImplementedError()
+        with open(f, "r") as fin:
+            return yaml.load(fin, Loader=yaml.SafeLoader)
+
+    args = PDConfig(yaml_file=f, fuse_args=True)
+    args.build()
+    if asdict:
+        return args.asdict()
+    return args
+
+
+def _parse_json(f, asdict=True, support_cmd_line=False):
+    """Load the JSON config file `f`.
+
+    Args:
+        f: path of the JSON file; it must exist.
+        asdict: return a plain dict when True. Without command-line support
+            only the dict form is implemented.
+        support_cmd_line: when truthy, the file is loaded through PDConfig
+            (the value is also forwarded as `fuse_args`).
+
+    Returns:
+        The parsed config as a dict, or the PDConfig object when
+        `support_cmd_line` is truthy and `asdict` is False.
+
+    Raises:
+        AssertionError: `f` does not exist.
+        NotImplementedError: `asdict` is False without command-line support.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if not support_cmd_line:
+        if not asdict:
+            raise NotImplementedError()
+        with open(f, "r") as fin:
+            return json.load(fin)
+
+    args = PDConfig(json_file=f, fuse_args=support_cmd_line)
+    args.build()
+    if asdict:
+        return args.asdict()
+    return args
+            
+
+def _parse_list(string, astype=str):
+    """Split a comma-separated string into a list of `astype` values.
+
+    A string without any comma is converted as a whole into a one-element
+    list. Otherwise commas are treated like whitespace and every
+    whitespace-separated token is converted with `astype`.
+    """
+    assert isinstance(string, str), "{} is not a string.".format(string)
+    if ',' in string:
+        tokens = string.replace(',', ' ').split()
+        return [astype(token) for token in tokens]
+    return [astype(string)]
+
+
+def _try_float(s):
+    """Return `float(s)` when `s` can be converted, otherwise `s` unchanged.
+
+    Only conversion failures are swallowed; anything else (for example
+    KeyboardInterrupt) propagates to the caller.
+    """
+    try:
+        return float(s)
+    except (TypeError, ValueError, OverflowError):
+        # TypeError: unsupported type, ValueError: malformed text,
+        # OverflowError: integer too large for a float.
+        return s
+
+
+def _check_conf(conf, checklist=None):
+    """Normalise a config dict and optionally validate required arguments.
+
+    Args:
+        conf: dict of config values. String values are passed through
+            `_try_float`; all other values are copied unchanged.
+        checklist: optional iterable of `(key, type)` pairs. Each key must be
+            present in the result and its value must be an instance of `type`.
+
+    Returns:
+        A new dict holding the normalised values; `conf` is not modified.
+
+    Raises:
+        AssertionError: `conf` is not a dict, a required key is missing, or a
+            value has the wrong type.
+    """
+    assert isinstance(conf, dict), "{} is not a dict.".format(conf)
+    ret = {}
+    for k, v in conf.items():
+        if isinstance(v, str):
+            v = _try_float(v)
+        ret[k] = v
+    if checklist is not None:
+        for k, t in checklist:
+            assert k in ret, "required argument {} is NOT exist in config file.".format(k)
+            assert isinstance(ret[k], t), "value type of argument {} should be {}".format(k, t)
+    return ret
+
+
+# TODO: add a None mechanism so that hidden size, batch size and seqlen may be set to None
+def _check_io(in_attr, out_attr, strict=False, in_name="left", out_name="right"):
+    """Verify that every entry required by `in_attr` is provided by `out_attr`.
+
+    Args:
+        in_attr: dict of required names to their attribute descriptions.
+        out_attr: dict of provided names to their attribute descriptions.
+        strict: raise on an attribute mismatch instead of logging a warning.
+        in_name: label for the consuming side, used in messages.
+        out_name: label for the providing side, used in messages.
+
+    Raises:
+        AssertionError: a required name is missing from `out_attr`.
+        ValueError: attributes differ and `strict` is True.
+    """
+    for name, required in in_attr.items():
+        assert name in out_attr, in_name+': '+name+' not found in '+out_name
+        provided = out_attr[name]
+        if required != provided:
+            if not strict:
+                logging.warning('{}: shape or dtype not consistent!\n{}:\n{}\n{}:\n{}'.format(name, in_name, required, out_name, provided))
+            else:
+                raise ValueError(name+': shape or dtype not consistent!')
+
+
+def _merge_conf(conf1, conf2, conf1_first=True, strict=False):
+    """Merge two config dicts into a new dict.
+
+    Args:
+        conf1: first config dict.
+        conf2: second config dict.
+        conf1_first: when True, values of `conf1` take precedence over those
+            of `conf2`; when False, `conf2` wins.
+        strict: when True, keys that exist only in the winning config are not
+            added to the result.
+
+    Returns:
+        A new dict; neither input is modified. A warning is logged for every
+        key whose value gets overridden.
+
+    Raises:
+        AssertionError: one of the inputs is not a dict.
+    """
+    assert isinstance(conf1, dict), "{} is not a dict.".format(conf1)
+    assert isinstance(conf2, dict), "{} is not a dict.".format(conf2)
+    base_conf = (conf2 if conf1_first else conf1).copy()
+    new_conf = conf1 if conf1_first else conf2
+
+    for k, v in new_conf.items():
+        if k in base_conf:
+            if base_conf[k] != v:
+                # Report the override and keep merging; raising here would make
+                # it impossible to merge configs that disagree on any key.
+                logging.warning("value of argument {} has been updated to {}.".format(k, v))
+        elif strict:
+            continue
+
+        base_conf[k] = v
+    return base_conf
+
+
+def _encode_inputs(inputs, scope_name, sep='/', cand_set=None):
+    """Re-key `inputs` with the `scope_name + sep` prefix.
+
+    Without `cand_set` every key is prefixed. With `cand_set`, an entry is kept
+    under its bare key and/or its prefixed key, depending on which of the two
+    is contained in `cand_set`; entries matching neither are dropped.
+    """
+    if cand_set is None:
+        return {scope_name+sep+key: value for key, value in inputs.items()}
+
+    encoded = {}
+    for key, value in inputs.items():
+        if key in cand_set:
+            encoded[key] = value
+        scoped_key = scope_name+sep+key
+        if scoped_key in cand_set:
+            encoded[scoped_key] = value
+    return encoded
+
+
+def _decode_inputs(inputs, scope_name, sep='/', keep_unk_keys=True):
+    """Select the entries of `inputs` that are visible to the scope `scope_name`.
+
+    Args:
+        inputs: dict whose keys are either bare names or `scope + sep + name`.
+        scope_name: scope (task instance name) to decode for.
+        sep: separator between scope and name. It is used both for detecting
+            unscoped keys and for matching/stripping the scope prefix.
+        keep_unk_keys: also keep keys that contain no separator (variables for
+            the backbone are available to tasks as well).
+
+    Returns:
+        A new dict with unscoped entries (if kept) and the entries of this
+        scope with the `scope_name + sep` prefix removed.
+    """
+    prefix = scope_name + sep
+    outputs = {}
+    for name, value in inputs.items():
+        # var for backbone are also available to tasks
+        if keep_unk_keys and sep not in name:
+            outputs[name] = value
+        # var for this inst
+        if name.startswith(prefix):
+            outputs[name[len(prefix):]] = value
+    return outputs
+
+
+def _init_env(use_gpu):
+    """Create the executor for the requested device kind.
+
+    Returns:
+        `(executor, dev_count)`. With `use_gpu` the executor is placed on
+        CUDA device 0 and `dev_count` is the CUDA device count; otherwise it
+        is placed on CPU and `dev_count` comes from the CPU_NUM environment
+        variable, falling back to the CPU count of the machine.
+    """
+    if not use_gpu:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    else:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    executor = fluid.Executor(place)
+    return executor, dev_count
+
+
+def _fit_attr(conf, fit_attr, strict=False):
+    """Convert selected entries of `conf` in place.
+
+    Args:
+        conf: config dict; it is modified and returned.
+        fit_attr: dict mapping a key to the callable used to convert
+            `conf[key]`.
+        strict: raise when a key of `fit_attr` is missing from `conf` instead
+            of skipping it.
+
+    Returns:
+        The same `conf` object with the converted values.
+    """
+    for key, convert in fit_attr.items():
+        if key in conf:
+            conf[key] = convert(conf[key])
+        elif strict:
+            raise Exception('Argument {} is required to create a controller.'.format(key))
+    return conf
+
+
+class Controller(object):
+
+    def __init__(self, tasks, mix_ratios=None, task_reuse_tag=None, use_gpu=True):
+        """Create the executor and wire mix ratios and reuse scopes onto `tasks`.
+
+        Args:
+            tasks: list of task instances. Each one receives its position as
+                id through `_set_id`.
+            mix_ratios: optional list of ratios, or a comma-separated string of
+                floats, one per task. Stored on each task as `mix_ratio`.
+            task_reuse_tag: optional list of tags, or a comma-separated string
+                of ints, one per task. When omitted, the tags are derived from
+                each task's `task_reuse_scope`. A task whose tag equals the tag
+                of an earlier task gets that earlier task's name as
+                `_task_reuse_scope`.
+            use_gpu: forwarded to `_init_env`.
+
+        Raises:
+            AssertionError: `mix_ratios` / `task_reuse_tag` has the wrong type
+                or does not contain one entry per task.
+        """
+
+        exe, dev_count = _init_env(use_gpu=use_gpu)
+        self.exe = exe
+        self.dev_count = dev_count
+        # build_forward reads the task list from self._tasks.
+        self._tasks = tasks
+        # random_init_params / load_pretrain_params / train test this flag.
+        self._init_finish = False
+
+        # parse task instances and target tags
+        for task_id, task in enumerate(tasks):
+            task._set_id(task_id)
+
+        # parse mix ratios
+        if mix_ratios is not None:
+            if isinstance(mix_ratios, str):
+                mix_ratios = _parse_list(mix_ratios, astype=float)
+            else:
+                assert isinstance(mix_ratios, list)
+            # checked for both input forms so zip() below cannot silently truncate
+            assert len(mix_ratios) == len(tasks), "number of mix_ratios is NOT consistent with num_instances."
+
+            for mr, t in zip(mix_ratios, tasks):
+                t.mix_ratio = mr
+
+        # parse task layer reuse tags
+        if task_reuse_tag is not None:
+            if isinstance(task_reuse_tag, str):
+                tags = _parse_list(task_reuse_tag, astype=int)
+            else:
+                assert isinstance(task_reuse_tag, list)
+                tags = task_reuse_tag
+            assert len(tags) == len(tasks), "number of task_reuse_tag is NOT consistent with num_tasks."
+
+        else:
+            # derive the tags by following each task's task_reuse_scope chain
+            name_to_instance = {t.name: t for t in tasks}
+            tags = []
+            mapper = {}
+            for inst in tasks:
+                history = set()
+                history.add(inst.name)
+                cur_inst = inst
+                while True:
+                    if cur_inst.task_reuse_scope in history:
+                        # the chain loops back on itself: open a new tag
+                        mapper[inst.name] = len(tags)
+                        break
+                    elif cur_inst.task_reuse_scope in mapper:
+                        # the chain reaches an already tagged task: share its tag
+                        mapper[inst.name] = mapper[cur_inst.task_reuse_scope]
+                        break
+                    else:
+                        cur_inst = name_to_instance[cur_inst.task_reuse_scope]
+                        history.add(cur_inst.name)
+
+                tags.append(mapper[inst.name])
+
+        # point every task at the first earlier task that carries the same tag
+        for i in range(1, len(tasks)):
+            for j in range(i):
+                if tags[i] == tags[j]:
+                    # assert tasks[i].tasktype == \
+                    #         instances[j].tasktype, \
+                    #         "paradigm of reuse tasks should be consistent"
+                    tasks[i]._task_reuse_scope = tasks[j].name
+                    break
+
+        # self.instances = instances
+        # self.mrs = mrs
+        # self.Backbone = Backbone
+        # self.bb_conf = bb_conf
+        # self.bb_name = bb_name
+
+        # self.has_init_train = False
+        # self.has_init_pred = False
+
+        # if self._for_train:
+        #     print("initialing for training...")
+        #     self._init_train()
+        #     self.has_init_train = True
+        #     
+    def build_forward(self, backbone, mask_task=[]):
+        """Build the joint training graph, the prediction graph of target tasks and the joint loss.
+
+        The method checks the reader / backbone / task-layer I/O attributes
+        against each other, merges the input attributes, creates the network
+        inputs, builds the backbone and every task layer, and finally combines
+        the per-task losses by weighting them with the one-hot encoded runtime
+        `__task_id`.
+
+        Args:
+            backbone: backbone object; only its `inputs_attr` is read here.
+            mask_task: not referenced in this method.
+
+        NOTE(review): `instances`, `save_path`, `train_backbone` and
+        `pred_backbone` are used below but never assigned in this method --
+        confirm where they are meant to come from. The method has no return
+        statement; `loss` and `fetches` stay local.
+        """
+        
+        task_instances = self._tasks
+        Backbone = self.Backbone
+        bb_conf = self.bb_conf
+        bb_name = self.bb_name
+        dev_count = self.dev_count
+        # NOTE(review): `instances` is not assigned in this method (only `task_instances` is).
+        num_instances = len(instances)
+        mrs = self.mrs
+
+        # set first_target/main task instance
+        main_inst = None
+        for inst in task_instances:
+            if inst.is_target:
+                main_inst = inst
+                inst._as_main = True
+                break
+        
+        # NOTE(review): `save_path` is not assigned in this method.
+        if save_path is not None and not os.path.exists(save_path):
+            os.makedirs(save_path)
+        
+        # create reader, task
+        # then check i/o across reader, backbone and task_layer
+        task_attrs = []
+        pred_task_attrs = []
+        for inst in task_instances:
+            task_attr_from_reader = _encode_inputs(inst._taskblock['train'].inputs_attrs['reader'], inst.name)
+            task_attrs.append(task_attr_from_reader)
+
+            # NOTE(review): this loop reads `inst._taskblock`, `inst.taskblock` and
+            # `inst.task_layer` -- confirm all three exist on the task instance.
+            _check_io(backbone.inputs_attr, inst._reader['train'].outputs_attr, in_name=bb_name+'_backbone', out_name='reader.train')
+            _check_io(inst.taskblock['train'].inputs_attrs['reader'], inst._reader['train'].outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train')
+            _check_io(inst._taskblock['train'].inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name+'_backbone')
+
+            # target tasks additionally get a pred-phase reader and paradigm
+            if inst.is_target:
+                if 'pred_file' not in inst.config:
+                    inst.config['pred_file'] = ''
+                pred_reader = inst.Reader(inst.config, phase='pred')
+                pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf)
+                inst.task_layer['pred'] = pred_parad
+                task_attr_from_reader = _encode_inputs(pred_parad.inputs_attrs['reader'], inst.name)
+                pred_task_attrs.append(task_attr_from_reader)
+                _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')
+
+        # merge reader input attrs from backbone and task_instances
+        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(train_backbone.inputs_attr, task_attrs)
+        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]
+
+        if DEBUG:
+            print('----- for debug -----')
+            print('joint input names:')
+            print(joint_input_names)
+            print('joint input shape and dtypes:')
+            print(joint_shape_and_dtypes)
+
+        # load data
+        for inst in instances:
+            print(inst.name+": preparing data...", end='')
+            inst.reader['train'].load_data()
+            print('ok!')
+
+        # merge dataset iterators and create net input vars
+        iterators = []
+        prefixes = []
+        # rebinds the local `mrs` taken from self.mrs above with the tasks' mix_ratio values
+        mrs = []
+        for inst in instances:
+            iterators.append(inst.reader['train'].iterator())
+            prefixes.append(inst.name)
+            mrs.append(inst.mix_ratio)
+
+        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE)
+
+        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
+        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
+        # NOTE(review): `async` is a reserved keyword from Python 3.7 on, so this
+        # keyword argument is a syntax error there -- confirm the target interpreter.
+        net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
+
+        # build backbone and task layers
+        train_prog = fluid.default_main_program()
+        train_init_prog = fluid.default_startup_program()
+        bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_')
+        assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())
+        
+        # the prediction graph lives in its own pair of programs
+        pred_prog = fluid.Program()
+        pred_init_prog = fluid.Program()
+
+        with fluid.program_guard(main_program = pred_prog, startup_program = pred_init_prog):
+            pred_net_inputs = create_net_inputs(pred_input_attrs)
+            pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
+
+        # switch back to the training programs before building the task layers
+        fluid.framework.switch_main_program(train_prog)
+        fluid.framework.switch_startup_program(train_init_prog)
+
+        task_output_vars = {}
+        for inst in instances:
+            task_inputs = {'backbone': bb_output_vars}
+            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
+            task_inputs['reader'] = task_inputs_from_reader
+
+            # the unique-name guard and the scope both use the task's reuse scope
+            scope = inst.task_reuse_scope + '/'
+            with fluid.unique_name.guard(scope):
+                output_vars = inst.build_task_layer(task_inputs, phase='train', scope=scope)
+                # prefix the outputs with the instance name so tasks cannot collide
+                output_vars = {inst.name+'/'+key: val for key, val in output_vars.items()}
+                old = len(task_output_vars) # for debug
+                task_output_vars.update(output_vars)
+                assert len(task_output_vars) - old == len(output_vars) # for debug
+
+            # prepare predict vars for saving inference model
+            if inst.is_target:
+
+                with fluid.program_guard(pred_prog, pred_init_prog):
+                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
+                    inst.pred_input = cur_inputs
+                    pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
+                    scope = inst.task_reuse_scope + '/'
+                    with fluid.unique_name.guard(scope):
+                        inst.build_task_layer(pred_task_inputs, phase='pred', scope=scope)
+
+
+        # `bb_fetches` is computed but not used afterwards in this method
+        bb_fetches = {k: v.name for k,v in bb_output_vars.items()}
+        task_fetches = {k: v.name for k,v in task_output_vars.items()}
+        fetches = task_fetches
+        fetches['__task_id'] = net_inputs['__task_id'].name
+
+        # compute loss
+        task_id_var = net_inputs['__task_id']
+        task_id_vec = layers.one_hot(task_id_var, num_instances)
+        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
+        # weight the concatenated losses with the one-hot task id and sum them up
+        loss = layers.reduce_sum(task_id_vec * losses)
+
+    def init_train(self, basetask, num_epochs, ):
+        """Estimate the expected train steps of every task and of the whole run.
+
+        For each instance, `expected_train_steps` and `steps_pur_epoch` are
+        stored on the instance; the overall estimate is only printed.
+
+        Returns:
+            `(loss, max_train_steps)`, where `max_train_steps` is the value
+            computed for the last instance of the loop.
+
+        NOTE(review): `main_inst`, `instances`, `main_conf`, `dev_count`, `mrs`
+        and `loss` are used below but not assigned in this method, and the
+        parameters `basetask` and `num_epochs` are not referenced -- confirm
+        the intended wiring.
+        """
+        main_reader = main_inst.reader['train']
+
+        num_examples = main_reader.num_examples
+        for inst in instances:
+            # steps are derived from the main reader's example count, scaled by the task's mix ratio
+            max_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * (num_examples // main_conf['batch_size']  // dev_count))
+            if inst.is_target:
+                print('{}: expected train steps {}.'.format(inst.name, max_train_steps))
+            inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size']  // dev_count
+            inst.expected_train_steps = max_train_steps
+
+        global_max_train_steps = int(main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size']  // dev_count))
+        print('Estimated overall train steps {}.'.format(global_max_train_steps))
+
+        # if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
+        #     warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion'])
+        #     print('Warmup steps: '+str(warmup_steps))
+        # else:
+        #     warmup_steps = 0
+
+        return loss, max_train_steps
+
+
+    def build_backward(self, optimizer, use_ema=False, ema_decay=0.9999):
+        """Attach the optimizer to the default main program.
+
+        Args:
+            optimizer: object whose `optimize(program)` is called with the
+                default main program.
+            use_ema: additionally create an ExponentialMovingAverage with
+                `ema_decay` and call its `update()`.
+            ema_decay: decay passed to the moving average.
+        """
+        main_prog = fluid.default_main_program()
+        optimizer.optimize(main_prog)
+
+        if not use_ema:
+            return
+
+        moving_average = fluid.optimizer.ExponentialMovingAverage(ema_decay)
+        moving_average.update()
+
+    def random_init_params(self):
+        """Run the default startup program to randomly initialise the parameters.
+
+        On the first call (while `self._init_finish` is falsy) the compiled
+        training program and the saver program are prepared as well.
+
+        NOTE(review): `loss` is not assigned in this method -- confirm where
+        the loss variable is meant to come from.
+        """
+        if not self._init_finish:
+            # prepare for train
+            self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+            self.saver_program = fluid.default_main_program()
+            self._init_finish = True
+
+        print("\nRandomly initialize parameters...\n")
+        self.exe.run(fluid.default_startup_program())
+
+    def load_pretrain_params(self, pretrain_model_path=None):
+        """Initialise parameters from a pretrained model via `init_pretraining_params`.
+
+        Args:
+            pretrain_model_path: location of the pretrained model. When None,
+                `self.main_conf['pretrain_model_path']` is used.
+
+        On the first call (while `self._init_finish` is falsy) the compiled
+        training program and the saver program are prepared afterwards.
+
+        NOTE(review): `loss` is not assigned in this method, and the startup
+        program is passed as `main_program` -- confirm both are intended.
+        """
+        # load pretrain model (or ckpt)
+        if pretrain_model_path is None:
+            assert 'pretrain_model_path' in self.main_conf, "pretrain_model_path NOT set."
+            pretrain_model_path = self.main_conf['pretrain_model_path']
+
+        init_pretraining_params(
+            self.exe,
+            pretrain_model_path,
+            main_program=fluid.default_startup_program())
+
+        if not self._init_finish:
+            self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+            self.saver_program = fluid.default_main_program()
+            self._init_finish = True
+
+    def load_infermodel(self, instance, infer_model_path):
+        """Prepare `instance` for prediction and load its inference model.
+
+        Args:
+            instance: task instance to predict with.
+            infer_model_path: passed to `instance.load`.
+
+        Returns:
+            Whatever `instance.load(infer_model_path)` returns (named
+            `pred_prog` here).
+        """
+        inst = instance
+        # default the prediction output directory to <save_path or '.'>/<instance name>
+        if 'pred_output_path' not in inst.config:
+            inst.config['pred_output_path'] = os.path.join(inst.config.get('save_path', '.'), inst.name)
+
+        if not os.path.exists(inst.config['pred_output_path']):
+            os.makedirs(inst.config['pred_output_path'])
+
+        pred_backbone = self.Backbone(self.bb_conf, phase='pred')
+        pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=self.bb_conf)
+        inst.task_layer['pred'] = pred_parad
+        # the three results of this call are not used afterwards in this method
+        pred_joint_input_names, pred_joint_shape_and_dtypes, name_to_position = merge_input_attrs(
+            pred_backbone.inputs_attr, inst.task_layer['pred'].inputs_attrs['reader'], 
+            insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+
+        pred_prog = inst.load(infer_model_path)
+        # create the pred reader only when the instance does not have one yet
+        if inst.reader['pred'] is None:
+            pred_reader = inst.Reader(inst.config, phase='pred')
+            inst.reader['pred'] = pred_reader
+        return pred_prog
+
+    def train(self, num_epochs):
+        """Run the multi-task training loop until every target task reports `train_finish`.
+
+        Each step runs the compiled training program once, dispatches the
+        fetched outputs to the backbone and to the task selected by the
+        runtime `__task_id`, and handles periodic printing and saving.
+
+        Args:
+            num_epochs: not referenced in this method.
+
+        Raises:
+            Exception: parameters have not been initialised yet
+                (`self._init_finish` is falsy).
+
+        NOTE(review): `self.instances`, `self.num_instances`, `self.main_inst`,
+        `self.train_backbone` and `self.fetches` are read here -- confirm they
+        are set before this method is called.
+        """
+
+        if not self._init_finish:
+            raise Exception('params has not been initialized! Please init params with random_init_params or load_pretrain_params.')
+
+        instances = self.instances
+        num_instances = self.num_instances
+        main_inst = self.main_inst
+        main_conf = main_inst.config
+
+        backbone = self.train_backbone
+        train_program = self.train_program
+        saver_program = self.saver_program
+        fetches = self.fetches
+
+        # target tasks that expect no train steps are saved right away;
+        # the `finish` list itself is not read afterwards
+        finish = []
+        for inst in instances:
+            if inst.is_target:
+                if inst.expected_train_steps > 0:
+                    finish.append(False)
+                else:
+                    finish.append(True)
+                    print(inst.name+': train finished!')
+                    inst.save()
+        
+        def train_finish():
+            # True only when every target task reports train_finish
+            for inst in instances:
+                if inst.is_target:
+                    if not inst.train_finish:
+                        return False
+            return True
+
+        # do training
+        fetch_names, fetch_list = zip(*fetches.items())
+
+        main_step = 0 # only count for main task
+        global_step = 0 # count for all tasks
+        epoch = 0
+        time_begin = time.time()
+        backbone_buffer = []
+        while not train_finish():
+            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
+            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
+            # the fetched task id may come back as a list; its first element is used then
+            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
+            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
+            cur_task = instances[rt_task_id]
+
+            # keys without '/' are treated as backbone outputs
+            backbone_rt_outputs = {k:v for k,v in rt_outputs.items() if '/' not in k}
+            backbone_buffer.append(backbone.postprocess(backbone_rt_outputs))
+            
+            # outputs of the current task, with the '<task name>/' prefix stripped
+            task_rt_outputs = {k[len(cur_task.name+'/'):]: v for k,v in rt_outputs.items() if k.startswith(cur_task.name+'/')}
+            instances[rt_task_id].task_layer['train'].postprocess(task_rt_outputs)
+
+            global_step += 1
+            cur_task.cur_train_step += 1
+
+            if cur_task.save_infermodel_every_n_steps > 0 and cur_task.cur_train_step % cur_task.save_infermodel_every_n_steps == 0:
+                cur_task.save(suffix='.step'+str(cur_task.cur_train_step))
+
+            if global_step % main_conf.get('print_every_n_steps', 5) == 0:
+                loss = rt_outputs[cur_task.name+'/loss']
+                loss = np.mean(np.squeeze(loss)).tolist()
+
+                time_end = time.time()
+                time_cost = time_end - time_begin
+
+                print("Global step: {}. Task: {}, step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
+                       global_step, cur_task.name, cur_task.cur_train_step, cur_task.steps_pur_epoch, cur_task.cur_train_epoch,
+                       loss, main_conf.get('print_every_n_steps', 5) / time_cost))
+                time_begin = time.time()
+
+            # save the task once, at the step where it reaches its expected number of train steps
+            if cur_task.train_finish and cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch == cur_task.expected_train_steps:
+                print(cur_task.name+': train finished!')
+                cur_task.save()
+
+            if 'save_every_n_steps' in main_conf and global_step % main_conf['save_every_n_steps'] == 0:
+                save_path = os.path.join(main_conf['save_path'],
+                                         "step_" + str(global_step))
+                fluid.io.save_persistables(self.exe, save_path, saver_program)
+
+        print("ALL tasks train finished, exiting...")
+            
+    def pred(self, task_instance, inference_model_dir=None):
+        """Run prediction for the task instance named `task_instance`.
+
+        Args:
+            task_instance: name of the task instance to predict with.
+            inference_model_dir: directory of the inference model. When None,
+                `<mtl_conf['save_path']>/<task_instance>/infer_model` is used.
+
+        Raises:
+            Exception: the controller was built for training
+                (`self._for_train` is truthy).
+            ValueError: no instance is named `task_instance`.
+
+        NOTE(review): `self._for_train`, `self.mtl_conf`, `self.instances` and
+        `self._init_pred` are used here but are not defined in the visible
+        part of this class -- confirm they exist.
+        """
+        if self._for_train:
+            raise Exception('This controller is a trainer. Please build a new controller with for_train=False for predicting.')
+
+        assert isinstance(task_instance, str)
+        if isinstance(inference_model_dir, str):
+            assert os.path.exists(inference_model_dir), inference_model_dir+" not found."
+        # if not self.has_init_pred and inference_model_dir is None:
+        #     raise ValueError('infer_model_path is required for prediction.')
+        if inference_model_dir is None:
+            assert 'save_path' in self.mtl_conf, "one of the `inference_model_dir` and 'save_path' should be set to load inference model."
+            inference_model_dir = os.path.join(self.mtl_conf['save_path'], task_instance, 'infer_model')
+
+        # look up the instance by name
+        instance = None
+        for inst in self.instances:
+            if inst.name == task_instance:
+                instance = inst
+                break
+
+        if instance is None:
+            raise ValueError(task_instance + ' is not a valid task_instance.')
+
+        pred_prog = self._init_pred(instance, inference_model_dir)
+                
+        inst = instance
+        print(inst.name+": loading data...")
+        inst.reader['pred'].load_data()
+        fetch_names, fetch_vars = inst.pred_fetch_list
+
+        print('predicting...')
+        # presumably inst.pred_input iterates as (input name, variable name) pairs -- verify against TaskInstance
+        mapper = {k:v for k,v in inst.pred_input}
+        # `buf` is not used afterwards in this method
+        buf = []
+        for feed in inst.reader['pred'].iterator():
+            # keep only the inputs known to `mapper`, then re-key them with the mapped names
+            feed = _encode_inputs(feed, inst.name, cand_set=mapper)
+            feed = {mapper[k]: v for k,v in feed.items()}
+
+            rt_outputs = self.exe.run(pred_prog, feed, fetch_vars)
+            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
+            inst.postprocess(rt_outputs, phase='pred')
+        # epoch-level reader outputs are only collected when the pred task layer declares epoch inputs
+        if inst.task_layer['pred'].epoch_inputs_attrs:
+            reader_outputs = inst.reader['pred'].get_epoch_outputs()
+        else:
+            reader_outputs = None
+        inst.epoch_postprocess({'reader':reader_outputs}, phase='pred')
+
+
+# Script entry point: expects exactly one command-line argument, the config path.
+# NOTE(review): Controller.__init__ in this file takes `tasks` as its first
+# parameter and Controller.train requires `num_epochs`, which does not match
+# the calls below -- confirm this entry point is still meant to work.
+if __name__ == '__main__':
+    assert len(sys.argv) == 2, "Usage: python mtl_controller.py <mtl_conf_path>"
+    conf_path = sys.argv[1]
+    # Remove the consumed argument from sys.argv before the controller is built.
+    del sys.argv[1]
+    controller = Controller(conf_path)
+    if controller.main_conf['do_train']:
+        controller.train()
+
+
+
+
+            
+
--- a/paddlepalm/distribute/__init__.py
+++ b/paddlepalm/distribute/__init__.py
+# Device counts shared by the distribute helpers.
+from paddle import fluid
+import os
+import multiprocessing
+
+# number of CUDA devices reported by fluid
+gpu_dev_count = int(fluid.core.get_cuda_device_count())
+# CPU_NUM environment variable, falling back to the CPU count of the machine
+cpu_dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+# Explicit relative import: the implicit form `from reader import ...` only
+# resolves on Python 2. This line must stay below the two counts above because
+# reader.py imports them from this package.
+from .reader import yield_pieces, data_feeder
+
--- a/paddlepalm/distribute/reader.py
+++ b/paddlepalm/distribute/reader.py
+
+from . import gpu_dev_count, cpu_dev_count
+import Queue
+from threading import Thread
+
+# Number of devices to feed: the GPU count when at least one GPU is reported, else the CPU count.
+dev_count = gpu_dev_count if gpu_dev_count > 0 else cpu_dev_count
+
+def yield_pieces(data, distribute_strategy, batch_size):
+    """Cut one batch into per-device pieces and yield them one device at a time.
+
+    Args:
+        data: list of tensors, or dict mapping names to tensors.
+        distribute_strategy: container of the same type and length as `data`
+            holding one strategy string per tensor (case-insensitive,
+            surrounding whitespace ignored):
+            's' / 'split': the device gets the next `batch_size // dev_count`
+                entries along dim 0;
+            'u' / 'unstack': the k-th device gets the k-th entry along dim 0;
+            'c' / 'copy': every device gets the whole tensor.
+        batch_size: size of the full batch; must be a multiple of `dev_count`.
+
+    Yields:
+        One piece per device: a dict with the keys of `data` when `data` is a
+        dict (including dict subclasses), otherwise a list in the order of
+        `data`. Generation stops early, after printing a warning, when a
+        split/unstack tensor has no entries left for the next device.
+
+    Raises:
+        AssertionError: invalid `batch_size`, mismatching containers, or an
+            unstack tensor longer than `dev_count`.
+        NotImplementedError: unknown strategy string.
+    """
+    assert batch_size % dev_count == 0, "batch_size need to be integer times larger than dev_count."
+
+    assert type(data) == type(distribute_strategy), [type(data), type(distribute_strategy)]
+    assert len(data) == len(distribute_strategy), [len(data), len(distribute_strategy)]
+    if isinstance(data, dict):
+        keys = list(data.keys())
+        data_list = [data[i] for i in keys]
+        ds_list = [distribute_strategy[i] for i in keys]
+    else:
+        assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors."
+        keys = None
+        data_list = data
+        ds_list = distribute_strategy
+
+    stride = batch_size // dev_count
+    # `p` is the exclusive end offset of the current device's slice
+    p = stride
+    while p <= batch_size:
+        temp = []
+        for d, s in zip(data_list, ds_list):
+            s = s.strip().lower()
+            if s in ('s', 'split'):
+                if p - stride >= len(d):
+                    print('WARNING: no more examples to feed empty devices')
+                    return
+                temp.append(d[p-stride:p])
+            elif s in ('u', 'unstack'):
+                assert len(d) <= dev_count, 'Tensor size on dim 0 must be less equal to dev_count when unstack is applied.'
+                if p//stride > len(d):
+                    print('WARNING: no more examples to feed empty devices')
+                    return
+                temp.append(d[p//stride-1])
+            elif s in ('c', 'copy'):
+                temp.append(d)
+            else:
+                raise NotImplementedError()
+
+        p += stride
+        # same dict test as in the setup above, so dict subclasses yield dicts too
+        if keys is not None:
+            yield dict(zip(keys, temp))
+        else:
+            yield temp
+
+def data_feeder(reader, postprocess_fn=None, prefetch_steps=2):
+    """Group the batches produced by `reader()` into one batch per device.
+
+    A daemon thread reads batches and pushes groups of `dev_count` batches
+    into a bounded queue; this generator pops the groups and yields them.
+
+    Args:
+        reader: callable returning an iterable of batches.
+        postprocess_fn: applied to every batch before it is yielded; defaults
+            to the identity.
+        prefetch_steps: the queue holds at most `dev_count * prefetch_steps`
+            groups.
+
+    Yields:
+        `(batch_buf, flag_buf)`: the post-processed batches of one group and,
+        per batch, a flag that is False for the padding batches appended to an
+        incomplete final group.
+    """
+
+    if postprocess_fn is None:
+        def postprocess_fn(batch):
+            return batch
+
+    def worker(reader, dev_count, queue):
+        pending = []
+        for _, data in enumerate(reader()):
+            if len(pending) < dev_count:
+                pending.append(data)
+            if len(pending) == dev_count:
+                queue.put((pending, 0))
+                pending = []
+        # For the prediction of the remained batches, pad more batches to
+        # the number of devices and the padded samples would be removed in
+        # prediction outputs.
+        if len(pending) > 0:
+            num_pad = dev_count - len(pending)
+            pending.extend([pending[-1]] * num_pad)
+            queue.put((pending, num_pad))
+        # None marks the end of the stream
+        queue.put(None)
+
+    queue = Queue.Queue(dev_count*prefetch_steps)
+    producer = Thread(
+        target=worker, args=(reader, dev_count, queue))
+    producer.daemon = True
+    producer.start()
+    while True:
+        item = queue.get()
+        queue.task_done()
+        if item is None:
+            break
+        batches, num_pad = item
+        # the last `num_pad` entries of the group are padding
+        num_real = len(batches) - num_pad
+        flag_buf = [idx < num_real for idx in range(len(batches))]
+        batch_buf = [postprocess_fn(batch) for batch in batches]
+        yield batch_buf, flag_buf
+    queue.join()
+
+
--- a/paddlepalm/downloader.py
+++ b/paddlepalm/downloader.py
+from _downloader import *
+
--- a/paddlepalm/head/__init__.py
+++ b/paddlepalm/head/__init__.py
+
+from cls import Classify
+# from match import Match
+# from mrc import MRC
+# from mlm import MaskLM
--- a/paddlepalm/head/base_head.py
+++ b/paddlepalm/head/base_head.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class BaseHead(object):
+    """Base class of task heads (task layers); subclasses override the hooks below."""
+
+    def __init__(self, config, phase, backbone_config):
+        """
+            config: dict. Hyper-parameters defined in the task instance and the multi-task config file.
+            phase: str. Running phase; train and predict are currently supported.
+            """
+        self._stop_gradient = {}
+        self._prog = None
+
+    @property
+    def inputs_attrs(self):
+        """Describe the attributes of the input objects that the task_layer reads from input object sets such as reader and backbone. The first-level key is the name of an object set, e.g. backbone or reader (more flexible inputs will be supported later); the second-level key describes each object in the set, including its name, shape and dtype. When an object is a scalar type (str, int, float, ...), its shape is the empty list []; when a dimension of an object has variable length, the corresponding dimension in shape is set to -1.
+        Return:
+            dict. Attribute description of every object set and its input objects."""
+        raise NotImplementedError()
+
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the task's output objects, including name, shape and dtype. Output objects are added to fetch_list, so their runtime results are obtained at every train/inference step and passed to the postprocess method for the user to handle.
+        When an object is a scalar type (str, int, float, ...), its shape is the empty list []; when a dimension of an object has variable length, the corresponding dimension in shape is set to -1.
+        Return:
+            dict. Attribute description of every output object. Note that in the training phase an output object named loss must be included.
+            """
+
+        raise NotImplementedError()
+
+    @property
+    def epoch_inputs_attrs(self):
+        """Attributes of epoch-level inputs; empty by default."""
+        return {}
+
+    # def stop_gradient(source, inputs):
+    #     # if self._inputs is None:
+    #     #     raise Exception('You need to build this head first before stop gradient.')
+    #     self._inputs = inputs
+    #     for name, var in self._inputs[source].items():
+    #         # cur_block = self._prog.current_block()
+    #         var = fluid.layers.assign(var)
+    #         var.stop_gradient = True
+    #         self._inputs[name] = var
+    #     return self._inputs
+
+    def build(self, inputs, scope_name=""):
+        """Build the computation graph of the task_layer: map the static-graph Variables from each object set, as described by inputs_attrs, to static-graph Variable outputs as described by outputs_attr.
+        Args:
+            inputs: dict. Mapping from the object names in inputs_attrs to graph Variables; inputs contains at least the objects defined in inputs_attr.
+        Return:
+           The graph variables to output. They are added to fetch_list, so their runtime results are obtained at every train/inference step and passed to the postprocess method for the user to handle.
+
+        """
+        raise NotImplementedError()
+        
+
+    def postprocess(self, rt_outputs):
+        """Post-process the runtime results of the task_layer for the current batch after every train or inference step. Note that besides the outputs of the build method, rt_outputs automatically contains the computed loss."""
+        pass
+        
+    def epoch_postprocess(self, post_inputs):
+        """Epoch-level post-processing hook; does nothing by default."""
+        pass
+
--- a/paddlepalm/head/cls.py
+++ b/paddlepalm/head/cls.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid import layers
+from paddlepalm.head.base_head import BaseHead
+import numpy as np
+import os
+
+
+class Classify(BaseHead):
+    """Classification head.
+
+    Feeds the backbone's sentence embedding through one fully connected
+    layer. In the train phase it outputs a mean softmax cross-entropy loss;
+    otherwise it outputs the logits and collects the argmax predictions.
+    """
+
+    def __init__(self, num_classes, input_dim, dropout_prob=0.0, \
+                 param_initializer_range=0.02, phase='train', pred_output_path=None):
+        """
+        Args:
+            num_classes: size of the output layer.
+            input_dim: declared width of the sentence embedding.
+            dropout_prob: dropout probability; forced to 0.0 unless phase is
+                'train'.
+            param_initializer_range: scale of the TruncatedNormal initializer
+                used for the output weights.
+            phase: 'train' enables the loss branch; any other value enables
+                the prediction branch.
+            pred_output_path: directory where epoch_postprocess writes
+                predictions.json. It is joined as-is, so it has to exist
+                already. Only needed outside the train phase.
+        """
+        # Set up the attributes every head shares (_stop_gradient, _prog).
+        # The base constructor does not read its arguments.
+        BaseHead.__init__(self, None, phase, None)
+
+        self._is_training = phase == 'train'
+        self._hidden_size = input_dim
+
+        self.num_classes = num_classes
+
+        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=param_initializer_range)
+        self._preds = []
+        # Previously never initialised, so epoch_postprocess failed with an
+        # AttributeError before reaching its own ValueError check.
+        self._pred_output_path = pred_output_path
+
+    @property
+    def inputs_attrs(self):
+        """Declare the inputs: sentence_embedding always, label_ids when training."""
+        reader = {}
+        bb = {"sentence_embedding": [[-1, self._hidden_size], 'float32']}
+        if self._is_training:
+            reader["label_ids"] = [[-1], 'int64']
+        return {'reader': reader, 'backbone': bb}
+
+    @property
+    def outputs_attrs(self):
+        """Declare the outputs: loss when training, logits otherwise."""
+        if self._is_training:
+            return {'loss': [[1], 'float32']}
+        else:
+            return {'logits': [[-1, self.num_classes], 'float32']}
+
+    def build(self, inputs, scope_name=''):
+        """Build the classifier graph.
+
+        Args:
+            inputs: dict with inputs['backbone']['sentence_embedding'] and,
+                when training, inputs['reader']['label_ids'].
+            scope_name: prefix of the fc parameter names.
+
+        Return:
+            {"loss": ...} when training, {"logits": ...} otherwise.
+        """
+        cls_feats = inputs['backbone']['sentence_embedding']
+        if self._is_training:
+            label_ids = inputs['reader']['label_ids']
+            # The dropped-out features must be what the fc layer consumes;
+            # the original computed them and then fed the raw embedding.
+            cls_feats = fluid.layers.dropout(
+                x=cls_feats,
+                dropout_prob=self._dropout_prob,
+                dropout_implementation="upscale_in_train")
+
+        logits = fluid.layers.fc(
+            input=cls_feats,
+            size=self.num_classes,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+"cls_out_w",
+                initializer=self._param_initializer),
+            bias_attr=fluid.ParamAttr(
+                name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)))
+
+        if self._is_training:
+            probs = fluid.layers.softmax(logits)
+            loss = fluid.layers.cross_entropy(
+                input=probs, label=label_ids)
+            loss = layers.mean(loss)
+            return {"loss": loss}
+        else:
+            return {"logits": logits}
+
+    def batch_postprocess(self, rt_outputs):
+        """Outside training, record and return the argmax of this batch's logits."""
+        if not self._is_training:
+            logits = rt_outputs['logits']
+            preds = np.argmax(logits, -1)
+            self._preds.extend(preds.tolist())
+            return preds
+
+    def epoch_postprocess(self, post_inputs):
+        """Outside training, write the collected predictions, one per line.
+
+        post_inputs is not read: nothing is declared in epoch_inputs_attrs.
+
+        Raises:
+            ValueError: if no pred_output_path was given.
+        """
+        if not self._is_training:
+            if self._pred_output_path is None:
+                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
+            output_file = os.path.join(self._pred_output_path, 'predictions.json')
+            with open(output_file, 'w') as writer:
+                for p in self._preds:
+                    writer.write(str(p)+'\n')
+            print('Predictions saved at '+output_file)
+
+                
--- a/paddlepalm/task_paradigm/match.py
+++ b/paddlepalm/task_paradigm/match.py
--- a/paddlepalm/task_paradigm/mlm.py
+++ b/paddlepalm/task_paradigm/mlm.py
--- a/paddlepalm/task_paradigm/mrc.py
+++ b/paddlepalm/task_paradigm/mrc.py
--- a/paddlepalm/lr_sched/__init__.py
+++ b/paddlepalm/lr_sched/__init__.py
+
+from slanted_triangular_schedualer import TriangularSchedualer
+from warmup_schedualer import WarmupSchedualer
--- a/paddlepalm/lr_sched/noam_decay_schedualer.py
+++ b/paddlepalm/lr_sched/noam_decay_schedualer.py
+
+# scheduled_lr = fluid.layers.learning_rate_scheduler\
+#  .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
+#              warmup_steps)
--- a/paddlepalm/lr_sched/schedualer.py
+++ b/paddlepalm/lr_sched/schedualer.py
+
+class BaseSchedualer(object):
+    """Base class of learning-rate schedulers.
+
+    Derives from object so that it is a new-style class under Python 2 as
+    well, like the other base classes of the package.
+    """
+
+    def __init__(self):
+        # Program the schedule is built into; injected later via _set_prog.
+        self._prog = None
+
+    def _set_prog(self, prog):
+        """Remember the program that build() should work on."""
+        self._prog = prog
+
+    def build(self, learning_rate):
+        """Create the scheduled learning rate from learning_rate.
+
+        Must be overridden by subclasses.
+        """
+        raise NotImplementedError()
+
--- a/paddlepalm/lr_sched/slanted_triangular_schedualer.py
+++ b/paddlepalm/lr_sched/slanted_triangular_schedualer.py
+
+from paddlepalm.lr_sched.schedualer import BaseSchedualer
+from paddle import fluid
+
+class TriangularSchedualer(BaseSchedualer):
+    """Linear warmup from 0 to learning_rate over warmup_steps, followed by a
+    linear (power 1.0 polynomial) decay to 0 with decay_steps=num_train_steps."""
+
+    def __init__(self, warmup_steps, num_train_steps):
+        """Store the two step counts; requires num_train_steps > warmup_steps > 0."""
+        BaseSchedualer.__init__(self)
+        assert num_train_steps > warmup_steps > 0
+        self.num_train_steps = num_train_steps
+        self.warmup_steps = warmup_steps
+
+    def build(self, learning_rate):
+        """Add the schedule to the program set via _set_prog.
+
+        Return:
+            The persistable global variable "scheduled_learning_rate".
+        """
+        tensor = fluid.layers.tensor
+        lr_scheduler = fluid.layers.learning_rate_scheduler
+
+        with self._prog._lr_schedule_guard():
+            scheduled_lr = tensor.create_global_var(
+                name="scheduled_learning_rate",
+                shape=[1],
+                value=0.0,
+                dtype='float32',
+                persistable=True)
+
+            step = lr_scheduler._decay_step_counter()
+
+            with fluid.layers.control_flow.Switch() as lr_switch:
+                # Warmup branch: scale the base rate by step / warmup_steps.
+                with lr_switch.case(step < self.warmup_steps):
+                    tensor.assign(
+                        learning_rate * (step / self.warmup_steps),
+                        scheduled_lr)
+                # Afterwards: linear decay towards 0.
+                with lr_switch.default():
+                    tensor.assign(
+                        lr_scheduler.polynomial_decay(
+                            learning_rate=learning_rate,
+                            decay_steps=self.num_train_steps,
+                            end_learning_rate=0.0,
+                            power=1.0,
+                            cycle=False),
+                        scheduled_lr)
+
+            return scheduled_lr
+
+
--- a/paddlepalm/lr_sched/warmup_schedualer.py
+++ b/paddlepalm/lr_sched/warmup_schedualer.py
+
+from paddle import fluid
+
+from paddlepalm.lr_sched.schedualer import BaseSchedualer
+
+class WarmupSchedualer(BaseSchedualer):
+    """Linear warmup of the learning rate from 0 to learning_rate over
+    warmup_steps; afterwards the rate is held at learning_rate."""
+
+    def __init__(self, warmup_steps):
+        """
+        Args:
+            warmup_steps: number of warmup steps; must be positive because
+                the warmup branch divides by it.
+        """
+        BaseSchedualer.__init__(self)
+        # Same style of validation as TriangularSchedualer.
+        assert warmup_steps > 0
+        self.warmup_steps = warmup_steps
+
+    def build(self, learning_rate):
+        """Add the schedule to the program set via _set_prog.
+
+        Return:
+            The persistable global variable "scheduled_learning_rate".
+        """
+        with self._prog._lr_schedule_guard():
+            lr = fluid.layers.tensor.create_global_var(
+                shape=[1],
+                value=0.0,
+                dtype='float32',
+                persistable=True,
+                name="scheduled_learning_rate")
+
+            global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
+
+            with fluid.layers.control_flow.Switch() as switch:
+                with switch.case(global_step < self.warmup_steps):
+                    warmup_lr = learning_rate * (global_step / self.warmup_steps)
+                    fluid.layers.tensor.assign(warmup_lr, lr)
+                with switch.default():
+                    # NOTE(review): confirm that assign() accepts a plain
+                    # Python float here for the targeted Paddle version.
+                    fluid.layers.tensor.assign(learning_rate, lr)
+
+            return lr
+
--- a/paddlepalm/mtl_controller.py
+++ b/paddlepalm/mtl_controller.py
@@ -31,9 +31,11 @@ from paddlepalm.utils.saver import init_pretraining_params, init_checkpoint
 from paddlepalm.utils.config_helper import PDConfig
 from paddlepalm.utils.print_helper import print_dict
 from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn, create_joint_iterator_fn, merge_input_attrs 
+from paddlepalm.distribute import data_feeder
+
+from default_settings import *
+from task_instance import TaskInstance, check_instances

-from paddlepalm.default_settings import *
-from paddlepalm.task_instance import TaskInstance, check_instances

 DEBUG=False
 VERBOSE=0
@@ -182,6 +184,20 @@ def _fit_attr(conf, fit_attr, strict=False):
    return conf


+def create_feed_batch_process_fn(net_inputs):
+    """Return a closure that re-keys a batch dict from input names to feed-variable names."""
+
+    def feed_batch_process_fn(data):
+        # A net_inputs value is either a variable name (str/unicode) or an object exposing `.name`.
+        feed = {}
+        for input_name, var in net_inputs.items():
+            is_name = isinstance(var, str) or isinstance(var, unicode)
+            feed[var if is_name else var.name] = data[input_name]
+        return feed
+
+    return feed_batch_process_fn
+
+
 class Controller(object):

    def __init__(self, config, task_dir='.', for_train=True):
@@ -234,7 +250,7 @@ class Controller(object):
            bb_conf = _merge_conf(mtl_conf, bb_conf)
        else:
            bb_conf = mtl_conf
-        print_dict(bb_conf, title='backbone configuration'.format(instname))
+        print_dict(bb_conf, title = 'backbone configuration'.format(instname))

        bb_name = mtl_conf['backbone']
        bb_mod = importlib.import_module(BACKBONE_DIR + '.' + bb_name)
@@ -338,6 +354,7 @@ class Controller(object):
        main_conf = main_inst.config
        if not os.path.exists(main_conf['save_path']):
            os.makedirs(main_conf['save_path'])
+            os.makedirs(os.path.join(main_conf['save_path'], 'ckpt'))
        
        # prepare backbone
        train_backbone = Backbone(bb_conf, phase='train')
@@ -398,11 +415,14 @@ class Controller(object):
            prefixes.append(inst.name)
            mrs.append(inst.mix_ratio)

-        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE)
+        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE, return_type='dict')
+        self._joint_iterator_fn = joint_iterator_fn

        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
-        net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
+        # net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
+        net_inputs = create_net_inputs(input_attrs, async=False)
+        self._net_inputs = net_inputs

        # build backbone and task layers
        train_prog = fluid.default_main_program()
@@ -453,7 +473,7 @@ class Controller(object):

        # compute loss
        task_id_var = net_inputs['__task_id']
-        task_id_vec = layers.one_hot(task_id_var, num_instances)
+        task_id_vec = fluid.one_hot(task_id_var, num_instances)
        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
        loss = layers.reduce_sum(task_id_vec * losses)

@@ -517,20 +537,21 @@ class Controller(object):
            insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)

        pred_prog = inst.load(infer_model_path)
+        pred_prog = fluid.CompiledProgram(pred_prog).with_data_parallel()
        if inst.reader['pred'] is None:
            pred_reader = inst.Reader(inst.config, phase='pred')
            inst.reader['pred'] = pred_reader
        return pred_prog

-    def load_pretrain(self, pretrain_model_path=None):
+    def load_pretrain(self, pretrain_path=None):
        # load pretrain model (or ckpt)
-        if pretrain_model_path is None:
-            assert 'pretrain_model_path' in self.main_conf, "pretrain_model_path NOT set."
-            pretrain_model_path = self.main_conf['pretrain_model_path']
+        if pretrain_path is None:
+            assert 'pretrain_path' in self.main_conf, "pretrain_path NOT set."
+            pretrain_path = self.main_conf['pretrain_path']

        init_pretraining_params(
            self.exe,
-            pretrain_model_path,
+            pretrain_path,
            main_program=fluid.default_startup_program())


@@ -575,8 +596,18 @@ class Controller(object):
        epoch = 0
        time_begin = time.time()
        backbone_buffer = []
+        
+        feed_batch_process_fn = create_feed_batch_process_fn(self._net_inputs)
+        distribute_feeder = data_feeder(self._joint_iterator_fn, feed_batch_process_fn)
+
+        # palm.distribute.reader(self._joint_iterator_fn, self._net_inputs, prefetch_steps=2)
+        
        while not train_finish():
-            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
+            feed, mask = next(distribute_feeder)
+            rt_outputs = self.exe.run(train_program, feed=feed, fetch_list=fetch_list)
+            while mask.pop() == False:
+                rt_outputs.pop()
+
            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
@@ -591,8 +622,9 @@ class Controller(object):
            global_step += 1
            cur_task.cur_train_step += 1

-            if cur_task.save_infermodel_every_n_steps > 0 and cur_task.cur_train_step % cur_task.save_infermodel_every_n_steps == 0:
-                cur_task.save(suffix='.step'+str(cur_task.cur_train_step))
+            cur_task_global_step = cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch
+            if cur_task.is_target and cur_task.save_infermodel_every_n_steps > 0 and cur_task_global_step % cur_task.save_infermodel_every_n_steps == 0:
+                cur_task.save(suffix='.step'+str(cur_task_global_step))

            if global_step % main_conf.get('print_every_n_steps', 5) == 0:
                loss = rt_outputs[cur_task.name+'/loss']
@@ -610,10 +642,16 @@ class Controller(object):
                print(cur_task.name+': train finished!')
                cur_task.save()

-            if 'save_every_n_steps' in main_conf and global_step % main_conf['save_every_n_steps'] == 0:
-                save_path = os.path.join(main_conf['save_path'],
+            if 'save_ckpt_every_n_steps' in main_conf and global_step % main_conf['save_ckpt_every_n_steps'] == 0:
+                save_path = os.path.join(main_conf['save_path'], 'ckpt', 
                                         "step_" + str(global_step))
                fluid.io.save_persistables(self.exe, save_path, saver_program)
+                print('checkpoint has been saved at '+save_path)
+
+        save_path = os.path.join(main_conf['save_path'], 'ckpt',
+                                 "step_" + str(global_step))
+        fluid.io.save_persistables(self.exe, save_path, saver_program)
+        print('checkpoint has been saved at '+save_path)

        print("ALL tasks train finished, exiting...")
            
@@ -647,19 +685,38 @@ class Controller(object):
        fetch_names, fetch_vars = inst.pred_fetch_list

        print('predicting...')
-        mapper = {k:v for k,v in inst.pred_input}
-        buf = []
-        for feed in inst.reader['pred'].iterator():
-            feed = _encode_inputs(feed, inst.name, cand_set=mapper)
-            feed = {mapper[k]: v for k,v in feed.items()}
+        feed_batch_process_fn = create_feed_batch_process_fn(inst.pred_input)
+        distribute_feeder = data_feeder(inst.reader['pred'].iterator, feed_batch_process_fn, prefetch_steps=1)

+        buf = []
+        for feed, mask in distribute_feeder:
+            print('before run')
            rt_outputs = self.exe.run(pred_prog, feed, fetch_vars)
+            print('after run')
+            splited_rt_outputs = []
+            for item in rt_outputs:
+                splited_rt_outputs.append(np.split(item, len(mask)))
+
+            # assert len(rt_outputs) == len(mask), [len(rt_outputs), len(mask)]
+            print(mask)
+            
+            while mask.pop() == False:
+                print(mask)
+                for item in splited_rt_outputs:
+                    item.pop()
+            rt_outputs = []
+            print('cancat')
+            for item in splited_rt_outputs:
+                rt_outputs.append(np.concatenate(item))
+                
            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
            inst.postprocess(rt_outputs, phase='pred')
+        print('leave feeder')
        if inst.task_layer['pred'].epoch_inputs_attrs:
            reader_outputs = inst.reader['pred'].get_epoch_outputs()
        else:
            reader_outputs = None
+        print('epoch postprocess')
        inst.epoch_postprocess({'reader':reader_outputs}, phase='pred')


@@ -673,6 +730,7 @@ if __name__ == '__main__':



+__all__ = ["Controller"]

            

--- a/paddlepalm/optimizer/__init__.py
+++ b/paddlepalm/optimizer/__init__.py
+
+from adam import Adam
--- a/paddlepalm/optimizer/adam.py
+++ b/paddlepalm/optimizer/adam.py
@@ -20,84 +20,36 @@ from __future__ import print_function

 import numpy as np
 import paddle.fluid as fluid
+from paddlepalm.optimizer.base_optimizer import BaseOptimizer

-def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
-    """ Applies linear warmup of learning rate from 0 and decay to 0."""
-    with fluid.default_main_program()._lr_schedule_guard():
-        lr = fluid.layers.tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="scheduled_learning_rate")
+class Adam(BaseOptimizer):  # thin wrapper that configures fluid.optimizer.Adam

-        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
+    def __init__(self, loss_var, lr, lr_schedualer=None):  # lr_schedualer: optional object whose build(lr) yields the scheduled rate

-        with fluid.layers.control_flow.Switch() as switch:
-            with switch.case(global_step < warmup_steps):
-                warmup_lr = learning_rate * (global_step / warmup_steps)
-                fluid.layers.tensor.assign(warmup_lr, lr)
-            with switch.default():
-                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
-                    learning_rate=learning_rate,
-                    decay_steps=num_train_steps,
-                    end_learning_rate=0.0,
-                    power=1.0,
-                    cycle=False)
-                fluid.layers.tensor.assign(decayed_lr, lr)
+        BaseOptimizer.__init__(self, loss_var, lr, lr_schedualer=lr_schedualer)  # forward the scheduler instead of a hard-coded None

-        return lr
+        self._loss = loss_var
+        self._lr = lr
+        self._lr_schedualer = lr_schedualer
+    
+    def build(self, grad_clip=None):  # returns the param_grads produced by optimizer.minimize

+        if self._lr_schedualer is not None:
+            self._lr = self._lr_schedualer.build(self._lr)  # from here on self._lr is whatever the scheduler built

-def optimize(loss, config, max_train_steps=None, warmup_steps=0, train_program=None):
-    if warmup_steps > 0:
-        decay_strategy = config.get('lr_scheduler', 'linear_warmup_decay')
-        if decay_strategy == 'noam_decay':
-            scheduled_lr = fluid.layers.learning_rate_scheduler\
-             .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
-                         warmup_steps)
-        elif decay_strategy == 'linear_warmup_decay':
-            scheduled_lr = linear_warmup_decay(config['learning_rate'], warmup_steps,
-                                               max_train_steps)
-        else:
-            raise ValueError("Unkown lr_scheduler, should be "
-                             "'noam_decay' or 'linear_warmup_decay'")
-        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    else:
-        optimizer = fluid.optimizer.Adam(learning_rate=config['learning_rate'])
-        scheduled_lr = config['learning_rate']
+        optimizer = fluid.optimizer.Adam(learning_rate=self._lr)

-    clip_norm_thres = 1.0
-    # When using mixed precision training, scale the gradient clip threshold
-    # by loss_scaling
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
+        if grad_clip is not None:  # global-norm clipping is only installed on request
+            clip_norm_thres = grad_clip
+            # When using mixed precision training, scale the gradient clip threshold
+            # by loss_scaling
+            fluid.clip.set_gradient_clip(
+                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

-    def exclude_from_weight_decay(name):
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
+        _, param_grads = optimizer.minimize(self._loss)
+        return param_grads

-    param_list = dict()
+    def get_cur_learning_rate(self):  # the lr passed in, or the scheduler's result once build() has run
+        return self._lr

-    for param in train_program.global_block().all_parameters():
-        param_list[param.name] = param * 1.0
-        param_list[param.name].stop_gradient = True
-
-    _, param_grads = optimizer.minimize(loss)
-
-
-    if config.get('weight_decay', 0) > 0:
-        for param, grad in param_grads:
-            if exclude_from_weight_decay(param.name):
-                continue
-            with param.block.program._optimized_guard(
-                [param, grad]), fluid.framework.name_scope("weight_decay"):
-                updated_param = param - param_list[
-                    param.name] * config['weight_decay'] * scheduled_lr
-                fluid.layers.assign(output=param, input=updated_param)

--- a/paddlepalm/optimizer/base_optimizer.py
+++ b/paddlepalm/optimizer/base_optimizer.py
+
+class BaseOptimizer(object):
+    """Base class of optimizers.
+
+    Derives from object so that it is a new-style class under Python 2 as
+    well, like the other base classes of the package.
+    """
+
+    def __init__(self, loss_var, lr, lr_schedualer=None):
+        """
+        Args:
+            loss_var: the loss handed to the optimizer.
+            lr: the learning rate.
+            lr_schedualer: optional scheduler; it receives the program
+                through _set_prog.
+        """
+        self._prog = None
+        # Keep the constructor arguments instead of silently dropping them.
+        self._loss = loss_var
+        self._lr = lr
+        self._lr_schedualer = lr_schedualer
+
+    def build(self, grad_clip=None):
+        """Hook for subclasses; the base implementation does nothing."""
+        pass
+
+    def _set_prog(self, prog):
+        """Remember prog and pass it on to the scheduler, if there is one."""
+        self._prog = prog
+        if self._lr_schedualer is not None:
+            self._lr_schedualer._set_prog(prog)
+
+    def get_cur_learning_rate(self):
+        """Hook for subclasses; the base implementation returns None."""
+        pass
+
+
--- a/paddlepalm/reader/__init__.py
+++ b/paddlepalm/reader/__init__.py
+
+from cls import ClassifyReader
+
--- a/paddlepalm/reader/base_reader.py
+++ b/paddlepalm/reader/base_reader.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""v1.1"""
+from copy import copy
+class BaseReader(object):
+    """Interface of the data manager.
+
+    Keeps a register (a set) of the output attribute names that have been
+    required, and the backbone the reader was registered with.
+    """
+
+    def __init__(self, phase='train'):
+        """
+        Args:
+            phase: str. Stored as-is; not validated here.
+        """
+        self._phase = phase
+        self._register = set()
+        self._registered_backbone = None
+
+    @classmethod
+    def create_register(cls):
+        """Return a new, empty register."""
+        return set()
+
+    def clone(self, phase='train'):
+        """Return a shallow copy of this reader whose phase is ``phase``.
+
+        The copy is shallow, so the register set and the wrapped objects are
+        shared with the original.
+        """
+        ret = copy(self)
+        ret._phase = phase
+        return ret
+
+    def require_attr(self, attr_name):
+        """Add attr_name to the register."""
+        self._register.add(attr_name)
+
+    def register_with(self, backbone):
+        """Require every name in backbone.inputs_attr and remember the backbone."""
+        for attr in backbone.inputs_attr:
+            self.require_attr(attr)
+        self._registered_backbone = backbone
+
+    def get_registered_backbone(self):
+        """Return the backbone passed to register_with, or None."""
+        return self._registered_backbone
+
+    def _get_registed_attrs(self, attrs):
+        """Return the entries of attrs whose names are in the register.
+
+        Raises:
+            NotImplementedError: if a registered name is missing from attrs.
+        """
+        ret = {}
+        for i in self._register:
+            if i not in attrs:
+                raise NotImplementedError('output attr {} is not found in this reader.'.format(i))
+            ret[i] = attrs[i]
+        return ret
+
+    @property
+    def outputs_attr(self):
+        """Describe the objects the reader yields: name, shape and dtype.
+
+        Use an empty list [] as the shape of a scalar object (str, int,
+        float, ...) and -1 for any dimension whose length is variable.
+        Note: with mini-batch gradient descent, regular input objects should
+        carry a batch_size dimension (usually -1).
+
+        Return:
+            dict. Attribute description of every object. For text
+            classification and matching tasks the yielded content might
+            contain (backbone and task pick what they need):
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "input_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32'),
+                 "label": ([-1], 'int')}
+        """
+        raise NotImplementedError()
+
+    def iterator(self):
+        """Dataset traversal interface.
+
+        When the traversal reaches the end of the dataset, implementations
+        should reset the pointer themselves and start over from the head.
+
+        Yield:
+            (dict) elements that meet the requirements in output_templete
+        """
+        raise NotImplementedError()
+
+    @property
+    def num_examples(self):
+        """Number of samples the iterator generates per epoch.
+
+        When a strategy such as a sliding window changes the number of
+        samples, the actual runtime count should be returned.
+        """
+        raise NotImplementedError()
+
+
--- a/paddlepalm/reader/cls.py
+++ b/paddlepalm/reader/cls.py
@@ -13,87 +13,76 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import ClassifyReader
+from paddlepalm.reader.base_reader import BaseReader
+from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader

-class Reader(reader):
+
+class ClassifyReader(BaseReader):
    
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
+             lang='en', seed=None, do_lower_case=False, phase='train'):
+        """Wrap CLSReader (reader4ernie.ClassifyReader) behind the BaseReader interface.
+
+        Argument:
+          - vocab_path, max_len, do_lower_case, seed: passed to CLSReader (max_len as max_seq_len, seed as random_seed).
+          - lang: en/english or cn/chinese (case-insensitive); phase: train or pred; tokenizer: not read here.
+
         """
-        Args:
-            phase: train, eval, pred
-            """

-        self._is_training = phase == 'train'
+        BaseReader.__init__(self, phase)

-        reader = ClassifyReader(config['vocab_path'],
-            max_seq_len=config['max_seq_len'],
-            do_lower_case=config.get('do_lower_case', False),
-            for_cn=config.get('for_cn', False),
-            random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
+        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
+        assert phase in ['train', 'pred'], "supported phase: train, pred."

-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        self._num_classes = config['n_classes']
+        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

+        self._register.add('token_ids')  # token_ids is always registered; label_ids only in the train phase
         if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None # 防止iteartor终止
-            self._shuffle = config.get('shuffle', True)
-            # self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
+            self._register.add('label_ids')
+
+        self._is_training = phase == 'train'
+
+        cls_reader = CLSReader(vocab_path,
+                                max_seq_len=max_len,
+                                do_lower_case=do_lower_case,
+                                for_cn=for_cn,
+                                random_seed=seed)
+        self._reader = cls_reader

         self._phase = phase
         # self._batch_size = 
-        self._print_first_n = config.get('print_first_n', 0)
+        # self._print_first_n = config.get('print_first_n', 0)


    @property
    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32']
-                    }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self): 
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
-                'label_ids', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            del outputs['unique_ids']
-            if not self._is_training:
-                del outputs['label_ids']
-            return outputs
-
+        attrs = {"token_ids": [[-1, -1], 'int64'],  # every output this reader can describe, as name -> [shape, dtype]
+                "position_ids": [[-1, -1], 'int64'],
+                "segment_ids": [[-1, -1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32'],
+                "label_ids": [[-1], 'int64'],
+                "task_ids": [[-1, -1], 'int64']
+                }
+        return self._get_registed_attrs(attrs)  # only the names in self._register are returned
+
+
+    def _load_data(self, input_file, batch_size, num_epochs=None, \
+                  file_format='csv', shuffle_train=True):
+        """Create the batch generator for input_file; shuffling is only enabled in the train phase. file_format is not read."""
+        do_shuffle = shuffle_train if self._phase == 'train' else False
+        self._data_generator = self._reader.data_generator(input_file, batch_size, num_epochs, shuffle=do_shuffle, phase=self._phase)
+
+    def _iterator(self): 
+        """Yield one dict per generated batch, keeping only the fields named in outputs_attr."""
+        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
+            'label_ids', 'unique_ids']  # paired positionally with the items of each batch
         for batch in self._data_generator():
-            yield list_to_dict(batch)
+            outputs = {n: i for n,i in zip(names, batch)}
+            ret = {}
+            # TODO: move runtime shape check here
+            for attr in self.outputs_attr.keys():
+                ret[attr] = outputs[attr]
+            yield ret

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
@@ -103,3 +92,4 @@ class Reader(reader):
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

+
--- a/paddlepalm/reader/match.py
+++ b/paddlepalm/reader/match.py
@@ -60,18 +60,18 @@ class Reader(reader):
    @property
    def outputs_attr(self):
        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
+            return {"token_ids": [[-1, -1], 'int64'],
+                    "position_ids": [[-1, -1], 'int64'],
+                    "segment_ids": [[-1, -1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
+                    "label_ids": [[-1], 'int64'],
+                    "task_ids": [[-1, -1], 'int64']
                    }
        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
+            return {"token_ids": [[-1, -1], 'int64'],
+                    "position_ids": [[-1, -1], 'int64'],
+                    "segment_ids": [[-1, -1], 'int64'],
+                    "task_ids": [[-1, -1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32']
                    }


--- a/paddlepalm/reader/mlm.py
+++ b/paddlepalm/reader/mlm.py
@@ -60,13 +60,13 @@ class Reader(reader):

    @property
    def outputs_attr(self):
-        return {"token_ids": [[-1, -1, 1], 'int64'],
-                "position_ids": [[-1, -1, 1], 'int64'],
-                "segment_ids": [[-1, -1, 1], 'int64'],
+        return {"token_ids": [[-1, -1], 'int64'],
+                "position_ids": [[-1, -1], 'int64'],
+                "segment_ids": [[-1, -1], 'int64'],
                "input_mask": [[-1, -1, 1], 'float32'],
-                "task_ids": [[-1, -1, 1], 'int64'],
-                "mask_label": [[-1, 1], 'int64'],
-                "mask_pos": [[-1, 1], 'int64'],
+                "task_ids": [[-1, -1], 'int64'],
+                "mask_label": [[-1], 'int64'],
+                "mask_pos": [[-1], 'int64'],
                }



--- a/paddlepalm/reader/mrc.py
+++ b/paddlepalm/reader/mrc.py
@@ -32,6 +32,7 @@ class Reader(reader):
            tokenizer='FullTokenizer',
            for_cn=config.get('for_cn', False),
            doc_stride=config['doc_stride'],
+            remove_noanswer=config.get('remove_noanswer', True),
            max_query_length=config['max_query_len'],
            random_seed=config.get('seed', None))
        self._reader = reader
@@ -67,21 +68,21 @@ class Reader(reader):
    @property
    def outputs_attr(self):
        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
+            return {"token_ids": [[-1, -1], 'int64'],
+                    "position_ids": [[-1, -1], 'int64'],
+                    "segment_ids": [[-1, -1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "start_positions": [[-1, 1], 'int64'],
-                    "end_positions": [[-1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
+                    "start_positions": [[-1], 'int64'],
+                    "end_positions": [[-1], 'int64'],
+                    "task_ids": [[-1, -1], 'int64']
                    }
        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
+            return {"token_ids": [[-1, -1], 'int64'],
+                    "position_ids": [[-1, -1], 'int64'],
+                    "segment_ids": [[-1, -1], 'int64'],
+                    "task_ids": [[-1, -1], 'int64'],
                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "unique_ids": [[-1, 1], 'int64']
+                    "unique_ids": [[-1], 'int64']
                    }

    @property

--- a/paddlepalm/reader/utils/batching4bert.py
+++ b/paddlepalm/reader/utils/batching4bert.py
@@ -67,8 +67,8 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    mask_label = np.array(mask_label).astype("int64").reshape([-1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
    return batch_tokens, mask_label, mask_pos


@@ -96,7 +96,7 @@ def prepare_batch_data(insts,
    # or unique id
    for i in range(3, len(insts[0]), 1):
        labels = [inst[i] for inst in insts]
-        labels = np.array(labels).astype("int64").reshape([-1, 1])
+        labels = np.array(labels).astype("int64").reshape([-1])
        labels_list.append(labels)
    # First step: do mask without padding
    if mask_id >= 0:
@@ -154,14 +154,14 @@ def pad_batch_data(insts,
    inst_data = np.array([
        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
    ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array([[1] * len(inst) + [0] *

--- a/paddlepalm/reader/utils/batching4ernie.py
+++ b/paddlepalm/reader/utils/batching4ernie.py
@@ -113,8 +113,8 @@ def mask(batch_tokens,

        pre_sent_len = len(sent)

-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    mask_label = np.array(mask_label).astype("int64").reshape([-1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
    return batch_tokens, mask_label, mask_pos


@@ -136,7 +136,7 @@ def pad_batch_data(insts,

    inst_data = np.array(
        [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]

    # position data
    if return_pos:
@@ -145,7 +145,7 @@ def pad_batch_data(insts,
            for inst in insts
        ])

-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
@@ -165,7 +165,7 @@ def pad_batch_data(insts,

    if return_seq_lens:
        seq_lens = np.array([len(inst) for inst in insts])
-        return_list += [seq_lens.astype("int64").reshape([-1, 1])]
+        return_list += [seq_lens.astype("int64").reshape([-1])]

    return return_list if len(return_list) > 1 else return_list[0]


--- a/paddlepalm/reader/utils/mlm_batching.py
+++ b/paddlepalm/reader/utils/mlm_batching.py
@@ -67,8 +67,8 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
                sent[token_index] = MASK
                mask_flag = True
                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    mask_label = np.array(mask_label).astype("int64").reshape([-1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
    return batch_tokens, mask_label, mask_pos


@@ -147,14 +147,14 @@ def pad_batch_data(insts,
    inst_data = np.array([
        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
    ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
    # position data
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array([[1] * len(inst) + [0] *

--- a/paddlepalm/reader/utils/reader4ernie.py
+++ b/paddlepalm/reader/utils/reader4ernie.py
@@ -54,14 +54,14 @@ class BaseReader(object):
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
-                 do_lower_case=True,
+                 do_lower_case=False,
                 in_tokens=False,
                 is_inference=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
-                 for_cn=True,
+                 for_cn=False,
                 task_id=0):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
@@ -301,6 +301,70 @@ class BaseReader(object):
        return f


+class ClassifyReader(BaseReader):
+    """Reader that parses header-led files into examples and pads record batches."""
+
+    def _read_tsv(self, input_file, quotechar=None):
+        """Read `input_file` through `csv_reader`; the first row is the header.
+
+        Returns a list of `Example` named tuples whose fields are the header
+        names. `quotechar` is accepted but not used.
+        """
+        with open(input_file, 'r', encoding='utf8') as fin:
+            rows = csv_reader(fin)
+            header = next(rows)
+            # every column except "label" is treated as a text column
+            text_cols = [
+                pos for pos, col_name in enumerate(header) if col_name != "label"
+            ]
+            Example = namedtuple('Example', header)
+
+            parsed = []
+            for row in rows:
+                for pos, cell in enumerate(row):
+                    if pos not in text_cols:
+                        continue
+                    # with `for_cn` set, spaces are stripped from text columns
+                    row[pos] = cell.replace(' ', '') if self.for_cn else cell
+                parsed.append(Example(*row))
+            return parsed
+
+    def _pad_batch_records(self, batch_records):
+        """Turn a list of records into padded batch arrays.
+
+        Returns [token_ids, text_type_ids, position_ids, task_ids, input_mask],
+        followed by [labels, qids] unless `self.is_inference` is set.
+        """
+        token_id_seqs = [rec.token_ids for rec in batch_records]
+        type_id_seqs = [rec.text_type_ids for rec in batch_records]
+        pos_id_seqs = [rec.position_ids for rec in batch_records]
+
+        if not self.is_inference:
+            labels = [rec.label_id for rec in batch_records]
+            if self.is_classify:
+                labels = np.array(labels).astype("int64").reshape([-1])
+            elif self.is_regression:
+                labels = np.array(labels).astype("float32").reshape([-1])
+
+            # qids are only collected when the first record carries one
+            if batch_records[0].qid:
+                qid_values = [rec.qid for rec in batch_records]
+            else:
+                qid_values = []
+            qids = np.array(qid_values).astype("int64").reshape([-1])
+
+        # padding
+        token_ids, input_mask = pad_batch_data(
+            token_id_seqs, pad_idx=self.pad_id, return_input_mask=True)
+        text_type_ids = pad_batch_data(type_id_seqs, pad_idx=self.pad_id)
+        position_ids = pad_batch_data(pos_id_seqs, pad_idx=self.pad_id)
+        task_ids = np.ones_like(token_ids, dtype="int64") * self.task_id
+
+        batch = [token_ids, text_type_ids, position_ids, task_ids, input_mask]
+        if not self.is_inference:
+            batch += [labels, qids]
+
+        return batch
+
+
 class MaskLMReader(BaseReader):

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
@@ -447,70 +511,6 @@ class MaskLMReader(BaseReader):
        return wrapper


-class ClassifyReader(BaseReader):
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, 'r', encoding='utf8') as f:
-            reader = csv_reader(f)
-            headers = next(reader)
-            text_indices = [
-                index for index, h in enumerate(headers) if h != "label"
-            ]
-            Example = namedtuple('Example', headers)
-
-            examples = []
-            for line in reader:
-                for index, text in enumerate(line):
-                    if index in text_indices:
-                        if self.for_cn:
-                            line[index] = text.replace(' ', '')
-                        else:
-                            line[index] = text
-                example = Example(*line)
-                examples.append(example)
-            return examples
-
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-
-        if not self.is_inference:
-            batch_labels = [record.label_id for record in batch_records]
-            if self.is_classify:
-                batch_labels = np.array(batch_labels).astype("int64").reshape(
-                    [-1, 1])
-            elif self.is_regression:
-                batch_labels = np.array(batch_labels).astype("float32").reshape(
-                    [-1, 1])
-
-            if batch_records[0].qid:
-                batch_qids = [record.qid for record in batch_records]
-                batch_qids = np.array(batch_qids).astype("int64").reshape(
-                    [-1, 1])
-            else:
-                batch_qids = np.array([]).astype("int64").reshape([-1, 1])
-
-        # padding
-        padded_token_ids, input_mask = pad_batch_data(
-            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask
-        ]
-        if not self.is_inference:
-            return_list += [batch_labels, batch_qids]
-
-        return return_list
-
-
 class SequenceLabelReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
@@ -908,19 +908,19 @@ class MRCReader(BaseReader):
                record.end_position for record in batch_records
            ]
            batch_start_position = np.array(batch_start_position).astype(
-                "int64").reshape([-1, 1])
+                "int64").reshape([-1])
            batch_end_position = np.array(batch_end_position).astype(
-                "int64").reshape([-1, 1])
+                "int64").reshape([-1])

        else:
            batch_size = len(batch_token_ids)
            batch_start_position = np.zeros(
-                shape=[batch_size, 1], dtype="int64")
-            batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64")
+                shape=[batch_size], dtype="int64")
+            batch_end_position = np.zeros(shape=[batch_size], dtype="int64")

        batch_unique_ids = [record.unique_id for record in batch_records]
        batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape(
-            [-1, 1])
+            [-1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(

--- a/paddlepalm/trainer.py
+++ b/paddlepalm/trainer.py
+# -*- coding: utf-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import json
+from paddle import fluid
+import time
+import numpy as np
+import paddlepalm.utils.basic_helper as helper
+from paddlepalm.utils import reader_helper, saver
+from paddlepalm.distribute import gpu_dev_count, data_feeder
+# from paddlepalm.default_settings import *
+
+DEBUG=False
+
+
+class Trainer(object):
+
+    def __init__(self, name, reader, task_head, \
+                 mix_ratio=1.0, reuse_head_with=None, \
+                 silent=False):
+        """Set up the bookkeeping state of a single-task trainer.
+
+        Args:
+            name: task name, stored as `self._name`; also the default reuse
+                scope of the task head.
+            reader: reader object, stored as `self._reader`.
+            task_head: training head, stored as `self._task_head`.
+            mix_ratio: stored as `self._mix_ratio`.
+            reuse_head_with: optional scope name used instead of `name` for
+                `self._task_reuse_scope`.
+            silent: when True, `self._verbose` is False.
+        """
+
+        self._name = name
+        self._verbose = not silent
+        self._reader = reader
+        self._pred_reader = None
+        self._task_head = task_head
+        # The constructor takes no predict head, so start without one. The
+        # previous `self._pred_head = pred_head` read an undefined name and
+        # raised NameError on every construction.
+        self._pred_head = None
+
+        self._task_reuse_scope = name if reuse_head_with is None else reuse_head_with
+
+        self._feeded_var_names = None
+        self._target_vars = None
+
+        self._num_examples = 0
+
+        # training process management
+        self._mix_ratio = mix_ratio
+        self._expected_train_steps = None
+        self._expected_train_epochs = None
+        self._steps_pur_epoch = None
+        self._cur_train_epoch = 0
+        self._cur_train_step = 0
+        self._train_finish = False
+
+        self._inputname_to_varname = {}
+        self._pred_input_name_list = []
+        self._pred_input_varname_list = []
+        self._pred_fetch_name_list = []
+        self._pred_fetch_var_list = []
+
+        # exe is built when random_init_params is called.
+        self._exe = None
+
+        # Values are attribute paths kept as strings, not the lists themselves.
+        self._save_protocol = {
+            'input_names': 'self._pred_input_name_list',
+            'input_varnames': 'self._pred_input_varname_list',
+            'fetch_list': 'self._pred_fetch_name_list'}
+
+        self._lock = False
+        self._build_forward = False
+
+    def build_predict_head(self, pred_backbone, pred_prog=None, pred_init_prog=None):
+        """Build the prediction network: backbone plus predict head.
+
+        `self._pred_head` must be set before this is called, since its
+        `inputs_attrs['reader']` is read first.
+
+        Args:
+            pred_backbone: backbone used for prediction. Its `inputs_attr` is
+                merged with the head's reader inputs and its `build` produces
+                the backbone outputs.
+            pred_prog: program to build into; a new `fluid.Program()` if None.
+            pred_init_prog: startup program; a new `fluid.Program()` if None.
+        """
+        pred_task_attr_from_reader = helper.encode_inputs(self._pred_head.inputs_attrs['reader'], self.name)
+
+        # Use the backbone that was passed in: the previous code read an
+        # undefined `backbone` name here and raised NameError.
+        pred_input_names, pred_shape_and_dtypes, _ = reader_helper.merge_input_attrs(pred_backbone.inputs_attr, pred_task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_input_names, pred_shape_and_dtypes)]
+
+        if pred_prog is None:
+            pred_prog = fluid.Program()
+        if pred_init_prog is None:
+            pred_init_prog = fluid.Program()
+        with fluid.program_guard(pred_prog, pred_init_prog):
+            pred_net_inputs = reader_helper.create_net_inputs(pred_input_attrs)
+            pred_bb_output_vars = pred_backbone.build(pred_net_inputs)
+
+        # prepare predict vars for saving inference model
+        with fluid.program_guard(pred_prog, pred_init_prog):
+            cur_inputs = helper.decode_inputs(pred_net_inputs, self.name)
+            self._pred_input_name_list, self._pred_input_varname_list = \
+                zip(*[[k, v.name] for k,v in cur_inputs.items()])
+
+            pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
+            scope = self.name + '.'
+            with fluid.unique_name.guard(scope):
+                self._build_head(pred_task_inputs, phase='pred', scope=scope)
+
+
+
+
+    def build_forward(self, backbone, pred_backbone=None, train_prog=None, train_init_prog=None, pred_prog=None, pred_init_prog=None):
+        """Build the training forward network and return its loss variable.
+
+        Args:
+            backbone: backbone whose `inputs_attr` is merged with the task
+                head's reader inputs and whose `build` creates the backbone
+                outputs.
+            pred_backbone, pred_prog, pred_init_prog: accepted but not used by
+                this method.
+            train_prog: program to build into; a new `fluid.Program()` if None.
+            train_init_prog: startup program; a new `fluid.Program()` if None.
+
+        Returns:
+            The variable produced by `reduce_sum` over the head's `loss` output.
+        """
+        self._build_forward = True
+
+        # reader fields requested by the task head, passed through
+        # helper.encode_inputs together with the task name
+        task_attr_from_reader = helper.encode_inputs(self._task_head.inputs_attrs['reader'], self.name)
+
+        # merge reader input attrs from backbone and task head
+        input_names, shape_and_dtypes, name_to_position = reader_helper.merge_input_attrs(backbone.inputs_attr, task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        self._shape_and_dtypes = shape_and_dtypes
+        self._name_to_position = name_to_position
+
+        if DEBUG:
+            # Print the merged attrs computed above; the previous code printed
+            # undefined `joint_*` names and raised NameError when DEBUG was on.
+            print('----- for debug -----')
+            print('joint input names:')
+            print(input_names)
+            print('joint input shape and dtypes:')
+            print(shape_and_dtypes)
+
+        input_attrs = [[i, j, k] for i, (j,k) in zip(input_names, shape_and_dtypes)]
+
+        if train_prog is None:
+            train_prog = fluid.Program()
+        if train_init_prog is None:
+            train_init_prog = fluid.Program()
+        self._prog = train_prog
+        self._train_prog = train_prog
+        self._train_init_prog = train_init_prog
+        with fluid.program_guard(train_prog, train_init_prog):
+            # NOTE(review): `async` is a reserved keyword from Python 3.7 on, so
+            # this keyword argument only parses on older interpreters. Changing
+            # it requires renaming the parameter in reader_helper as well.
+            net_inputs = reader_helper.create_net_inputs(input_attrs, async=False)
+            self._net_inputs = net_inputs
+
+            # build backbone layers
+            bb_output_vars = backbone.build(net_inputs)
+            assert sorted(bb_output_vars.keys()) == sorted(backbone.outputs_attr.keys())
+
+        task_inputs = {'backbone': bb_output_vars}
+        task_inputs['reader'] = helper.decode_inputs(net_inputs, self.name)
+
+        scope = self.name+'.'
+        with fluid.program_guard(train_prog, train_init_prog):
+            with fluid.unique_name.guard(scope):
+                output_vars = self._build_head(task_inputs, phase='train', scope=scope)
+        # head outputs are exposed under "<task name>.<output name>"
+        task_output_vars = {self.name+'.'+key: val for key, val in output_vars.items()}
+
+        self._fetches = {k: v.name for k,v in task_output_vars.items()}
+        self._fetch_names, self._fetch_list = zip(*self._fetches.items())
+
+        # compute loss
+        with fluid.program_guard(train_prog, train_init_prog):
+            loss_var = fluid.layers.reduce_sum(task_output_vars[self.name+'.loss'])
+
+        self._distribute_train_prog = fluid.CompiledProgram(self._train_prog).with_data_parallel(loss_name=loss_var.name)
+        return loss_var
+
+    def build_backward(self, optimizer, weight_decay=None, use_ema=False, ema_decay=0.9999):
+        """Attach the optimizer (and optional weight decay / EMA) to the train program.
+
+        Args:
+            optimizer: object providing `_set_prog`, `build` (returns the
+                (param, grad) pairs iterated below) and `get_cur_learning_rate`.
+            weight_decay: when not None, coefficient of the extra decay update
+                applied to every parameter not excluded by name.
+            use_ema: when True, an `ExponentialMovingAverage` is created with
+                `ema_decay` and its `update()` is called.
+            ema_decay: decay passed to `ExponentialMovingAverage`.
+        """
+        # build optimizer
+        optimizer._set_prog(self._train_prog)
+        with fluid.program_guard(self._train_prog, self._train_init_prog):
+            param_grads = optimizer.build()
+
+            if weight_decay is not None:
+
+                # name -> copy of the parameter (param * 1.0) with gradients stopped
+                param_list = dict()
+
+                for param in self._prog.global_block().all_parameters():
+                    param_list[param.name] = param * 1.0
+                    param_list[param.name].stop_gradient = True
+
+                # layer_norm parameters and bias-like names are not decayed
+                def exclude_from_weight_decay(name):
+                    if name.find("layer_norm") > -1:
+                        return True
+                    bias_suffix = ["_bias", "_b", ".b_0"]
+                    for suffix in bias_suffix:
+                        if name.endswith(suffix):
+                            return True
+                    return False
+
+                for param, grad in param_grads:
+                    if exclude_from_weight_decay(param.name):
+                        continue
+                    with param.block.program._optimized_guard(
+                        [param, grad]), fluid.framework.name_scope("weight_decay"):
+                        # param <- param - copy * weight_decay * current learning rate
+                        updated_param = param - param_list[
+                            param.name] * weight_decay * optimizer.get_cur_learning_rate()
+                        fluid.layers.assign(output=param, input=updated_param)
+
+
+            # loss.persistable = True
+            if use_ema:
+                # the ema object is local; it is not stored on self
+                ema = fluid.optimizer.ExponentialMovingAverage(ema_decay)
+                ema.update()
+
+    def load_data(self, input_file, file_format, batch_size, num_epochs=None, shuffle_train=True):
+        """Load `input_file` through the reader and return a feed iterator.
+
+        Records the example count and steps per epoch on the trainer, wraps
+        the reader's iterator with the runtime check/adapt helper, and uses
+        the multi-device feeder when more than one GPU is available.
+        """
+        # load data
+        print("preparing data...", end='')
+        self._reader._load_data(
+            input_file=input_file,
+            batch_size=batch_size,
+            num_epochs=num_epochs,
+            file_format=file_format,
+            shuffle_train=shuffle_train)
+        self._num_examples = self._reader.num_examples
+        # Not sure whether this should round up; needs confirmation.
+        # Currently a trailing partial batch is not counted.
+        self._steps_pur_epoch = self._num_examples // batch_size
+        print('ok!')
+
+        # merge dataset iterators and create net input vars
+        batches = self._reader._iterator()
+        task_prefix = self.name
+
+        # run runtime checks and adaptation on the yielded data
+        batch_fn = reader_helper.create_iterator_fn(
+            batches, task_prefix, self._shape_and_dtypes,
+            self._name_to_position, return_type='dict')
+        process_fn = reader_helper.create_feed_batch_process_fn(self._net_inputs)
+        self._feed_batch_process_fn = process_fn
+
+        if gpu_dev_count > 1:
+            return data_feeder(batch_fn, process_fn)()
+        return batch_fn()
+
+    def random_init_params(self):
+        """Create the executor and run `self._train_init_prog` with it."""
+        # the executor targets GPU whenever at least one GPU device is counted
+        self._exe = helper.build_executor(gpu_dev_count > 0)
+        print('random init params...')
+        self._exe.run(self._train_init_prog)
+
+    def load_pretrain(self, model_path):
+        """Initialize parameters from the pretrained model (or ckpt) at `model_path`.
+
+        Requires the executor created by `random_init_params`.
+        """
+        executor = self._exe
+        assert executor is not None, "You need to random_init_params before load pretrain models."
+
+        saver.init_pretraining_params(
+            executor, model_path, main_program=self._train_init_prog)
+
+    def set_predict_head(self):
+        """Placeholder: currently does nothing (no predict head is stored)."""
+        pass
+
+    def train(self, iterator, save_path=None, save_steps=None, save_type='ckpt', print_steps=5):
+        """Run the training loop over `iterator`, periodically logging and saving.
+
+        Args:
+            iterator: iterable of batches; each one is passed to `train_one_step`.
+            save_path: directory that receives saved models; created if missing.
+            save_steps: save every `save_steps` training steps.
+            save_type: comma-separated list of what to save. 'ckpt' saves the
+                persistable variables, 'predict' saves the inference model.
+            print_steps: print loss and speed every `print_steps` steps; a value
+                <= 0 disables printing.
+
+        Raises:
+            AssertionError: if 'predict' is requested without a predict head, or
+                without both `save_path` and `save_steps`.
+        """
+        save_types = save_type.split(',')
+        can_save = save_path is not None and save_steps is not None
+
+        save_predict = 'predict' in save_types
+        if save_predict:
+            assert self._pred_head is not None, "Predict head not found! You should call set_predict_head first if you want to save predict model."
+            assert can_save, 'save_path and save_steps is required to save model.'
+
+        save_ckpt = 'ckpt' in save_types and can_save
+        if 'ckpt' in save_types and not can_save:
+            # Checkpointing is the default save_type, so only warn instead of failing.
+            print("WARNING: save_path or save_steps is not set, model will not be saved during training.")
+
+        if (save_predict or save_ckpt) and not os.path.exists(save_path):
+            os.makedirs(save_path)
+
+        scope_prefix = self.name + '.'
+        time_begin = time.time()
+        for feed in iterator:
+            rt_outputs = self.train_one_step(feed)
+
+            # Hand the task head only its own outputs, with the scope prefix stripped.
+            task_rt_outputs = {k[len(scope_prefix):]: v for k, v in rt_outputs.items() if k.startswith(scope_prefix)}
+            self._task_head.postprocess(task_rt_outputs)
+
+            self._cur_train_step += 1
+            self._cur_train_epoch = (self._cur_train_step-1) // self._steps_pur_epoch
+
+            if print_steps > 0 and self._cur_train_step % print_steps == 0:
+                loss = rt_outputs[scope_prefix+'loss']
+                loss = np.mean(np.squeeze(loss)).tolist()
+
+                time_cost = time.time() - time_begin
+
+                print("step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
+                       (self._cur_train_step-1) % self._steps_pur_epoch + 1, self._steps_pur_epoch, self._cur_train_epoch,
+                       loss, print_steps / time_cost))
+                # Restart the clock so the speed covers only the next print window.
+                time_begin = time.time()
+
+            if (save_predict or save_ckpt) and self._cur_train_step % save_steps == 0:
+                step_tag = 'step' + str(self._cur_train_step)
+                if save_predict:
+                    self.save(save_path, suffix='pred.'+step_tag)
+                if save_ckpt:
+                    ckpt_path = os.path.join(save_path, 'ckpt.'+step_tag)
+                    fluid.io.save_persistables(self._exe, ckpt_path, self._train_prog)
+                    print('checkpoint has been saved at '+ckpt_path)
+
+    def train_one_step(self, batch):
+        """Run one training step on `batch` and return the fetched outputs by name.
+
+        Args:
+            batch: with more than one device, a `(feed, mask)` pair; otherwise a
+                raw batch that is converted by `self._feed_batch_process_fn`.
+
+        Returns:
+            dict mapping each name in `self._fetch_names` to its fetched value.
+        """
+        if gpu_dev_count > 1:
+            feed, mask = batch
+            # Use the same executor attribute as the single-device path.
+            rt_outputs = self._exe.run(self._distribute_train_prog, feed=feed, fetch_list=self._fetch_list)
+            # Drop one trailing output for every trailing False entry in the mask.
+            while mask.pop() == False:
+                rt_outputs.pop()
+        else:
+            feed = self._feed_batch_process_fn(batch)
+            rt_outputs = self._exe.run(self._distribute_train_prog, feed=feed, fetch_list=self._fetch_list)
+
+        return dict(zip(self._fetch_names, rt_outputs))
+        
+
+    def _build_head(self, net_inputs, phase, scope=""):
+        """Build the head for `phase` and return its output variables.
+
+        Args:
+            net_inputs: inputs forwarded to the head's `build`.
+            phase: 'train' builds `self._task_head`; 'pred' builds
+                `self._pred_head` and records its fetch names and variables.
+            scope: passed to `build` as `scope_name`.
+
+        Raises:
+            ValueError: if `phase` is neither 'train' nor 'pred'.
+        """
+        if phase == 'train':
+            return self._task_head.build(net_inputs, scope_name=scope)
+        if phase != 'pred':
+            # Previously an unknown phase fell through to an unbound local.
+            raise ValueError("unsupported phase: {}".format(phase))
+
+        output_vars = self._pred_head.build(net_inputs, scope_name=scope)
+        if output_vars:
+            self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items())
+        else:
+            # None or an empty mapping: nothing to fetch (zip(*[]) cannot be unpacked).
+            self._pred_fetch_name_list = []
+            self._pred_fetch_var_list = []
+        return output_vars
+
+    def _postprocess(self, rt_outputs, phase):
+        """Delegate per-step postprocessing to the task layer registered for `phase`."""
+        # NOTE(review): reads self._task_layer, while train() uses self._task_head --
+        # confirm this attribute still exists.
+        return self._task_layer[phase].postprocess(rt_outputs)
+
+    def _epoch_postprocess(self, epoch_inputs, phase):
+        """Delegate per-epoch postprocessing to the task layer registered for `phase`."""
+        # NOTE(review): reads self._task_layer, while train() uses self._task_head --
+        # confirm this attribute still exists.
+        return self._task_layer[phase].epoch_postprocess(epoch_inputs)
+    
+    def save(self, save_path, suffix=None):
+        """Save the inference model and a `__conf__` JSON file.
+
+        Args:
+            save_path: base directory for the saved model.
+            suffix: optional sub-directory name joined onto `save_path`.
+        """
+        # dirpath = save_path.rstrip('/').rstrip('\\') + suffix
+        if suffix is not None:
+            dirpath = os.path.join(save_path, suffix)
+        else:
+            dirpath = save_path
+        self._pred_input_varname_list = [str(i) for i in self._pred_input_varname_list]
+
+        prog = fluid.default_main_program().clone()
+        fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe, prog)
+
+        # Each protocol value is a source-code expression string. It is evaluated
+        # against a snapshot of this method's locals (so `self` is visible) and the
+        # result is stored under the protocol key.
+        # NOTE(review): exec on protocol strings -- confirm they never come from
+        # untrusted input.
+        conf = {}
+        for k, strv in self._save_protocol.items(): 
+            d = None
+            v = locals()
+            exec('d={}'.format(strv), globals(), v)
+            conf[k] = v['d']
+        with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
+            writer.write(json.dumps(conf, indent=1))
+        print(self._name + ': predict model saved at ' + dirpath)
+
+    def _load(self, infer_model_path=None):
+        """Load a saved inference model and restore the values stored in `__conf__`.
+
+        Args:
+            infer_model_path: directory of the saved model; defaults to
+                `self._save_infermodel_path`.
+
+        Returns:
+            The program returned by `fluid.io.load_inference_model`.
+        """
+        if infer_model_path is None:
+            infer_model_path = self._save_infermodel_path
+        # Read the config inside a context manager so the file handle is closed.
+        with open(os.path.join(infer_model_path, '__conf__')) as reader:
+            conf = json.load(reader)
+        for k, v in conf.items():
+            strv = self._save_protocol[k]
+            # `strv` is used as an assignment target; exec binds the loaded `v` to it.
+            # NOTE(review): exec on protocol strings -- confirm they are trusted.
+            exec('{}=v'.format(strv))
+        pred_prog, self._pred_input_varname_list, self._pred_fetch_var_list = \
+            fluid.io.load_inference_model(infer_model_path, self._exe)
+        print(self._name+': inference model loaded from ' + infer_model_path)
+        return pred_prog
+
+    @property
+    def name(self):
+        """Return the stored `_name`."""
+        return self._name
+
+    @property
+    def num_examples(self):
+        """Return the stored `_num_examples`."""
+        return self._num_examples
+
+    # @property
+    # def _pred_input(self):
+    #     return zip(*[self._pred_input_name_list, self._pred_input_varname_list])
+
+    # @_pred_input.setter
+    # def _pred_input(self, val):
+    #     assert isinstance(val, dict)
+    #     self._pred_input_name_list, self._pred_input_varname_list = \
+    #         zip(*[[k, v.name] for k,v in val.items()])
+
+    # @property
+    # def _pred_fetch_list(self):
+    #     return [self._pred_fetch_name_list, self._pred_fetch_var_list]
+
+    @property
+    def mix_ratio(self):
+        """Return the mix ratio; raise ValueError while it is still None."""
+        ratio = self._mix_ratio
+        if ratio is None:
+            raise ValueError("{}: mix_ratio is None".format(self._name))
+        return ratio
+
+    @mix_ratio.setter
+    def mix_ratio(self, value):
+        """Store `value` as a float, reporting the change when verbose."""
+        self._mix_ratio = float(value)
+        if not self._verbose:
+            return
+        print('{}: mix_ratio is set to {}'.format(self._name, self._mix_ratio))
+
+    @property
+    def save_infermodel_every_n_steps(self):
+        """Return the stored `_save_infermodel_every_n_steps`."""
+        return self._save_infermodel_every_n_steps
+
+    @save_infermodel_every_n_steps.setter
+    def save_infermodel_every_n_steps(self, val):
+        """Store `val` unchanged."""
+        self._save_infermodel_every_n_steps = val
+
+    @property
+    def expected_train_steps(self):
+        """Return the stored `_expected_train_steps`."""
+        return self._expected_train_steps
+
+    @expected_train_steps.setter
+    def expected_train_steps(self, value):
+        """Store `value` and derive the (possibly fractional) expected epoch count."""
+        self._expected_train_steps = value
+        # float() forces true division, so the epoch count may be fractional.
+        self._expected_train_epochs = value / float(self._steps_pur_epoch)
+
+    @property
+    def expected_train_epochs(self):
+        """Return the epoch count derived by the `expected_train_steps` setter."""
+        return self._expected_train_epochs
+
+    @property
+    def cur_train_epoch(self):
+        """Return the stored `_cur_train_epoch`."""
+        return self._cur_train_epoch
+
+    @property
+    def cur_train_step(self):
+        """Return the stored `_cur_train_step`."""
+        return self._cur_train_step
+
+    # @cur_train_step.setter
+    # def _cur_train_step(self, value):
+    #     self._cur_train_step = value
+    #     if self._cur_train_step > self._steps_pur_epoch:
+    #         self._cur_train_epoch += 1
+    #         self._cur_train_step = 1
+    #     if self._is_target and self._cur_train_step + self._cur_train_epoch * self._steps_pur_epoch >= self._expected_train_steps:
+    #         self._train_finish = True
+
+    @property
+    def steps_pur_epoch(self):
+        """Return the stored `_steps_pur_epoch`."""
+        return self._steps_pur_epoch
+
+    @steps_pur_epoch.setter
+    def steps_pur_epoch(self, value):
+        """Store `value` unchanged."""
+        self._steps_pur_epoch = value
+
+    @property
+    def train_finish(self):
+        """Return the stored `_train_finish` flag."""
+        return self._train_finish
+
+    def tasklayer_reuse_with(self, task):
+        """Record `task.name` as this instance's task reuse scope.
+
+        Raises:
+            Exception: if this instance has already been locked via `_set_lock`.
+        """
+        # NOTE(review): `Task` is not defined in the visible part of this file --
+        # confirm the name is in scope.
+        assert isinstance(task, Task)
+        if self._lock:
+            raise Exception('you can only set tasklayer reuses BEFORE Controller created.')
+        self._task_reuse_scope = task.name
+    
+    def _set_lock(self):
+        """Set the lock flag; `tasklayer_reuse_with` raises once it is set."""
+        self._lock = True
+
--- a/paddlepalm/utils/.saver.py.swp
+++ b/paddlepalm/utils/.saver.py.swp
--- a/paddlepalm/utils/__init__.py
+++ b/paddlepalm/utils/__init__.py
+
+import basic_helper
+import config_helper
+
--- a/paddlepalm/utils/basic_helper.py
+++ b/paddlepalm/utils/basic_helper.py
+# coding=utf-8
+import json
+import logging
+import os
+
+import yaml
+from paddle import fluid
+
+from config_helper import PDConfig
+
+def get_basename(f):
+    """Return the path `f` with its final extension removed."""
+    root, _ext = os.path.splitext(f)
+    return root
+
+
+def get_suffix(f):
+    """Return the final extension of `f` (including the dot), or '' if none."""
+    _root, ext = os.path.splitext(f)
+    return ext
+
+
+def parse_yaml(f, asdict=True, support_cmd_line=False):
+    """Load the YAML file `f`.
+
+    Args:
+        f: path to an existing YAML file.
+        asdict: return a dict; when False the `PDConfig` object is returned
+            (only supported together with `support_cmd_line`).
+        support_cmd_line: load through `PDConfig` with `fuse_args=True`.
+
+    Raises:
+        AssertionError: if `f` does not exist.
+        NotImplementedError: if `asdict` is False without `support_cmd_line`.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if support_cmd_line:
+        args = PDConfig(yaml_file=f, fuse_args=True)
+        args.build()
+        if asdict:
+            return args.asdict()
+        return args
+    if not asdict:
+        raise NotImplementedError()
+    with open(f, "r") as fin:
+        return yaml.load(fin, Loader=yaml.SafeLoader)
+
+
+def parse_json(f, asdict=True, support_cmd_line=False):
+    """Load the JSON file `f`.
+
+    Args:
+        f: path to an existing JSON file.
+        asdict: return the parsed content; when False the `PDConfig` object is
+            returned (only supported together with `support_cmd_line`).
+        support_cmd_line: load through `PDConfig`, forwarding this flag as
+            `fuse_args`.
+
+    Raises:
+        AssertionError: if `f` does not exist.
+        NotImplementedError: if `asdict` is False without `support_cmd_line`.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if support_cmd_line:
+        args = PDConfig(json_file=f, fuse_args=support_cmd_line)
+        args.build()
+        if asdict:
+            return args.asdict()
+        return args
+    if not asdict:
+        raise NotImplementedError()
+    with open(f, "r") as fin:
+        return json.load(fin)
+            
+
+def parse_list(string, astype=str):
+    """Split a comma-separated string and convert each item with `astype`.
+
+    A string without any comma is converted as a single item, untouched.
+    Otherwise commas are treated like whitespace before splitting.
+    """
+    assert isinstance(string, str), "{} is not a string.".format(string)
+    if ',' in string:
+        tokens = string.replace(',', ' ').split()
+    else:
+        tokens = [string]
+    return [astype(tok) for tok in tokens]
+
+
+def try_float(s):
+    """Return `float(s)` when `s` is convertible, otherwise `s` unchanged."""
+    try:
+        return float(s)
+    except (TypeError, ValueError):
+        # Not a number (e.g. an arbitrary string or None): hand the input back.
+        return s
+
+
+# TODO: add a None mechanism that allows hidden size, batch size and seqlen to be set to None
+def check_io(in_attr, out_attr, strict=False, in_name="left", out_name="right"):
+    """Check that every entry of `in_attr` is present in `out_attr` and equal.
+
+    Args:
+        in_attr: mapping of names to attributes that are required.
+        out_attr: mapping of names to attributes that are provided.
+        strict: raise on a mismatch instead of logging a warning.
+        in_name: label for `in_attr` in messages.
+        out_name: label for `out_attr` in messages.
+
+    Raises:
+        AssertionError: if a name in `in_attr` is missing from `out_attr`.
+        ValueError: if `strict` is set and an attribute differs.
+    """
+    for name, attr in in_attr.items():
+        assert name in out_attr, in_name+': '+name+' not found in '+out_name
+        if attr == out_attr[name]:
+            continue
+        if strict:
+            raise ValueError(name+': shape or dtype not consistent!')
+        logging.warning('{}: shape or dtype not consistent!\n{}:\n{}\n{}:\n{}'.format(name, in_name, attr, out_name, out_attr[name]))
+
+def encode_inputs(inputs, scope_name, sep='.', cand_set=None):
+    """Prefix input names with `scope_name + sep`.
+
+    Without `cand_set` every key is prefixed. With `cand_set`, a value is kept
+    under its plain key and/or its prefixed key, whichever appear in `cand_set`.
+    """
+    if cand_set is None:
+        return {scope_name + sep + key: val for key, val in inputs.items()}
+
+    encoded = {}
+    for key, val in inputs.items():
+        for candidate in (key, scope_name + sep + key):
+            if candidate in cand_set:
+                encoded[candidate] = val
+    return encoded
+
+
+def decode_inputs(inputs, scope_name, sep='.', keep_unk_keys=True):
+    """Select the inputs that belong to `scope_name` and strip their prefix.
+
+    Args:
+        inputs: mapping of (possibly scoped) names to values.
+        scope_name: scope whose entries are extracted.
+        sep: separator between scope and name; also used to match the prefix.
+        keep_unk_keys: also keep names that contain no separator at all.
+
+    Returns:
+        dict of the selected values, with scoped names stripped of their prefix.
+    """
+    prefix = scope_name + sep
+    outputs = {}
+    for name, value in inputs.items():
+        # var for backbone are also available to tasks
+        if keep_unk_keys and sep not in name:
+            outputs[name] = value
+        # var for this inst
+        if name.startswith(prefix):
+            outputs[name[len(prefix):]] = value
+    return outputs
+
+
+def build_executor(on_gpu):
+    """Return a `fluid.Executor` bound to CUDA device 0 or to the CPU."""
+    place = fluid.CUDAPlace(0) if on_gpu else fluid.CPUPlace()
+    return fluid.Executor(place)
+
+
+def fit_attr(conf, fit_attr, strict=False):
+    """Convert the values of `conf` in place with the callables in `fit_attr`.
+
+    Keys of `fit_attr` that are absent from `conf` are skipped, unless `strict`
+    is set, in which case an Exception is raised. Returns `conf`.
+    """
+    for key, caster in fit_attr.items():
+        if key in conf:
+            conf[key] = caster(conf[key])
+        elif strict:
+            raise Exception('Argument {} is required to create a controller.'.format(key))
+    return conf
--- a/paddlepalm/utils/reader_helper.py
+++ b/paddlepalm/utils/reader_helper.py
@@ -22,6 +22,19 @@ from paddle import fluid
 from paddle.fluid import layers


+def create_feed_batch_process_fn(net_inputs):
+    """Build a function that re-keys a reader batch by feed variable name."""
+
+    def feed_batch_process_fn(data):
+        # net_inputs values are either names (str/unicode) or objects with .name
+        feed = {}
+        for key, var in net_inputs.items():
+            is_name = isinstance(var, str) or isinstance(var, unicode)
+            feed[var if is_name else var.name] = data[key]
+        return feed
+
+    return feed_batch_process_fn
+
 def _check_and_adapt_shape_dtype(rt_val, attr, message=""):
    if not isinstance(rt_val, np.ndarray):
        rt_val = np.array(rt_val)
@@ -78,31 +91,40 @@ def create_net_inputs(input_attrs, async=False, iterator_fn=None, dev_count=1, n
    return ret


-def create_iterator_fn(iterator, iterator_prefix, shape_and_dtypes, outname_to_pos, verbose=0):
+def create_iterator_fn(iterator, iterator_prefix, shape_and_dtypes, outname_to_pos, verbose=0, return_type='list'):

-    def iterator():
+    pos_to_outname = {j:i for i,j in outname_to_pos.items()}
+    
+    def iterator_fn():
        v = verbose
        while True:
-            results = _zero_batch(shape_and_dtypes)
+            # results = _zero_batch(shape_and_dtypes)
+            results = [None] * len(outname_to_pos)

            outputs = next(iterator) # dict type
-            prefix = iterator_prefixe
+            prefix = iterator_prefix
            for outname, val in outputs.items():
-                task_outname = prefix + '/' + outname
+                task_outname = prefix + '.' + outname

                if outname in outname_to_pos:
                    idx = outname_to_pos[outname]
-                    val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
+                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
                    results[idx] = val

                if task_outname in outname_to_pos:
                    idx = outname_to_pos[task_outname]
-                    val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
+                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
                    results[idx] = val
+            if return_type == 'list':
+                yield results
+            elif return_type == 'dict':
+                temp = {}
+                for pos, i in enumerate(results):
+                    temp[pos_to_outname[pos]] = i

-            yield results
+                yield temp

-    return iterator
+    return iterator_fn


 def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtypes, mrs, outname_to_pos, dev_count=1, keep_one_task=True, verbose=0):
@@ -122,7 +144,7 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
        outbuf[id] = outputs
        prefix = iterator_prefixes[id]
        for outname, val in outputs.items():
-            task_outname = prefix + '/' + outname
+            task_outname = prefix + '.' + outname

            if outname in outname_to_pos:
                idx = outname_to_pos[outname]
@@ -179,7 +201,7 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
                for outname, val in outputs.items():
                    if v > 0:
                        print('reader generate: '+outname)
-                    task_outname = prefix + '/' + outname
+                    task_outname = prefix + '.' + outname

                    if outname in outname_to_pos:
                        idx = outname_to_pos[outname]

--- a/paddlepalm/utils/saver.py
+++ b/paddlepalm/utils/saver.py
@@ -55,7 +55,7 @@ def init_pretraining_params(exe,
    print("Loading pretraining parameters from {}...".format(
        pretraining_params_path))

-    with tarfile.open(os.path.join(pretraining_params_path, '__palmmodel__'), 'r:') as f:
+    with tarfile.open(os.path.join(pretraining_params_path, '__palmmodel__'), 'r') as f:
        f.extractall(os.path.join(pretraining_params_path, '.temp'))
    
    log_path = os.path.join(pretraining_params_path, '__palmmodel__')

--- a/reader/__init__.py
+++ b/reader/__init__.py
--- a/reader/cls.py
+++ b/reader/cls.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import ClassifyReader
+
+class Reader(reader):
+    """Reader for classification data, backed by `ClassifyReader`."""
+
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """Configure the reader from `config` for the given phase.
+
+        Args:
+            config: mapping of settings; 'vocab_path', 'max_seq_len',
+                'batch_size', 'n_classes' and the phase's input file key
+                ('train_file', 'dev_file' or 'pred_file') are required.
+            phase: train, eval, pred
+            dev_count: forwarded to the data generator in `load_data`.
+            print_prefix: not used by this reader.
+        """
+        self._is_training = phase == 'train'
+
+        self._reader = ClassifyReader(
+            config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            for_cn=config.get('for_cn', False),
+            random_seed=config.get('seed', None))
+        self._dev_count = dev_count
+
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+        self._num_classes = config['n_classes']
+
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # None keeps the iterator from terminating.
+            self._num_epochs = None
+            self._shuffle = config.get('shuffle', True)
+        elif phase in ('eval', 'pred'):
+            file_key = 'dev_file' if phase == 'eval' else 'pred_file'
+            self._input_file = config[file_key]
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+
+        self._phase = phase
+        self._print_first_n = config.get('print_first_n', 0)
+
+    @property
+    def outputs_attr(self):
+        """Return the attribute list of every batch output, keyed by name."""
+        if not self._is_training:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32']
+                    }
+        # Training additionally exposes the labels.
+        return {"token_ids": [[-1, -1, 1], 'int64'],
+                "position_ids": [[-1, -1, 1], 'int64'],
+                "segment_ids": [[-1, -1, 1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32'],
+                "label_ids": [[-1,1], 'int64'],
+                "task_ids": [[-1, -1, 1], 'int64']
+                }
+
+    def load_data(self):
+        """Create the batch generator for the configured input file."""
+        self._data_generator = self._reader.data_generator(
+            self._input_file,
+            self._batch_size,
+            self._num_epochs,
+            dev_count=self._dev_count,
+            shuffle=self._shuffle,
+            phase=self._phase)
+
+    def iterator(self):
+        """Yield each generated batch as a dict keyed by output name."""
+        field_names = ('token_ids', 'segment_ids', 'position_ids', 'task_ids',
+                       'input_mask', 'label_ids', 'unique_ids')
+
+        def to_named(batch):
+            named = dict(zip(field_names, batch))
+            del named['unique_ids']
+            # Labels are only exposed while training.
+            if not self._is_training:
+                del named['label_ids']
+            return named
+
+        for batch in self._data_generator():
+            yield to_named(batch)
+
+    def get_epoch_outputs(self):
+        """Return the underlying reader's examples and features for this phase."""
+        examples = self._reader.get_examples(self._phase)
+        features = self._reader.get_features(self._phase)
+        return {'examples': examples,
+                'features': features}
+
+    @property
+    def num_examples(self):
+        """Return the underlying reader's example count for this phase."""
+        return self._reader.get_num_examples(phase=self._phase)
+
--- a/reader/match.py
+++ b/reader/match.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import ClassifyReader
+
+def match(vocab_path, max_seq_len, do_lower_case=True, phase='train', dev_count=1, **extra_config):
+    """Build a `Reader` from keyword settings instead of a config dict.
+
+    Args:
+        vocab_path: stored as config['vocab_path'].
+        max_seq_len: stored as config['max_seq_len'].
+        do_lower_case: stored as config['do_lower_case'].
+        phase: train, eval, pred; forwarded to `Reader`.
+        dev_count: forwarded to `Reader`.
+        **extra_config: further config entries. `Reader` also reads
+            'batch_size' and the phase's input file key ('train_file',
+            'dev_file' or 'pred_file'), so those must be supplied here.
+
+    Returns:
+        The constructed `Reader`.
+    """
+    # `phase` needs a default because it follows a defaulted parameter.
+    config = dict(extra_config)
+    config.update({
+        'vocab_path': vocab_path,
+        'max_seq_len': max_seq_len,
+        'do_lower_case': do_lower_case,
+    })
+    return Reader(config, phase=phase, dev_count=dev_count)
+
+class Reader(reader):
+    """Reader for matching data, backed by `ClassifyReader`."""
+
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """Configure the reader from `config` for the given phase.
+
+        Args:
+            config: mapping of settings; 'vocab_path', 'max_seq_len',
+                'batch_size' and the phase's input file key ('train_file',
+                'dev_file' or 'pred_file') are required.
+            phase: train, eval, pred
+            dev_count: forwarded to the data generator in `load_data`.
+            print_prefix: not used by this reader.
+        """
+        self._is_training = phase == 'train'
+
+        self._reader = ClassifyReader(
+            config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', True),
+            for_cn=config.get('for_cn', False),
+            random_seed=config.get('seed', None))
+        self._dev_count = dev_count
+
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # None keeps the iterator from terminating.
+            self._num_epochs = None
+            self._shuffle = config.get('shuffle', True)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        elif phase in ('eval', 'pred'):
+            file_key = 'dev_file' if phase == 'eval' else 'pred_file'
+            self._input_file = config[file_key]
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+
+        self._phase = phase
+        self._print_first_n = config.get('print_first_n', 1)
+
+    @property
+    def outputs_attr(self):
+        """Return the attribute list of every batch output, keyed by name."""
+        if not self._is_training:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32']
+                    }
+        # Training additionally exposes the labels.
+        return {"token_ids": [[-1, -1, 1], 'int64'],
+                "position_ids": [[-1, -1, 1], 'int64'],
+                "segment_ids": [[-1, -1, 1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32'],
+                "label_ids": [[-1,1], 'int64'],
+                "task_ids": [[-1, -1, 1], 'int64']
+                }
+
+    def load_data(self):
+        """Create the batch generator for the configured input file."""
+        self._data_generator = self._reader.data_generator(
+            self._input_file,
+            self._batch_size,
+            self._num_epochs,
+            dev_count=self._dev_count,
+            shuffle=self._shuffle,
+            phase=self._phase)
+
+    def iterator(self):
+        """Yield each generated batch as a dict keyed by output name."""
+        field_names = ('token_ids', 'segment_ids', 'position_ids', 'task_ids',
+                       'input_mask', 'label_ids', 'unique_ids')
+
+        def to_named(batch):
+            named = dict(zip(field_names, batch))
+            del named['unique_ids']
+            # Labels are only exposed while training.
+            if not self._is_training:
+                del named['label_ids']
+            return named
+
+        for batch in self._data_generator():
+            yield to_named(batch)
+
+    @property
+    def num_examples(self):
+        """Return the underlying reader's example count for this phase."""
+        return self._reader.get_num_examples(phase=self._phase)
+
--- a/reader/mlm.py
+++ b/reader/mlm.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import MaskLMReader
+import numpy as np
+
+class Reader(reader):
+    """Reader for masked-language-model data, backed by `MaskLMReader`."""
+
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """Configure the reader from `config` for the given phase.
+
+        Args:
+            config: mapping of settings; 'vocab_path', 'max_seq_len',
+                'batch_size' and the phase's input file key ('train_file',
+                'dev_file' or 'pred_file') are required.
+            phase: train, eval, pred
+            dev_count: forwarded to the data generator in `load_data`.
+            print_prefix: not used by this reader.
+        """
+        self._is_training = phase == 'train'
+
+        self._reader = MaskLMReader(
+            config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            for_cn=config.get('for_cn', False),
+            random_seed=config.get('seed', None))
+        self._dev_count = dev_count
+
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # None keeps the iterator from terminating.
+            self._num_epochs = None
+            self._shuffle = config.get('shuffle', True)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        elif phase in ('eval', 'pred'):
+            file_key = 'dev_file' if phase == 'eval' else 'pred_file'
+            self._input_file = config[file_key]
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+
+        self._phase = phase
+        self._print_first_n = config.get('print_first_n', 1)
+
+    @property
+    def outputs_attr(self):
+        """Return the attribute list of every batch output, keyed by name."""
+        attrs = {"token_ids": [[-1, -1, 1], 'int64'],
+                 "position_ids": [[-1, -1, 1], 'int64'],
+                 "segment_ids": [[-1, -1, 1], 'int64'],
+                 "input_mask": [[-1, -1, 1], 'float32'],
+                 "task_ids": [[-1, -1, 1], 'int64'],
+                 "mask_label": [[-1, 1], 'int64'],
+                 "mask_pos": [[-1, 1], 'int64'],
+                 }
+        return attrs
+
+    def load_data(self):
+        """Create the batch generator for the configured input file."""
+        self._data_generator = self._reader.data_generator(
+            self._input_file,
+            self._batch_size,
+            self._num_epochs,
+            dev_count=self._dev_count,
+            shuffle=self._shuffle,
+            phase=self._phase)
+
+    def iterator(self):
+        """Yield each generated batch as a dict keyed by output name."""
+        field_names = ('token_ids', 'position_ids', 'segment_ids', 'input_mask',
+                       'task_ids', 'mask_label', 'mask_pos')
+
+        for batch in self._data_generator():
+            yield dict(zip(field_names, batch))
+
+    def get_epoch_outputs(self):
+        """Return the underlying reader's examples and features for this phase."""
+        examples = self._reader.get_examples(self._phase)
+        features = self._reader.get_features(self._phase)
+        return {'examples': examples,
+                'features': features}
+
+    @property
+    def num_examples(self):
+        """Return the underlying reader's example count for this phase."""
+        return self._reader.get_num_examples(phase=self._phase)
+
--- a/reader/mrc.py
+++ b/reader/mrc.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import MRCReader
+
+class Reader(reader):
+    
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """
+        Args:
+            phase: train, eval, pred
+            """
+
+        self._is_training = phase == 'train'
+
+        reader = MRCReader(config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            tokenizer='FullTokenizer',
+            for_cn=config.get('for_cn', False),
+            doc_stride=config['doc_stride'],
+            max_query_length=config['max_query_len'],
+            random_seed=config.get('seed', None))
+        self._reader = reader
+        self._dev_count = dev_count
+
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # self._num_epochs = config['num_epochs']
+            self._num_epochs = None # 防止iteartor终止
+            self._shuffle = config.get('shuffle', True)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        if phase == 'eval':
+            self._input_file = config['dev_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        elif phase == 'pred':
+            self._input_file = config['pred_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+
+        self._phase = phase
+        # self._batch_size = 
+        self._print_first_n = config.get('print_first_n', 1)
+
+        # TODO: without slide window version
+        self._with_slide_window = config.get('with_slide_window', False)
+
+
+    @property
+    def outputs_attr(self):
+        """Describe the shape and dtype of every field yielded per batch.
+
+        Returns:
+            dict mapping a field name to ``[shape, dtype]``. When
+            ``self._is_training`` is true the span labels ``start_positions``
+            and ``end_positions`` are listed; otherwise ``unique_ids`` is
+            listed in their place.
+        """
+
+        def rank3(dtype):
+            # Spec with two dynamic (-1) leading dims and a trailing dim of 1.
+            return [[-1, -1, 1], dtype]
+
+        def rank2():
+            # int64 spec with one dynamic (-1) leading dim and a trailing 1.
+            return [[-1, 1], 'int64']
+
+        if not self._is_training:
+            return {"token_ids": rank3('int64'),
+                    "position_ids": rank3('int64'),
+                    "segment_ids": rank3('int64'),
+                    "task_ids": rank3('int64'),
+                    "input_mask": rank3('float32'),
+                    "unique_ids": rank2()
+                    }
+
+        return {"token_ids": rank3('int64'),
+                "position_ids": rank3('int64'),
+                "segment_ids": rank3('int64'),
+                "input_mask": rank3('float32'),
+                "start_positions": rank2(),
+                "end_positions": rank2(),
+                "task_ids": rank3('int64')
+                }
+
+    @property
+    def epoch_outputs_attr(self):
+        """Describe the per-epoch outputs of this reader.
+
+        Returns:
+            ``None`` while training; otherwise a dict with the keys
+            ``examples`` and ``features``, both mapped to ``None``.
+        """
+        if self._is_training:
+            return None
+        return {"examples": None,
+                "features": None}
+
+    def load_data(self):
+        """Ask the underlying reader for a batch generator and keep it.
+
+        The result of ``self._reader.data_generator`` is stored in
+        ``self._data_generator`` for later use by ``iterator``.
+        """
+        generator_options = {
+            'dev_count': self._dev_count,
+            'shuffle': self._shuffle,
+            'phase': self._phase,
+        }
+        self._data_generator = self._reader.data_generator(
+            self._input_file,
+            self._batch_size,
+            self._num_epochs,
+            **generator_options)
+
+    def iterator(self):
+        """Yield each batch from ``self._data_generator`` as a name->value dict.
+
+        Batch items are paired positionally with a fixed list of field names.
+        While training, ``unique_ids`` is removed from the dict; otherwise
+        ``start_positions`` and ``end_positions`` are removed.
+        """
+        field_names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids',
+                       'input_mask', 'start_positions', 'end_positions',
+                       'unique_ids']
+
+        for batch in self._data_generator():
+            named = dict(zip(field_names, batch))
+            # Drop the fields that do not apply to the current mode.
+            if self._is_training:
+                unwanted = ('unique_ids',)
+            else:
+                unwanted = ('start_positions', 'end_positions')
+            for key in unwanted:
+                del named[key]
+            yield named
+
+    def get_epoch_outputs(self):
+        """Return the reader's examples and features for the current phase.
+
+        Returns:
+            dict with the keys ``examples`` and ``features``, filled from
+            ``self._reader.get_examples`` and ``self._reader.get_features``.
+        """
+        phase = self._phase
+        examples = self._reader.get_examples(phase)
+        features = self._reader.get_features(phase)
+        return {'examples': examples, 'features': features}
+
+    @property
+    def num_examples(self):
+        """Number of examples the reader reports for the current phase."""
+        example_count = self._reader.get_num_examples(phase=self._phase)
+        return example_count
+
--- a/reader/utils/__init__.py
+++ b/reader/utils/__init__.py
--- a/reader/utils/batching4bert.py
+++ b/reader/utils/batching4bert.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+
+
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """Randomly mask a batch of token-id sequences for masked-LM training.
+
+    Every token draws a uniform random number ``p``. Tokens other than
+    ``CLS`` / ``SEP`` are then treated as follows:
+
+    * ``0.03 < p <= 0.15``: replaced by ``MASK``;
+    * ``0.015 < p <= 0.03``: replaced by a random id in ``[1, vocab_size)``;
+    * ``p <= 0.015``: kept unchanged but still recorded as a target;
+    * ``p > 0.15``: ignored.
+
+    If nothing in a sentence was masked or replaced, one random interior
+    token (index 1 .. len - 2) that is neither ``CLS`` nor ``SEP`` is
+    force-masked. Sentences without such a token are left untouched.
+
+    Args:
+        batch_tokens: list of mutable token-id sequences; modified in place.
+        total_token_num: number of random draws to make; must be at least
+            the total number of tokens in ``batch_tokens``.
+        vocab_size: exclusive upper bound for random replacement ids.
+        CLS, SEP, MASK: ids of the special tokens.
+
+    Returns:
+        ``(batch_tokens, mask_label, mask_pos)``. ``mask_label`` holds the
+        original ids of the recorded tokens and ``mask_pos`` holds
+        ``sent_index * max_len + token_index`` for each of them, where
+        ``max_len`` is the longest sentence in this batch. Both are int64
+        arrays reshaped to ``[-1, 1]``.
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        # Offset of this sentence's first token inside the flat random arrays.
+        prob_index += pre_sent_len
+        for token_index, token in enumerate(sent):
+            prob = prob_mask[prob_index + token_index]
+            if prob > 0.15:
+                continue
+            elif 0.03 < prob <= 0.15:
+                # mask
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = MASK
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            elif 0.015 < prob <= 0.03:
+                # random replace
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = replace_ids[prob_index + token_index]
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            else:
+                # keep the original token
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    mask_pos.append(sent_index * max_len + token_index)
+        pre_sent_len = len(sent)
+        # Ensure at least one word is masked in every sentence. The forced
+        # mask samples indices 1 .. len(sent) - 2; when that range holds no
+        # maskable token the rejection loop would either raise (empty range)
+        # or spin forever, so it is skipped in that case.
+        has_candidate = any(tok != SEP and tok != CLS for tok in sent[1:-1])
+        while has_candidate and not mask_flag:
+            # Index the size-1 result explicitly: int() on an array is
+            # deprecated in recent NumPy releases.
+            token_index = int(
+                np.random.randint(1, high=len(sent) - 1, size=1)[0])
+            if sent[token_index] != SEP and sent[token_index] != CLS:
+                mask_label.append(sent[token_index])
+                sent[token_index] = MASK
+                mask_flag = True
+                mask_pos.append(sent_index * max_len + token_index)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+                       total_token_num,
+                       max_len=None,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       return_input_mask=True,
+                       return_max_len=True,
+                       return_num_token=False):
+    """Turn a list of instances into padded numpy batch arrays.
+
+    Args:
+        insts: list of instances. Items 0, 1 and 2 of each instance are its
+            token ids, sentence-type ids and position ids; every further
+            item is an integer label (e.g. start/end positions or a unique
+            id) and is collected into an int64 array of shape ``[-1, 1]``.
+        total_token_num: forwarded to ``mask`` when masking is enabled.
+        max_len: padding length; ``None`` pads to the longest instance.
+        voc_size, cls_id, sep_id: forwarded to ``mask``.
+        pad_id: id used to pad token, position and sentence-type ids.
+        mask_id: id of the mask token. Masking is applied only when this
+            is a non-negative number; ``None`` or a negative value
+            disables it.
+        return_input_mask, return_max_len, return_num_token: accepted for
+            interface compatibility; not consulted by this function.
+
+    Returns:
+        ``[src_id, pos_id, sent_id, input_mask] + labels`` without masking,
+        or ``[src_id, pos_id, sent_id, input_mask, mask_label, mask_pos]
+        + labels`` with masking.
+    """
+    batch_src_ids = [inst[0] for inst in insts]
+    batch_sent_ids = [inst[1] for inst in insts]
+    batch_pos_ids = [inst[2] for inst in insts]
+    labels_list = []
+    # compatible with mrqa, whose example includes start/end positions,
+    # or unique id
+    for i in range(3, len(insts[0]), 1):
+        labels = [inst[i] for inst in insts]
+        labels = np.array(labels).astype("int64").reshape([-1, 1])
+        labels_list.append(labels)
+    # `None >= 0` raises TypeError on Python 3, so the default mask_id=None
+    # must be tested explicitly; it means "no masking".
+    do_mask = mask_id is not None and mask_id >= 0
+    # First step: do mask without padding
+    if do_mask:
+        out, mask_label, mask_pos = mask(
+            batch_src_ids,
+            total_token_num,
+            vocab_size=voc_size,
+            CLS=cls_id,
+            SEP=sep_id,
+            MASK=mask_id)
+    else:
+        out = batch_src_ids
+    # Second step: padding
+    src_id, self_input_mask = pad_batch_data(
+        out,
+        max_len=max_len,
+        pad_idx=pad_id, return_input_mask=True)
+    pos_id = pad_batch_data(
+        batch_pos_ids,
+        max_len=max_len,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    sent_id = pad_batch_data(
+        batch_sent_ids,
+        max_len=max_len,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    if do_mask:
+        return_list = [
+            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
+        ] + labels_list
+    else:
+        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+    return return_list if len(return_list) > 1 else return_list[0]
+
+
+def pad_batch_data(insts,
+                   max_len=None,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """Pad every instance to a common length and build optional extras.
+
+    Args:
+        insts: sequences of ids.
+        max_len: target length; ``None`` uses the longest instance.
+        pad_idx: value appended to short instances (also used to pad the
+            position data).
+        return_pos: also return per-token positions ``0 .. len - 1``.
+        return_input_mask: also return a float32 mask, 1 for real tokens
+            and 0 for padding.
+        return_max_len: also return ``max_len``.
+        return_num_token: also return the total number of unpadded tokens.
+
+    Returns:
+        The padded int64 id array of shape ``[-1, max_len, 1]`` when nothing
+        else is requested, otherwise a list starting with that array
+        followed by the requested extras in the order listed above.
+    """
+    if max_len is None:
+        max_len = max(len(inst) for inst in insts)
+
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    padded_rows = []
+    for inst in insts:
+        filler = [pad_idx] * (max_len - len(inst))
+        padded_rows.append(list(inst) + list(filler))
+    outputs = [np.array(padded_rows).astype("int64").reshape([-1, max_len, 1])]
+
+    if return_pos:
+        position_rows = []
+        for inst in insts:
+            positions = list(range(0, len(inst)))
+            position_rows.append(positions + [pad_idx] * (max_len - len(inst)))
+        outputs.append(
+            np.array(position_rows).astype("int64").reshape([-1, max_len, 1]))
+
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        mask_rows = []
+        for inst in insts:
+            mask_rows.append([1] * len(inst) + [0] * (max_len - len(inst)))
+        mask_array = np.expand_dims(np.array(mask_rows), axis=-1)
+        outputs.append(mask_array.astype("float32"))
+
+    if return_max_len:
+        outputs.append(max_len)
+
+    if return_num_token:
+        outputs.append(sum(len(inst) for inst in insts))
+
+    if len(outputs) > 1:
+        return outputs
+    return outputs[0]
+
+
+# Running this module as a script does nothing; it only provides helpers.
+if __name__ == "__main__":
+    pass
+
+
--- a/reader/utils/batching4ernie.py
+++ b/reader/utils/batching4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from six.moves import xrange
+
+
+def mask(batch_tokens,
+         seg_labels,
+         mask_word_tags,
+         total_token_num,
+         vocab_size,
+         CLS=1,
+         SEP=2,
+         MASK=3):
+    """
+    Randomly mask a batch of token-id sequences and record the targets.
+
+    Args:
+        batch_tokens: list of mutable token-id sequences; modified in place.
+        seg_labels: per-sentence, per-token labels read as
+            ``seg_labels[sent_index][token_index]``; the values 1 and -1 are
+            treated specially in the span branch below.
+        mask_word_tags: per-sentence flag; a truthy value selects the
+            span-based branch, a falsy value the per-token branch.
+        total_token_num: number of random draws made up front.
+        vocab_size: exclusive upper bound for random replacement ids.
+        CLS, SEP, MASK: special-token ids. CLS/SEP are only checked in the
+            per-token branch.
+
+    Returns:
+        (batch_tokens, mask_label, mask_pos); mask_label and mask_pos are
+        int64 arrays reshaped to [-1, 1], and each mask_pos entry is
+        ``sent_index * max_len + index`` with max_len the longest sentence
+        in this batch.
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        mask_word = mask_word_tags[sent_index]
+        # Offset of this sentence's first token inside the flat random arrays.
+        prob_index += pre_sent_len
+        if mask_word:
+            # Span branch: `beg` is the start index of the pending span
+            # (0 means "no span open").
+            beg = 0
+            for token_index, token in enumerate(sent):
+                seg_label = seg_labels[sent_index][token_index]
+                # Label 1 never closes a span; the token is simply skipped.
+                if seg_label == 1:
+                    continue
+                if beg == 0:
+                    # No span open yet: open one here unless the label is -1.
+                    if seg_label != -1:
+                        beg = token_index
+                    continue
+
+                # The span [beg, token_index) is selected when the draw of
+                # its first token is <= 0.15.
+                prob = prob_mask[prob_index + beg]
+                if prob > 0.15:
+                    pass
+                else:
+                    for index in xrange(beg, token_index):
+                        prob = prob_mask[prob_index + index]
+                        # The first token's draw is already known to be
+                        # <= 0.15, so its thresholds are scaled by 0.15; the
+                        # other tokens use their raw draw in (0, 1).
+                        base_prob = 1.0
+                        if index == beg:
+                            base_prob = 0.15
+                        if base_prob * 0.2 < prob <= base_prob:
+                            # replace with MASK
+                            mask_label.append(sent[index])
+                            sent[index] = MASK
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        elif base_prob * 0.1 < prob <= base_prob * 0.2:
+                            # replace with a random id
+                            mask_label.append(sent[index])
+                            sent[index] = replace_ids[prob_index + index]
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        else:
+                            # keep the token but still record it as a target
+                            mask_label.append(sent[index])
+                            mask_pos.append(sent_index * max_len + index)
+
+                # Label -1 closes the span without opening a new one; any
+                # other label starts the next span at the current token.
+                if seg_label == -1:
+                    beg = 0
+                else:
+                    beg = token_index
+        else:
+            # Per-token branch: each non-CLS/SEP token is handled on its own.
+            for token_index, token in enumerate(sent):
+                prob = prob_mask[prob_index + token_index]
+                if prob > 0.15:
+                    continue
+                elif 0.03 < prob <= 0.15:
+                    # mask
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = MASK
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                elif 0.015 < prob <= 0.03:
+                    # random replace
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = replace_ids[prob_index +
+                                                        token_index]
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                else:
+                    # keep the original token
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        mask_pos.append(sent_index * max_len + token_index)
+
+        pre_sent_len = len(sent)
+
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+
+
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False,
+                   return_seq_lens=False):
+    """Pad every instance to the longest one in the batch.
+
+    Args:
+        insts: list of id lists (each instance is concatenated with a list,
+            so it must itself be a list).
+        pad_idx: value appended to short instances (also used to pad the
+            position data).
+        return_pos: also return per-token positions ``0 .. len - 1``.
+        return_input_mask: also return a float32 mask, 1 for real tokens
+            and 0 for padding.
+        return_max_len: also return the padded length.
+        return_num_token: also return the total number of unpadded tokens.
+        return_seq_lens: also return the unpadded lengths as an int64 array
+            of shape ``[-1, 1]``.
+
+    Returns:
+        The padded int64 id array of shape ``[-1, max_len, 1]`` when nothing
+        else is requested, otherwise a list starting with that array
+        followed by the requested extras in the order listed above.
+    """
+    max_len = max(len(inst) for inst in insts)
+
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    padded_rows = []
+    for inst in insts:
+        padded_rows.append(inst + list([pad_idx] * (max_len - len(inst))))
+    outputs = [np.array(padded_rows).astype("int64").reshape([-1, max_len, 1])]
+
+    if return_pos:
+        position_rows = []
+        for inst in insts:
+            positions = list(range(0, len(inst)))
+            position_rows.append(positions + [pad_idx] * (max_len - len(inst)))
+        outputs.append(
+            np.array(position_rows).astype("int64").reshape([-1, max_len, 1]))
+
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        mask_rows = []
+        for inst in insts:
+            mask_rows.append([1] * len(inst) + [0] * (max_len - len(inst)))
+        mask_array = np.expand_dims(np.array(mask_rows), axis=-1)
+        outputs.append(mask_array.astype("float32"))
+
+    if return_max_len:
+        outputs.append(max_len)
+
+    if return_num_token:
+        outputs.append(sum(len(inst) for inst in insts))
+
+    if return_seq_lens:
+        lengths = np.array([len(inst) for inst in insts])
+        outputs.append(lengths.astype("int64").reshape([-1, 1]))
+
+    if len(outputs) > 1:
+        return outputs
+    return outputs[0]
+
+
+# Running this module as a script does nothing; it only provides helpers.
+if __name__ == "__main__":
+
+    pass
--- a/reader/utils/mlm_batching.py
+++ b/reader/utils/mlm_batching.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+
+
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """Randomly mask a batch of token-id sequences for masked-LM training.
+
+    Every token draws a uniform random number ``p``. Tokens other than
+    ``CLS`` / ``SEP`` are then treated as follows:
+
+    * ``0.03 < p <= 0.15``: replaced by ``MASK``;
+    * ``0.015 < p <= 0.03``: replaced by a random id in ``[1, vocab_size)``;
+    * ``p <= 0.015``: kept unchanged but still recorded as a target;
+    * ``p > 0.15``: ignored.
+
+    If nothing in a sentence was masked or replaced, one random interior
+    token (index 1 .. len - 2) that is neither ``CLS`` nor ``SEP`` is
+    force-masked. Sentences without such a token are left untouched.
+
+    Args:
+        batch_tokens: list of mutable token-id sequences; modified in place.
+        total_token_num: number of random draws to make; must be at least
+            the total number of tokens in ``batch_tokens``.
+        vocab_size: exclusive upper bound for random replacement ids.
+        CLS, SEP, MASK: ids of the special tokens.
+
+    Returns:
+        ``(batch_tokens, mask_label, mask_pos)``. ``mask_label`` holds the
+        original ids of the recorded tokens and ``mask_pos`` holds
+        ``sent_index * max_len + token_index`` for each of them, where
+        ``max_len`` is the longest sentence in this batch. Both are int64
+        arrays reshaped to ``[-1, 1]``.
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        # Offset of this sentence's first token inside the flat random arrays.
+        prob_index += pre_sent_len
+        for token_index, token in enumerate(sent):
+            prob = prob_mask[prob_index + token_index]
+            if prob > 0.15:
+                continue
+            elif 0.03 < prob <= 0.15:
+                # mask
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = MASK
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            elif 0.015 < prob <= 0.03:
+                # random replace
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = replace_ids[prob_index + token_index]
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            else:
+                # keep the original token
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    mask_pos.append(sent_index * max_len + token_index)
+        pre_sent_len = len(sent)
+        # Ensure at least one word is masked in every sentence. The forced
+        # mask samples indices 1 .. len(sent) - 2; when that range holds no
+        # maskable token the rejection loop would either raise (empty range)
+        # or spin forever, so it is skipped in that case.
+        has_candidate = any(tok != SEP and tok != CLS for tok in sent[1:-1])
+        while has_candidate and not mask_flag:
+            # Index the size-1 result explicitly: int() on an array is
+            # deprecated in recent NumPy releases.
+            token_index = int(
+                np.random.randint(1, high=len(sent) - 1, size=1)[0])
+            if sent[token_index] != SEP and sent[token_index] != CLS:
+                mask_label.append(sent[token_index])
+                sent[token_index] = MASK
+                mask_flag = True
+                mask_pos.append(sent_index * max_len + token_index)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+                       total_token_num,
+                       max_len=None,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       task_id=0,
+                       return_input_mask=True,
+                       return_max_len=True,
+                       return_num_token=False):
+    """Mask and pad a list of instances into masked-LM batch arrays.
+
+    Args:
+        insts: list of instances; items 0, 1 and 2 of each instance are its
+            token ids, sentence-type ids and position ids.
+        total_token_num, voc_size, cls_id, sep_id, mask_id: forwarded to
+            ``mask``.
+        max_len: padding length; ``None`` pads to the longest instance.
+        pad_id: id used to pad token, position and sentence-type ids.
+        task_id: value that fills the returned ``task_ids`` array.
+        return_input_mask, return_max_len, return_num_token: accepted for
+            interface compatibility; not consulted by this function.
+
+    Returns:
+        ``[src_id, pos_id, sent_id, input_mask, task_ids, mask_label,
+        mask_pos]``.
+    """
+    token_seqs = [inst[0] for inst in insts]
+    sent_type_seqs = [inst[1] for inst in insts]
+    position_seqs = [inst[2] for inst in insts]
+
+    def pad_ids_only(seqs):
+        # Pad without requesting positions or an input mask.
+        return pad_batch_data(
+            seqs,
+            max_len=max_len,
+            pad_idx=pad_id,
+            return_pos=False,
+            return_input_mask=False)
+
+    # Open question from the original author: should masking and padding be
+    # done in the opposite order? mask_pos is computed from the longest
+    # unpadded sentence, so it may not line up with the padded word
+    # embeddings once they are flattened in the task layer.
+    # First step: do mask without padding
+    masked_seqs, mask_label, mask_pos = mask(
+        token_seqs,
+        total_token_num,
+        vocab_size=voc_size,
+        CLS=cls_id,
+        SEP=sep_id,
+        MASK=mask_id)
+
+    # Second step: padding
+    src_id, self_input_mask = pad_batch_data(
+        masked_seqs,
+        max_len=max_len,
+        pad_idx=pad_id, return_input_mask=True)
+    pos_id = pad_ids_only(position_seqs)
+    sent_id = pad_ids_only(sent_type_seqs)
+
+    # Same shape as the padded token ids, filled with task_id.
+    task_ids = np.ones_like(src_id, dtype="int64") * task_id
+
+    return [
+        src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label,
+        mask_pos
+    ]
+
+
+def pad_batch_data(insts,
+                   max_len=None,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """Pad every instance to a common length and build optional extras.
+
+    Args:
+        insts: sequences of ids.
+        max_len: target length; ``None`` uses the longest instance.
+        pad_idx: value appended to short instances (also used to pad the
+            position data).
+        return_pos: also return per-token positions ``0 .. len - 1``.
+        return_input_mask: also return a float32 mask, 1 for real tokens
+            and 0 for padding.
+        return_max_len: also return ``max_len``.
+        return_num_token: also return the total number of unpadded tokens.
+
+    Returns:
+        The padded int64 id array of shape ``[-1, max_len, 1]`` when nothing
+        else is requested, otherwise a list starting with that array
+        followed by the requested extras in the order listed above.
+    """
+    if max_len is None:
+        max_len = max(len(inst) for inst in insts)
+
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    padded_rows = []
+    for inst in insts:
+        filler = [pad_idx] * (max_len - len(inst))
+        padded_rows.append(list(inst) + list(filler))
+    outputs = [np.array(padded_rows).astype("int64").reshape([-1, max_len, 1])]
+
+    if return_pos:
+        position_rows = []
+        for inst in insts:
+            positions = list(range(0, len(inst)))
+            position_rows.append(positions + [pad_idx] * (max_len - len(inst)))
+        outputs.append(
+            np.array(position_rows).astype("int64").reshape([-1, max_len, 1]))
+
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        mask_rows = []
+        for inst in insts:
+            mask_rows.append([1] * len(inst) + [0] * (max_len - len(inst)))
+        mask_array = np.expand_dims(np.array(mask_rows), axis=-1)
+        outputs.append(mask_array.astype("float32"))
+
+    if return_max_len:
+        outputs.append(max_len)
+
+    if return_num_token:
+        outputs.append(sum(len(inst) for inst in insts))
+
+    if len(outputs) > 1:
+        return outputs
+    return outputs[0]
+
+
+# Running this module as a script does nothing; it only provides helpers.
+if __name__ == "__main__":
+    pass
+
+
--- a/reader/utils/mrqa_helper.py
+++ b/reader/utils/mrqa_helper.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+class MRQAExample(object):
+    """A single machine-reading-comprehension example.
+
+    Holds one question together with its tokenized document and, when
+    available, the answer text and answer span.
+
+    For examples without an answer, the start and end position are -1.
+    """
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=False):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.orig_answer_text = orig_answer_text
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+    @staticmethod
+    def _printable(text):
+        """Return *text* in a form that is safe to interpolate with ``%s``."""
+        # This module imports no tokenization helper, so byte strings are
+        # decoded here instead of calling an external printable_text().
+        if isinstance(text, bytes) and not isinstance(text, str):
+            return text.decode("utf-8", "ignore")
+        return text
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        """Return a one-line, human-readable summary of the example."""
+        parts = [
+            "qas_id: %s" % (self._printable(self.qas_id), ),
+            "question_text: %s" % (self._printable(self.question_text), ),
+            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
+        ]
+        # Each optional field is guarded by its own value; `is not None`
+        # keeps a legitimate position of 0 visible.
+        if self.start_position is not None:
+            parts.append("start_position: %d" % (self.start_position))
+        if self.end_position is not None:
+            parts.append("end_position: %d" % (self.end_position))
+        if self.is_impossible is not None:
+            parts.append("is_impossible: %r" % (self.is_impossible))
+        return ", ".join(parts)
+
+
+class MRQAFeature(object):
+    """Plain container for one feature set derived from an example.
+
+    The constructor stores every argument unchanged on the instance under
+    the same name; no validation or conversion is performed.
+    """
+
+    def __init__(self,
+                 unique_id,
+                 example_index,
+                 doc_span_index,
+                 tokens,
+                 token_to_orig_map,
+                 token_is_max_context,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        # Identification of this feature and the example/span it belongs to.
+        self.unique_id = unique_id
+        self.example_index = example_index
+        self.doc_span_index = doc_span_index
+
+        # Token-level bookkeeping.
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+        self.token_is_max_context = token_is_max_context
+
+        # Model inputs.
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+
+        # Optional answer labels.
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
--- a/reader/utils/reader4ernie.py
+++ b/reader/utils/reader4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+
+import sys
+import os
+import json
+import random
+import logging
+import numpy as np
+import six
+from io import open
+from collections import namedtuple
+
+import paddlepalm.tokenizer.ernie_tokenizer as tokenization
+from paddlepalm.reader.utils.batching4ernie import pad_batch_data
+from paddlepalm.reader.utils.mlm_batching import prepare_batch_data
+
+
+# Module-level logger.
+log = logging.getLogger(__name__)
+
+# NOTE: import-time side effect. Under Python 3 the process-wide sys.stdout
+# and sys.stderr are rebound to new UTF-8 text wrappers around their
+# underlying binary buffers.
+if six.PY3:
+    import io
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+
+
+def csv_reader(fd, delimiter='\t'):
+    """Lazily split every line of *fd* into a list of fields.
+
+    Args:
+        fd: iterable of text lines (e.g. an open file).
+        delimiter: separator passed to ``str.split``.
+
+    Returns:
+        A generator yielding one list of fields per line, with trailing
+        newline characters stripped before splitting.
+    """
+    def _rows():
+        for raw_line in fd:
+            fields = raw_line.rstrip('\n').split(delimiter)
+            yield fields
+
+    return _rows()
+
+
+class BaseReader(object):
+    def __init__(self,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 in_tokens=False,
+                 is_inference=False,
+                 random_seed=None,
+                 tokenizer="FullTokenizer",
+                 is_classify=True,
+                 is_regression=False,
+                 for_cn=True,
+                 task_id=0):
+        """Set up the tokenizer, special-token ids and reader state.
+
+        Args:
+            vocab_path: vocabulary file handed to the tokenizer.
+            label_map_config: optional path of a JSON file loaded into
+                ``self.label_map``; a falsy value leaves it as ``None``.
+            max_seq_len: stored as ``self.max_seq_len``.
+            do_lower_case: forwarded to the tokenizer.
+            in_tokens, is_inference, is_classify, is_regression, for_cn,
+                task_id: stored on the instance unchanged.
+            random_seed: passed to ``np.random.seed`` (seeds NumPy's global
+                generator).
+            tokenizer: not consulted here; a ``FullTokenizer`` is always
+                constructed.
+        """
+        self.max_seq_len = max_seq_len
+
+        # Tokenizer and the ids of the special tokens it defines.
+        self.tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        vocab = self.tokenizer.vocab
+        self.vocab = vocab
+        self.pad_id = vocab["[PAD]"]
+        self.cls_id = vocab["[CLS]"]
+        self.sep_id = vocab["[SEP]"]
+        self.mask_id = vocab["[MASK]"]
+
+        # Plain configuration flags.
+        self.in_tokens = in_tokens
+        self.is_inference = is_inference
+        self.for_cn = for_cn
+        self.task_id = task_id
+
+        np.random.seed(random_seed)
+
+        self.is_classify = is_classify
+        self.is_regression = is_regression
+
+        # Progress counters and per-phase example cache.
+        self.current_example = 0
+        self.current_epoch = 0
+        self.num_examples = 0
+        self.examples = {}
+
+        self.label_map = None
+        if label_map_config:
+            with open(label_map_config, encoding='utf8') as label_file:
+                self.label_map = json.load(label_file)
+
+    def get_train_progress(self):
+        """Return ``(current_example, current_epoch)`` for the training phase."""
+        progress = (self.current_example, self.current_epoch)
+        return progress
+
+    def _read_tsv(self, input_file, quotechar=None):
+        """Read a tab-separated file into a list of ``Example`` namedtuples.
+
+        The first line supplies the field names of the namedtuple; every
+        following line becomes one ``Example``. ``quotechar`` is accepted
+        but not used.
+        """
+        with open(input_file, 'r', encoding='utf8') as tsv_file:
+            rows = csv_reader(tsv_file)
+            # The header row defines the namedtuple fields.
+            Example = namedtuple('Example', next(rows))
+            return [Example(*row) for row in rows]
+
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Trim two token lists in place until their total fits max_length.
+
+        One token at a time is popped from the end of the longer list
+        (from ``tokens_b`` on a tie). Trimming the longer sequence first
+        makes more sense than cutting an equal share from each: tokens of
+        a very short sequence likely carry more information.
+        """
+        while len(tokens_a) + len(tokens_b) > max_length:
+            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+            longer.pop()
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Converts a single `Example` into a single `Record`."""
+
+        text_a = tokenization.convert_to_unicode(example.text_a)
+        tokens_a = tokenizer.tokenize(text_a)
+        tokens_b = None
+
+        has_text_b = False
+        if isinstance(example, dict):
+            has_text_b = "text_b" in example.keys()
+        else:
+            has_text_b = "text_b" in example._fields
+
+        if has_text_b:
+            text_b = tokenization.convert_to_unicode(example.text_b)
+            tokens_b = tokenizer.tokenize(text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+        # The convention in BERT/ERNIE is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        text_type_ids = []
+        tokens.append("[CLS]")
+        text_type_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            text_type_ids.append(0)
+        tokens.append("[SEP]")
+        text_type_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                text_type_ids.append(1)
+            tokens.append("[SEP]")
+            text_type_ids.append(1)
+
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+
+        if self.is_inference:
+            Record = namedtuple('Record',
+                                ['token_ids', 'text_type_ids', 'position_ids'])
+            record = Record(
+                token_ids=token_ids,
+                text_type_ids=text_type_ids,
+                position_ids=position_ids)
+        else:
+            if self.label_map:
+                label_id = self.label_map[example.label]
+            else:
+                label_id = example.label
+
+            Record = namedtuple('Record', [
+                'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
+            ])
+
+            qid = None
+            if "qid" in example._fields:
+                qid = example.qid
+
+            record = Record(
+                token_ids=token_ids,
+                text_type_ids=text_type_ids,
+                position_ids=position_ids,
+                label_id=label_id,
+                qid=qid)
+        return record
+
+    def _prepare_batch_data(self, examples, batch_size, phase=None):
+        """generate batch records"""
+        batch_records, max_len = [], 0
+        if len(examples) < batch_size:
+            raise Exception('CLS dataset contains too few samples. Expect more than '+str(batch_size))
+        for index, example in enumerate(examples):
+            if phase == "train":
+                self.current_example = index
+            record = self._convert_example_to_record(example, self.max_seq_len,
+                                                     self.tokenizer)
+            max_len = max(max_len, len(record.token_ids))
+            if self.in_tokens:
+                to_append = (len(batch_records) + 1) * max_len <= batch_size
+            else:
+                to_append = len(batch_records) < batch_size
+            if to_append:
+                batch_records.append(record)
+            else:
+                yield self._pad_batch_records(batch_records)
+                batch_records, max_len = [record], len(record.token_ids)
+
+        if phase == 'pred' and batch_records:
+            yield self._pad_batch_records(batch_records)
+
+    def get_num_examples(self, input_file=None, phase=None):
+        if self.examples is not None:
+            if phase is None:
+                phase = 'all'
+            return len(self.examples[phase])
+        else:
+            assert input_file is not None, "Argument input_file should be given or the data_generator should be created when this func is called."
+            examples = self._read_tsv(input_file)
+            return len(examples)
+
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        examples = self._read_tsv(input_file)
+        if phase is None:
+            phase = 'all'
+        self.examples[phase] = examples
+
+        def wrapper():
+            all_dev_batches = []
+            if epoch is None:
+                num_epochs = 99999999
+            else:
+                num_epochs = epoch
+            for epoch_index in range(num_epochs):
+                if phase == "train":
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if shuffle:
+                    np.random.shuffle(examples)
+
+                for batch_data in self._prepare_batch_data(
+                        examples, batch_size, phase=phase):
+                    if len(all_dev_batches) < dev_count:
+                        all_dev_batches.append(batch_data)
+                    if len(all_dev_batches) == dev_count:
+                        for batch in all_dev_batches:
+                            yield batch
+                        all_dev_batches = []
+        def f():
+            for i in wrapper():
+                yield i
+
+        # def f():
+        #     try:
+        #         for i in wrapper():
+        #             yield i
+        #     except Exception as e:
+        #         import traceback
+        #         traceback.print_exc()
+
+        return f
+
+
+class MaskLMReader(BaseReader):
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Converts a single `Example` into a single `Record`."""
+
+        text_a = tokenization.convert_to_unicode(example.text_a)
+        tokens_a = tokenizer.tokenize(text_a)
+        tokens_b = None 
+
+        has_text_b = False
+        if isinstance(example, dict):
+            has_text_b = "text_b" in example.keys()
+        else:
+            has_text_b = "text_b" in example._fields
+
+        if has_text_b:
+            text_b = tokenization.convert_to_unicode(example.text_b)
+            tokens_b = tokenizer.tokenize(text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+        # The convention in BERT/ERNIE is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        text_type_ids = []
+        tokens.append("[CLS]")
+        text_type_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            text_type_ids.append(0)
+        tokens.append("[SEP]")
+        text_type_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                text_type_ids.append(1)
+            tokens.append("[SEP]")
+            text_type_ids.append(1)
+
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+
+        # Record = namedtuple('Record',
+        #                     ['token_ids', 'text_type_ids', 'position_ids'])
+        # record = Record(
+        #     token_ids=token_ids,
+        #     text_type_ids=text_type_ids,
+        #     position_ids=position_ids)
+
+        return [token_ids, text_type_ids, position_ids]
+
+    def batch_reader(self, examples, batch_size, in_tokens, phase):
+        batch = []
+        total_token_num = 0
+        if len(examples) < batch_size:
+            raise Exception('MaskLM dataset contains too few samples. Expect more than '+str(batch_size))
+        for e in examples:
+            parsed_line = self._convert_example_to_record(e, self.max_seq_len, self.tokenizer)
+            to_append = len(batch) < batch_size
+            if to_append:
+                batch.append(parsed_line)
+                total_token_num += len(parsed_line[0])
+            else:
+                yield batch, total_token_num
+                batch = [parsed_line]
+                total_token_num = len(parsed_line[0])
+
+        if len(batch) > 0 and phase == 'pred':
+            yield batch, total_token_num
+
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        examples = self._read_tsv(input_file)
+        if phase is None:
+            phase = 'all'
+        self.examples[phase] = examples
+
+        def wrapper():
+            all_dev_batches = []
+            if epoch is None:
+                num_epochs = 99999999
+            else:
+                num_epochs = epoch
+            for epoch_index in range(num_epochs):
+                if phase == "train":
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if shuffle:
+                    np.random.shuffle(examples)
+
+                all_dev_batches = []
+                for batch_data, num_tokens in self.batch_reader(examples, 
+                                                    batch_size, self.in_tokens, phase=phase):
+                    batch_data = prepare_batch_data(
+                        batch_data,
+                        num_tokens,
+                        voc_size=len(self.vocab),
+                        pad_id=self.pad_id,
+                        cls_id=self.cls_id,
+                        sep_id=self.sep_id,
+                        mask_id=self.mask_id,
+                        # max_len=self.max_seq_len, # 注意，如果padding到最大长度，会导致mask_pos与实际位置不对应。因为mask pos是基于batch内最大长度来计算的。
+                        return_input_mask=True,
+                        return_max_len=False,
+                        return_num_token=False)
+
+                    if len(all_dev_batches) < dev_count:
+                        all_dev_batches.append(batch_data)
+                    if len(all_dev_batches) == dev_count:
+                        for batch in all_dev_batches:
+                            yield batch
+                        all_dev_batches = []
+
+        return wrapper
+
+
+class ClassifyReader(BaseReader):
+    def _read_tsv(self, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, 'r', encoding='utf8') as f:
+            reader = csv_reader(f)
+            headers = next(reader)
+            text_indices = [
+                index for index, h in enumerate(headers) if h != "label"
+            ]
+            Example = namedtuple('Example', headers)
+
+            examples = []
+            for line in reader:
+                for index, text in enumerate(line):
+                    if index in text_indices:
+                        if self.for_cn:
+                            line[index] = text.replace(' ', '')
+                        else:
+                            line[index] = text
+                example = Example(*line)
+                examples.append(example)
+            return examples
+
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+
+        if not self.is_inference:
+            batch_labels = [record.label_id for record in batch_records]
+            if self.is_classify:
+                batch_labels = np.array(batch_labels).astype("int64").reshape(
+                    [-1, 1])
+            elif self.is_regression:
+                batch_labels = np.array(batch_labels).astype("float32").reshape(
+                    [-1, 1])
+
+            if batch_records[0].qid:
+                batch_qids = [record.qid for record in batch_records]
+                batch_qids = np.array(batch_qids).astype("int64").reshape(
+                    [-1, 1])
+            else:
+                batch_qids = np.array([]).astype("int64").reshape([-1, 1])
+
+        # padding
+        padded_token_ids, input_mask = pad_batch_data(
+            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask
+        ]
+        if not self.is_inference:
+            return_list += [batch_labels, batch_qids]
+
+        return return_list
+
+
+class SequenceLabelReader(BaseReader):
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_label_ids = [record.label_ids for record in batch_records]
+
+        # padding
+        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_label_ids = pad_batch_data(
+            batch_label_ids, pad_idx=len(self.label_map) - 1)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens
+        ]
+        return return_list
+
+    def _reseg_token_label(self, tokens, labels, tokenizer):
+        assert len(tokens) == len(labels)
+        ret_tokens = []
+        ret_labels = []
+        for token, label in zip(tokens, labels):
+            sub_token = tokenizer.tokenize(token)
+            if len(sub_token) == 0:
+                continue
+            ret_tokens.extend(sub_token)
+            if len(sub_token) == 1:
+                ret_labels.append(label)
+                continue
+
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_laebl = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_laebl] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
+
+        assert len(ret_tokens) == len(ret_labels)
+        return ret_tokens, ret_labels
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
+        labels = tokenization.convert_to_unicode(example.label).split(u"")
+        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+        if len(tokens) > max_seq_length - 2:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + tokens + ["[SEP]"]
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        text_type_ids = [0] * len(token_ids)
+        no_entity_id = len(self.label_map) - 1
+        label_ids = [no_entity_id] + [
+            self.label_map[label] for label in labels
+        ] + [no_entity_id]
+
+        Record = namedtuple(
+            'Record',
+            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+        record = Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids,
+            label_ids=label_ids)
+        return record
+
+
+class ExtractEmbeddingReader(BaseReader):
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+
+        # padding
+        padded_token_ids, input_mask, seq_lens = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask, seq_lens
+        ]
+
+        return return_list
+
+
+class MRCReader(BaseReader):
+    def __init__(self,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 in_tokens=False,
+                 random_seed=None,
+                 tokenizer="FullTokenizer",
+                 is_classify=True,
+                 is_regression=False,
+                 for_cn=True,
+                 task_id=0,
+                 doc_stride=128,
+                 max_query_length=64,
+                 remove_noanswer=True):
+        self.max_seq_len = max_seq_len
+        self.tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self.vocab = self.tokenizer.vocab
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.in_tokens = in_tokens
+        self.for_cn = for_cn
+        self.task_id = task_id
+        self.doc_stride = doc_stride
+        self.max_query_length = max_query_length
+        self.examples = {}
+        self.features = {}
+        self.remove_noanswer = remove_noanswer
+
+        if random_seed is not None:
+            np.random.seed(random_seed)
+
+        self.current_example = 0
+        self.current_epoch = 0
+        self.num_examples = 0
+
+        self.Example = namedtuple('Example',
+                ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+                'start_position', 'end_position'])
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+                "tokens", "token_to_orig_map", "token_is_max_context",
+                "token_ids", "position_ids", "text_type_ids",
+                "start_position", "end_position"])
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
+
+    def _read_json(self, input_file, is_training):
+        examples = []
+        with open(input_file, "r", encoding='utf8') as f:
+            input_data = json.load(f)["data"]
+            for entry in input_data:
+                for paragraph in entry["paragraphs"]:
+                    paragraph_text = paragraph["context"]
+                    for qa in paragraph["qas"]:
+                        qas_id = qa["id"]
+                        question_text = qa["question"]
+                        start_pos = None
+                        end_pos = None
+                        orig_answer_text = None
+
+                        if is_training:
+                            if len(qa["answers"]) != 1:
+                                raise ValueError(
+                                    "For training, each question should have exactly 1 answer."
+                                )
+
+                            answer = qa["answers"][0]
+                            orig_answer_text = answer["text"]
+                            answer_offset = answer["answer_start"]
+                            answer_length = len(orig_answer_text)
+                            doc_tokens = [
+                                paragraph_text[:answer_offset],
+                                paragraph_text[answer_offset:answer_offset +
+                                               answer_length],
+                                paragraph_text[answer_offset + answer_length:]
+                            ]
+
+                            start_pos = 1
+                            end_pos = 1
+
+                            actual_text = " ".join(doc_tokens[start_pos:(end_pos
+                                                                         + 1)])
+                            if actual_text.find(orig_answer_text) == -1:
+                                log.info("Could not find answer: '%s' vs. '%s'",
+                                      actual_text, orig_answer_text)
+                                continue
+                        else:
+                            doc_tokens = tokenization.tokenize_chinese_chars(
+                                paragraph_text)
+
+                        example = self.Example(
+                            qas_id=qas_id,
+                            question_text=question_text,
+                            doc_tokens=doc_tokens,
+                            orig_answer_text=orig_answer_text,
+                            start_position=start_pos,
+                            end_position=end_pos)
+                        examples.append(example)
+
+        return examples
+
+    def _improve_answer_span(self, doc_tokens, input_start, input_end,
+                             tokenizer, orig_answer_text):
+        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+        for new_start in range(input_start, input_end + 1):
+            for new_end in range(input_end, new_start - 1, -1):
+                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+                if text_span == tok_answer_text:
+                    return (new_start, new_end)
+
+        return (input_start, input_end)
+
+    def _check_is_max_context(self, doc_spans, cur_span_index, position):
+        best_score = None
+        best_span_index = None
+        for (span_index, doc_span) in enumerate(doc_spans):
+            end = doc_span.start + doc_span.length - 1
+            if position < doc_span.start:
+                continue
+            if position > end:
+                continue
+            num_left_context = position - doc_span.start
+            num_right_context = end - position
+            score = min(num_left_context,
+                        num_right_context) + 0.01 * doc_span.length
+            if best_score is None or score > best_score:
+                best_score = score
+                best_span_index = span_index
+
+        return cur_span_index == best_span_index
+
+    def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
+                                    is_training, remove_noanswer=True):
+        """Turn examples into one `Feature` per document window.
+
+        Args:
+            examples: sequence of `Example` tuples (`question_text`,
+                `doc_tokens`, and in training mode `start_position`,
+                `end_position`, `orig_answer_text`).
+            max_seq_length: upper bound on the length of one feature, special
+                tokens included.
+            tokenizer: object exposing `tokenize(text)` and
+                `convert_tokens_to_ids(tokens)`.
+            is_training: when true, answer positions are computed for each
+                window.
+            remove_noanswer: in training mode, drop windows that do not fully
+                contain the answer instead of emitting them with positions 0.
+
+        Returns:
+            A list of `self.Feature` tuples; `unique_id` starts at 1000000000
+            and grows by one per emitted feature.
+        """
+        features = []
+        unique_id = 1000000000
+
+        print('converting examples to features...')
+        for (example_index, example) in enumerate(examples):
+            if example_index % 1000 == 0:
+                print('processing {}th example...'.format(example_index))
+            # The question is cut to at most `self.max_query_length` tokens.
+            query_tokens = tokenizer.tokenize(example.question_text)
+            if len(query_tokens) > self.max_query_length:
+                query_tokens = query_tokens[0:self.max_query_length]
+            # tok_to_orig_index[j]: index of the doc token sub-token j came from.
+            # orig_to_tok_index[i]: index of the first sub-token of doc token i.
+            tok_to_orig_index = []
+            orig_to_tok_index = []
+            all_doc_tokens = []
+            for (i, token) in enumerate(example.doc_tokens):
+                orig_to_tok_index.append(len(all_doc_tokens))
+                sub_tokens = tokenizer.tokenize(token)
+                for sub_token in sub_tokens:
+                    tok_to_orig_index.append(i)
+                    all_doc_tokens.append(sub_token)
+
+            tok_start_position = None
+            tok_end_position = None
+            if is_training:
+                # Map the answer from doc-token indices to sub-token indices;
+                # the end is the last sub-token of the answer's last doc token.
+                tok_start_position = orig_to_tok_index[example.start_position]
+                if example.end_position < len(example.doc_tokens) - 1:
+                    tok_end_position = orig_to_tok_index[example.end_position +
+                                                         1] - 1
+                else:
+                    tok_end_position = len(all_doc_tokens) - 1
+                (tok_start_position,
+                 tok_end_position) = self._improve_answer_span(
+                     all_doc_tokens, tok_start_position, tok_end_position,
+                     tokenizer, example.orig_answer_text)
+
+            # "- 3" accounts for [CLS], [SEP] and [SEP].
+            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+            # Cut the document into windows of at most `max_tokens_for_doc`
+            # sub-tokens; each window starts `min(length, self.doc_stride)`
+            # after the previous one, and the last one ends with the document.
+            doc_spans = []
+            start_offset = 0
+            while start_offset < len(all_doc_tokens):
+                length = len(all_doc_tokens) - start_offset
+                if length > max_tokens_for_doc:
+                    length = max_tokens_for_doc
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
+                if start_offset + length == len(all_doc_tokens):
+                    break
+                start_offset += min(length, self.doc_stride)
+
+            for (doc_span_index, doc_span) in enumerate(doc_spans):
+                # Layout: [CLS] query [SEP] (type 0), window [SEP] (type 1).
+                tokens = []
+                token_to_orig_map = {}
+                token_is_max_context = {}
+                text_type_ids = []
+                tokens.append("[CLS]")
+                text_type_ids.append(0)
+                for token in query_tokens:
+                    tokens.append(token)
+                    text_type_ids.append(0)
+                tokens.append("[SEP]")
+                text_type_ids.append(0)
+
+                for i in range(doc_span.length):
+                    split_token_index = doc_span.start + i
+                    # Both maps are keyed by the position in `tokens`.
+                    token_to_orig_map[len(tokens)] = tok_to_orig_index[
+                        split_token_index]
+
+                    is_max_context = self._check_is_max_context(
+                        doc_spans, doc_span_index, split_token_index)
+                    token_is_max_context[len(tokens)] = is_max_context
+                    tokens.append(all_doc_tokens[split_token_index])
+                    text_type_ids.append(1)
+                tokens.append("[SEP]")
+                text_type_ids.append(1)
+
+                token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                position_ids = list(range(len(token_ids)))
+                start_position = None
+                end_position = None
+                if is_training:
+                    doc_start = doc_span.start
+                    doc_end = doc_span.start + doc_span.length - 1
+                    # The answer must lie completely inside this window.
+                    out_of_span = False
+                    if not (tok_start_position >= doc_start and
+                            tok_end_position <= doc_end):
+                        out_of_span = True
+                    if out_of_span:
+                        start_position = 0
+                        end_position = 0
+                        # Skipped windows produce no feature and do not consume
+                        # a `unique_id`.
+                        if remove_noanswer:
+                            continue
+                    else:
+                        # "+ 2" accounts for [CLS] and the [SEP] after the query.
+                        doc_offset = len(query_tokens) + 2
+                        start_position = tok_start_position - doc_start + doc_offset
+                        end_position = tok_end_position - doc_start + doc_offset
+
+                feature = self.Feature(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    token_ids=token_ids,
+                    position_ids=position_ids,
+                    text_type_ids=text_type_ids,
+                    start_position=start_position,
+                    end_position=end_position)
+                features.append(feature)
+
+                unique_id += 1
+
+        return features
+
+    def _prepare_batch_data(self, records, batch_size, phase=None):
+        """generate batch records"""
+        batch_records, max_len = [], 0
+
+        if len(records) < batch_size:
+            raise Exception('mrc dataset contains too few samples. Expect more than '+str(batch_size))
+
+        for index, record in enumerate(records):
+            if phase == "train":
+                self.current_example = index
+            max_len = max(max_len, len(record.token_ids))
+            if self.in_tokens:
+                to_append = (len(batch_records) + 1) * max_len <= batch_size
+            else:
+                to_append = len(batch_records) < batch_size
+            if to_append:
+                batch_records.append(record)
+            else:
+                yield self._pad_batch_records(batch_records, phase == "train")
+                batch_records, max_len = [record], len(record.token_ids)
+
+        if phase == 'pred' and batch_records:
+            yield self._pad_batch_records(batch_records, phase == "train")
+
+    def _pad_batch_records(self, batch_records, is_training):
+        """Turn a list of feature records into the padded arrays of one batch.
+
+        Returns a list holding, in order: padded token ids, padded text-type
+        ids, padded position ids, task ids, the input mask, start positions,
+        end positions and unique ids. When ``is_training`` is false the
+        start/end positions are all-zero placeholders.
+        """
+        n_records = len(batch_records)
+
+        if is_training:
+            # Gold answer spans, one column vector each.
+            batch_start_position = np.array(
+                [rec.start_position for rec in batch_records]
+            ).astype("int64").reshape([-1, 1])
+            batch_end_position = np.array(
+                [rec.end_position for rec in batch_records]
+            ).astype("int64").reshape([-1, 1])
+        else:
+            batch_start_position = np.zeros(shape=[n_records, 1], dtype="int64")
+            batch_end_position = np.zeros(shape=[n_records, 1], dtype="int64")
+
+        batch_unique_ids = np.array(
+            [rec.unique_id for rec in batch_records]
+        ).astype("int64").reshape([-1, 1])
+
+        # Pad every per-token sequence with self.pad_id; the token ids also
+        # produce the input mask.
+        padded_token_ids, input_mask = pad_batch_data(
+            [rec.token_ids for rec in batch_records],
+            pad_idx=self.pad_id,
+            return_input_mask=True)
+        padded_text_type_ids = pad_batch_data(
+            [rec.text_type_ids for rec in batch_records], pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            [rec.position_ids for rec in batch_records], pad_idx=self.pad_id)
+
+        # Same shape as the padded token ids, filled with this reader's task id.
+        padded_task_ids = self.task_id * np.ones_like(
+            padded_token_ids, dtype="int64")
+
+        return [
+            padded_token_ids,
+            padded_text_type_ids,
+            padded_position_ids,
+            padded_task_ids,
+            input_mask,
+            batch_start_position,
+            batch_end_position,
+            batch_unique_ids,
+        ]
+
+    def get_num_examples(self, phase):
+        """Return how many features are cached for ``phase``."""
+        phase_features = self.features[phase]
+        return len(phase_features)
+
+    def get_features(self, phase):
+        """Return the features cached for ``phase``."""
+        cached_features = self.features
+        return cached_features[phase]
+
+    def get_examples(self, phase):
+        """Return the examples cached for ``phase``."""
+        cached_examples = self.examples
+        return cached_examples[phase]
+
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        """Build a batch generator factory for ``phase``.
+
+        Examples and features are read from ``input_file`` and cached per
+        phase the first time they are needed. The returned callable creates
+        a generator that runs ``epoch`` passes over the features (99999999
+        passes when ``epoch`` is None) and releases batches in groups of
+        ``dev_count``. In the 'train' phase the features are shuffled in
+        place before each pass when ``shuffle`` is set, and
+        ``self.current_example`` / ``self.current_epoch`` are updated.
+        """
+        is_train = phase == "train"
+
+        examples = self.examples.get(phase)
+        features = self.features.get(phase)
+        if not examples:
+            examples = self._read_json(input_file, is_train)
+            features = self._convert_example_to_feature(
+                examples,
+                self.max_seq_len,
+                self.tokenizer,
+                is_train,
+                remove_noanswer=self.remove_noanswer)
+            self.examples[phase] = examples
+            self.features[phase] = features
+
+        total_epochs = 99999999 if epoch is None else epoch
+
+        def wrapper():
+            held = []
+            for epoch_no in range(total_epochs):
+                if is_train:
+                    self.current_example = 0
+                    self.current_epoch = epoch_no
+                    if shuffle:
+                        np.random.shuffle(features)
+
+                batches = self._prepare_batch_data(
+                    features, batch_size, phase=phase)
+                for one_batch in batches:
+                    # Hold batches back until there is one per device.
+                    if len(held) < dev_count:
+                        held.append(one_batch)
+                    if len(held) == dev_count:
+                        for ready in held:
+                            yield ready
+                        held = []
+
+        return wrapper
+
+
+# Running this module as a script intentionally does nothing.
+if __name__ == '__main__':
+    pass
--- a/tasktype/__init__.py
+++ b/tasktype/__init__.py
--- a/paddlepalm/task_paradigm/cls.py
+++ b/paddlepalm/task_paradigm/cls.py
--- a/tasktype/match.py
+++ b/tasktype/match.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid import layers
+from paddlepalm.interface import task_paradigm
+import numpy as np
+import os
+
+class TaskParadigm(task_paradigm):
+    '''
+    Sentence-pair matching head: a 2-way classifier on top of the backbone's
+    ``sentence_pair_embedding``.
+
+    In the 'train' phase it consumes ``label_ids`` and outputs the mean
+    cross-entropy ``loss``. In every other phase it outputs ``logits`` and
+    accumulates their argmax so the predictions can be written to disk by
+    ``epoch_postprocess``.
+    '''
+    def __init__(self, config, phase, backbone_config=None):
+        """Read the head's settings from ``config``, falling back to
+        ``backbone_config`` for the initializer scale and dropout rate."""
+        self._is_training = phase == 'train'
+        self._hidden_size = backbone_config['hidden_size']
+
+        if 'initializer_range' in config:
+            init_cfg = config['initializer_range']
+        else:
+            init_cfg = backbone_config.get('initializer_range', 0.02)
+        if isinstance(init_cfg, (int, float)):
+            # A numeric range is the scale of a truncated normal; it must be
+            # wrapped because build() hands this attribute to ParamAttr as
+            # the initializer.
+            self._param_initializer = fluid.initializer.TruncatedNormal(
+                scale=init_cfg)
+        else:
+            # Anything else is passed through unchanged (e.g. a ready-made
+            # initializer object).
+            self._param_initializer = init_cfg
+
+        if 'dropout_prob' in config:
+            self._dropout_prob = config['dropout_prob']
+        else:
+            self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0)
+
+        self._pred_output_path = config.get('pred_output_path', None)
+        self._preds = []
+
+    @property
+    def inputs_attrs(self):
+        """Declare the inputs needed from the reader (labels, training only)
+        and from the backbone (the sentence-pair embedding)."""
+        if self._is_training:
+            reader = {"label_ids": [[-1, 1], 'int64']}
+        else:
+            reader = {}
+        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+
+    @property
+    def outputs_attrs(self):
+        """Declare the outputs: ``loss`` when training, ``logits`` otherwise."""
+        if self._is_training:
+            return {"loss": [[1], 'float32']}
+        else:
+            return {"logits": [[-1, 2], 'float32']}
+
+    def build(self, inputs, scope_name=""):
+        """Build the classification layers.
+
+        ``scope_name`` is prepended to the parameter names. Returns
+        ``{'loss': ...}`` when training and ``{'logits': ...}`` otherwise.
+        """
+        if self._is_training:
+            labels = inputs["reader"]["label_ids"]
+        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
+
+        # Dropout is only applied while training.
+        if self._is_training:
+            cls_feats = fluid.layers.dropout(
+                x=cls_feats,
+                dropout_prob=self._dropout_prob,
+                dropout_implementation="upscale_in_train")
+
+        logits = fluid.layers.fc(
+            input=cls_feats,
+            size=2,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+"cls_out_w",
+                initializer=self._param_initializer),
+            bias_attr=fluid.ParamAttr(
+                name=scope_name+"cls_out_b",
+                initializer=fluid.initializer.Constant(0.)))
+
+        if self._is_training:
+            ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+                logits=logits, label=labels, return_softmax=True)
+            loss = fluid.layers.mean(x=ce_loss)
+            return {'loss': loss}
+        else:
+            return {'logits': logits}
+
+    def postprocess(self, rt_outputs):
+        """Outside training, store the argmax class of each row of logits."""
+        if not self._is_training:
+            logits = rt_outputs['logits']
+            preds = np.argmax(logits, -1)
+            self._preds.extend(preds.tolist())
+
+    def epoch_postprocess(self, post_inputs):
+        """Outside training, write the collected predictions, one per line,
+        to ``predictions.json`` under ``pred_output_path``.
+
+        Raises:
+            ValueError: if ``pred_output_path`` was not given in the config.
+        """
+        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
+        if not self._is_training:
+            if self._pred_output_path is None:
+                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
+            # Create the output directory on demand, as the mrc head does.
+            if not os.path.exists(self._pred_output_path):
+                os.makedirs(self._pred_output_path)
+            output_file = os.path.join(self._pred_output_path, 'predictions.json')
+            with open(output_file, 'w') as writer:
+                for p in self._preds:
+                    writer.write(str(p)+'\n')
+            print('Predictions saved at '+output_file)
+
+                
--- a/tasktype/mlm.py
+++ b/tasktype/mlm.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+from paddle.fluid import layers
+from paddlepalm.backbone.utils.transformer import pre_process_layer
+
+class TaskParadigm(task_paradigm):
+    '''
+    Masked language model head.
+
+    Gathers the encoder outputs at the masked positions, applies a
+    transform (fc + layer norm) and scores them against the backbone's
+    word embedding table. Outputs the mean cross-entropy ``loss`` in the
+    'train' phase and the raw ``logits`` otherwise.
+    '''
+    def __init__(self, config, phase, backbone_config=None):
+        """Read the sizes, activation and initializer range from
+        ``backbone_config``."""
+        self._is_training = phase == 'train'
+        self._emb_size = backbone_config['hidden_size']
+        self._hidden_size = backbone_config['hidden_size']
+        self._vocab_size = backbone_config['vocab_size']
+        self._hidden_act = backbone_config['hidden_act']
+        self._initializer_range = backbone_config['initializer_range']
+
+    @property
+    def inputs_attrs(self):
+        """Declare the inputs needed from the reader and the backbone.
+
+        ``mask_label`` and ``batchsize_x_seqlen`` are only required when
+        training; ``mask_pos`` is always required.
+        """
+        reader = {
+            "mask_label": [[-1, 1], 'int64'],
+            # build() reads this entry while training, so it has to be
+            # declared here; deleting it below used to raise KeyError
+            # because it was missing from this dict.
+            # NOTE(review): shape [1] is an assumption (a single value
+            # broadcast against mask_pos) -- confirm against the reader.
+            "batchsize_x_seqlen": [[1], 'int64'],
+            "mask_pos": [[-1, 1], 'int64']}
+        if not self._is_training:
+            del reader['mask_label']
+            del reader['batchsize_x_seqlen']
+        bb = {
+            "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
+            "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+
+    @property
+    def outputs_attrs(self):
+        """Declare the outputs: ``loss`` when training, ``logits`` otherwise."""
+        if self._is_training:
+            return {"loss": [[1], 'float32']}
+        else:
+            return {"logits": [[-1], 'float32']}
+
+    def build(self, inputs, scope_name=""):
+        """Build the masked-LM layers.
+
+        ``scope_name`` is prepended to the parameter names. Returns
+        ``{'loss': ...}`` when training and ``{'logits': ...}`` otherwise.
+        """
+        mask_pos = inputs["reader"]["mask_pos"]
+        if self._is_training:
+            mask_label = inputs["reader"]["mask_label"]
+            # Clamp the gather indices so they never exceed the last row of
+            # the flattened encoder output.
+            max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
+            mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
+            mask_pos.stop_gradient = True
+
+        word_emb = inputs["backbone"]["embedding_table"]
+        enc_out = inputs["backbone"]["encoder_outputs"]
+
+        emb_size = word_emb.shape[-1]
+
+        _param_initializer = fluid.initializer.TruncatedNormal(
+            scale=self._initializer_range)
+
+        # Flatten the encoder output to one row per token so that mask_pos
+        # can index it directly.
+        reshaped_emb_out = fluid.layers.reshape(
+            x=enc_out, shape=[-1, emb_size])
+
+        # extract masked tokens' feature
+        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
+
+        # transform: fc
+        mask_trans_feat = fluid.layers.fc(
+            input=mask_feat,
+            size=emb_size,
+            act=self._hidden_act,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+'mask_lm_trans_fc.w_0',
+                initializer=_param_initializer),
+            bias_attr=fluid.ParamAttr(name=scope_name+'mask_lm_trans_fc.b_0'))
+        # transform: layer norm
+        mask_trans_feat = pre_process_layer(
+            mask_trans_feat, 'n', name=scope_name+'mask_lm_trans')
+
+        mask_lm_out_bias_attr = fluid.ParamAttr(
+            name=scope_name+"mask_lm_out_fc.b_0",
+            initializer=fluid.initializer.Constant(value=0.0))
+
+        # Output projection reuses the word embedding table (transposed)
+        # plus a separate bias over the vocabulary.
+        fc_out = fluid.layers.matmul(
+            x=mask_trans_feat,
+            y=word_emb,
+            transpose_y=True)
+        fc_out += fluid.layers.create_parameter(
+            shape=[self._vocab_size],
+            dtype='float32',
+            attr=mask_lm_out_bias_attr,
+            is_bias=True)
+
+        if self._is_training:
+            mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+                logits=fc_out, label=mask_label)
+            loss = fluid.layers.mean(mask_lm_loss)
+            return {'loss': loss}
+        else:
+            return {'logits': fc_out}
+
+
--- a/tasktype/mrc.py
+++ b/tasktype/mrc.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+import collections
+import numpy as np
+import os
+import math
+import six
+import paddlepalm.tokenizer.ernie_tokenizer as tokenization
+import json
+
+# Model output kept for one feature at prediction time: the feature's unique
+# id together with its start and end logits.
+RawResult = collections.namedtuple("RawResult",
+                                   ["unique_id", "start_logits", "end_logits"])
+
+class TaskParadigm(task_paradigm):
+    """Machine reading comprehension (answer span extraction) head.
+
+    In the 'train' phase it outputs the average of the start and end
+    position cross-entropy losses. In every other phase it outputs the
+    start/end logits with the feature unique ids, collects them batch by
+    batch, and finally writes the decoded answers to ``pred_output_path``.
+    """
+
+    def __init__(self, config, phase, backbone_config=None):
+        """Read the head's settings from ``config`` and ``backbone_config``."""
+        self._is_training = phase == 'train'
+        self._max_sequence_length = config['max_seq_len']
+        self._hidden_size = backbone_config['hidden_size']
+        self._pred_results = []
+
+        # epoch_postprocess reads these whenever the head is not training,
+        # so they are set for every non-training phase, not only 'pred'.
+        if not self._is_training:
+            self._max_answer_length = config.get('max_answer_len', None)
+            self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0)
+            self._n_best_size = config.get('n_best_size', 20)
+            self._pred_output_path = config.get('pred_output_path', None)
+            self._verbose = config.get('verbose', False)
+            self._with_negative = config.get('with_negative', False)
+            self._do_lower_case = config.get('do_lower_case', False)
+
+    @property
+    def inputs_attrs(self):
+        """Declare the inputs needed from the reader (gold positions when
+        training, unique ids otherwise) and from the backbone."""
+        if self._is_training:
+            reader = {"start_positions": [[-1, 1], 'int64'],
+                      "end_positions": [[-1, 1], 'int64'],
+                      }
+        else:
+            reader = {'unique_ids': [[-1, 1], 'int64']}
+        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+
+    @property
+    def epoch_inputs_attrs(self):
+        """Outside training, request the reader's examples and features for
+        ``epoch_postprocess``; returns None while training."""
+        if not self._is_training:
+            from_reader = {'examples': None, 'features': None}
+            return {'reader': from_reader}
+
+    @property
+    def outputs_attrs(self):
+        """Declare the outputs: ``loss`` when training, otherwise the
+        start/end logits and the unique ids.
+
+        Named ``outputs_attrs`` like in the other task paradigms in this
+        package.
+        """
+        if self._is_training:
+            return {'loss': [[1], 'float32']}
+        else:
+            return {'start_logits': [[-1, -1, 1], 'float32'],
+                    'end_logits': [[-1, -1, 1], 'float32'],
+                    'unique_ids': [[-1, 1], 'int64']}
+
+    @property
+    def outputs_attr(self):
+        """Alias of ``outputs_attrs``, kept for callers that use the old
+        (misspelled) property name."""
+        return self.outputs_attrs
+
+    def build(self, inputs, scope_name=""):
+        """Build the span-prediction layers.
+
+        ``scope_name`` is prepended to the parameter names. Returns
+        ``{'loss': ...}`` when training, otherwise the start logits, end
+        logits and unique ids.
+        """
+        if self._is_training:
+            start_positions = inputs['reader']['start_positions']
+            end_positions = inputs['reader']['end_positions']
+            # NOTE(review): "seqlen" is read here but not declared in
+            # inputs_attrs -- confirm that the framework supplies it.
+            max_position = inputs["reader"]["seqlen"] - 1
+            # Clamp the gold positions to the last valid index.
+            start_positions = fluid.layers.elementwise_min(start_positions, max_position)
+            end_positions = fluid.layers.elementwise_min(end_positions, max_position)
+            start_positions.stop_gradient = True
+            end_positions.stop_gradient = True
+        else:
+            unique_id = inputs['reader']['unique_ids']
+
+        enc_out = inputs['backbone']['encoder_outputs']
+        # Two scores per token: one for "start" and one for "end".
+        logits = fluid.layers.fc(
+            input=enc_out,
+            size=2,
+            num_flatten_dims=2,
+            param_attr=fluid.ParamAttr(
+                name=scope_name+"cls_squad_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name=scope_name+"cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
+
+        # Move the start/end axis to the front and split it.
+        logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
+        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
+
+        def _compute_single_loss(logits, positions):
+            """Compute start/end loss for mrc model"""
+            loss = fluid.layers.softmax_with_cross_entropy(
+                logits=logits, label=positions)
+            loss = fluid.layers.mean(x=loss)
+            return loss
+
+        if self._is_training:
+            start_loss = _compute_single_loss(start_logits, start_positions)
+            end_loss = _compute_single_loss(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2.0
+            return {'loss': total_loss}
+        else:
+            return {'start_logits': start_logits,
+                    'end_logits': end_logits,
+                    'unique_ids': unique_id}
+
+    def postprocess(self, rt_outputs):
+        """this func will be called after each step(batch) of training/evaluating/predicting process."""
+        if not self._is_training:
+            unique_ids = np.squeeze(rt_outputs['unique_ids'], -1)
+            start_logits = rt_outputs['start_logits']
+            end_logits = rt_outputs['end_logits']
+            for idx, raw_uid in enumerate(unique_ids):
+                # Rows with a negative unique id are skipped.
+                if raw_uid < 0:
+                    continue
+                if len(self._pred_results) % 1000 == 0:
+                    print("Predicting example: {}".format(len(self._pred_results)))
+                uid = int(raw_uid)
+
+                s = [float(x) for x in start_logits[idx].flat]
+                e = [float(x) for x in end_logits[idx].flat]
+                self._pred_results.append(
+                    RawResult(
+                        unique_id=uid,
+                        start_logits=s,
+                        end_logits=e))
+
+    def epoch_postprocess(self, post_inputs):
+        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process.
+
+        Outside training, decodes the collected results and writes
+        ``predictions.json``, ``nbest_predictions.json`` and (when
+        ``with_negative`` is set) ``null_odds.json`` under
+        ``pred_output_path``, creating the directory if needed.
+
+        Raises:
+            ValueError: if ``pred_output_path`` or ``max_answer_len`` was
+                not given in the config.
+        """
+
+        if not self._is_training:
+            if self._pred_output_path is None:
+                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
+            # The answer-length filter compares against this value, so a
+            # missing setting is reported here instead of failing later.
+            if self._max_answer_length is None:
+                raise ValueError('argument max_answer_len not found in config. Please add it into config dict/file.')
+            examples = post_inputs['reader']['examples']
+            features = post_inputs['reader']['features']
+            if not os.path.exists(self._pred_output_path):
+                os.makedirs(self._pred_output_path)
+            output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
+            output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
+            output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
+            _write_predictions(examples, features, self._pred_results,
+                              self._n_best_size, self._max_answer_length,
+                              self._do_lower_case, output_prediction_file,
+                              output_nbest_file, output_null_log_odds_file,
+                              self._with_negative,
+                              self._null_score_diff_threshold, self._verbose)
+
+
+def _write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file,
+                      with_negative, null_score_diff_threshold,
+                      verbose):
+    """Write final predictions to the json file and log-odds of null if needed.
+
+    Args:
+        all_examples: examples providing ``qas_id`` and ``doc_tokens``; their
+            position in this sequence is matched against
+            ``feature.example_index``.
+        all_features: features providing ``example_index``, ``unique_id``,
+            ``tokens``, ``token_to_orig_map`` and ``token_is_max_context``.
+        all_results: results providing ``unique_id``, ``start_logits`` and
+            ``end_logits``; there must be one for every feature.
+        n_best_size: number of start/end candidates considered per feature
+            and maximum number of n-best entries kept per example.
+        max_answer_length: longest allowed answer span, in tokens.
+        do_lower_case: forwarded to ``_get_final_text``.
+        output_prediction_file: path of the best-answer json file.
+        output_nbest_file: path of the n-best json file.
+        output_null_log_odds_file: path of the null-odds json file (only
+            written when ``with_negative`` is set).
+        with_negative: whether an empty ("no answer") prediction is allowed.
+        null_score_diff_threshold: the empty answer is predicted when the
+            null score minus the best non-null score exceeds this value.
+        verbose: forwarded to ``_get_final_text``.
+    """
+    print("Writing predictions to: %s" % (output_prediction_file))
+    print("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", [
+            "feature_index", "start_index", "end_index", "start_logit",
+            "end_logit"
+        ])
+    # Defined once here instead of once per example.
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[
+                    0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+
+        if with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit))
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            if pred.start_index > 0:  # this is a non-null prediction
+                feature = features[pred.feature_index]
+                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
+                                                              )]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
+                                                                 1)]
+                tok_text = " ".join(tok_tokens)
+
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = _get_final_text(tok_text, orig_text, do_lower_case,
+                                            verbose)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+
+        # if we didn't include the empty option in the n-best, include it
+        if with_negative:
+            if "" not in seen_predictions:
+                nbest.append(
+                    _NbestPrediction(
+                        text="",
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(
+                    text="empty", start_logit=0.0, end_logit=0.0))
+
+        assert len(nbest) >= 1
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+        # debug
+        if best_non_null_entry is None:
+            print("Emmm..., sth wrong")
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+
+        if not with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        elif best_non_null_entry is None:
+            # Only the empty candidate survived, so there is no non-null
+            # score to compare with. Predict the empty answer instead of
+            # dereferencing None, and record the null score alone (i.e. the
+            # missing non-null score counts as 0).
+            scores_diff_json[example.qas_id] = score_null
+            all_predictions[example.qas_id] = ""
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (
+                best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def _get_final_text(pred_text, orig_text, do_lower_case, verbose):
+    """Project the tokenized prediction back to the original text.
+
+    Args:
+        pred_text: the predicted answer, built from tokenizer output.
+        orig_text: the span of the original document covering the answer.
+        do_lower_case: passed to the basic tokenizer used for alignment.
+        verbose: when true, print why the alignment failed.
+
+    Returns:
+        The part of ``orig_text`` that aligns with ``pred_text``, or
+        ``orig_text`` unchanged when the alignment fails.
+    """
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the MRQA eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        """Return ``text`` without spaces plus a map from each index in the
+        stripped text to its index in ``text``."""
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose:
+            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose:
+            # The arguments must be applied with %; passing them as extra
+            # print() arguments left the placeholders unformatted.
+            print("Length not equal after stripping spaces: '%s' vs '%s'" %
+                  (orig_ns_text, tok_ns_text))
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose:
+            print("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose:
+            print("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Return the positions of the ``n_best_size`` largest logits, best first."""
+    ranked = sorted(
+        enumerate(logits), key=lambda pair: pair[1], reverse=True)
+    return [
+        position for rank, (position, _) in enumerate(ranked)
+        if rank < n_best_size
+    ]
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    # Shift by the largest score before exponentiating.
+    peak = max(scores)
+    shifted_exps = [math.exp(value - peak) for value in scores]
+
+    # Accumulate left to right, one term at a time.
+    denominator = 0.0
+    for term in shifted_exps:
+        denominator += term
+
+    return [term / denominator for term in shifted_exps]
+
+