Commit c93faf44 authored by: S smallv0221

Merge branch 'develop' of https://github.com/PaddlePaddle/models into yxp1222

......@@ -15,7 +15,7 @@ import paddle
import paddle.fluid as fluid
import utils.utility as utility
AMP_MODEL_LIST = ["ResNet50", "SE_ResNet50_vd"]
AMP_MODEL_LIST = ["ResNet50", "SE_ResNet50_vd", "ResNet200_vd"]
def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
......
......@@ -23,7 +23,8 @@ import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd", "ResNet200_vd"
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
......@@ -32,7 +33,7 @@ class ResNet():
self.layers = layers
self.is_3x3 = is_3x3
def net(self, input, class_dim=1000):
def net(self, input, class_dim=1000, data_format="NCHW"):
is_3x3 = self.is_3x3
layers = self.layers
supported_layers = [18, 34, 50, 101, 152, 200]
......@@ -40,7 +41,7 @@ class ResNet():
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
......@@ -56,7 +57,8 @@ class ResNet():
num_filters=64,
filter_size=7,
stride=2,
act='relu')
act='relu',
data_format=data_format)
else:
conv = self.conv_bn_layer(
input=input,
......@@ -64,29 +66,33 @@ class ResNet():
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
name='conv1_1',
data_format=data_format)
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
name='conv1_2',
data_format=data_format)
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
name='conv1_3',
data_format=data_format)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_type='max',
data_format=data_format)
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
......@@ -101,22 +107,29 @@ class ResNet():
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block==i==0,
name=conv_name)
if_first=block == i == 0,
name=conv_name,
data_format=data_format)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name="res"+str(block+2)+chr(97+i)
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block==i==0,
name=conv_name)
if_first=block == i == 0,
name=conv_name,
data_format=data_format)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
input=conv,
pool_type='avg',
global_pooling=True,
data_format=data_format)
pool_channel = pool.shape[1] if data_format == "NCHW" else pool.shape[
-1]
stdv = 1.0 / math.sqrt(pool_channel * 1.0)
out = fluid.layers.fc(
input=pool,
......@@ -133,7 +146,8 @@ class ResNet():
stride=1,
groups=1,
act=None,
name=None):
name=None,
data_format="NCHW"):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
......@@ -143,7 +157,8 @@ class ResNet():
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bias_attr=False,
data_format=data_format)
if name == "conv1":
bn_name = "bn_" + name
else:
......@@ -154,7 +169,8 @@ class ResNet():
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
moving_variance_name=bn_name + '_variance',
data_layout=data_format)
def conv_bn_layer_new(self,
input,
......@@ -163,14 +179,16 @@ class ResNet():
stride=1,
groups=1,
act=None,
name=None):
name=None,
data_format="NCHW"):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
ceil_mode=True,
data_format=data_format)
conv = fluid.layers.conv2d(
input=pool,
......@@ -181,7 +199,8 @@ class ResNet():
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bias_attr=False,
data_format=data_format)
if name == "conv1":
bn_name = "bn_" + name
else:
......@@ -192,81 +211,114 @@ class ResNet():
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
moving_variance_name=bn_name + '_variance',
data_layout=data_format)
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
def shortcut(self,
input,
ch_out,
stride,
name,
if_first=False,
data_format="NCHW"):
ch_in = input.shape[1] if data_format == "NCHW" else input.shape[-1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer(
input,
ch_out,
1,
stride,
name=name,
data_format=data_format)
else:
return self.conv_bn_layer_new(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer_new(
input,
ch_out,
1,
stride,
name=name,
data_format=data_format)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
return self.conv_bn_layer(
input, ch_out, 1, stride, name=name, data_format=data_format)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
def bottleneck_block(self,
input,
num_filters,
stride,
name,
if_first,
data_format="NCHW"):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
name=name + "_branch2b",
data_format=data_format)
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
name=name + "_branch2c",
data_format=data_format)
short = self.shortcut(
input,
num_filters * 4,
stride,
if_first=if_first,
name=name + "_branch1")
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first):
def basic_block(self, input, num_filters, stride, name, if_first,
data_format):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name+"_branch2a")
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name+"_branch2b")
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b",
data_format=data_format)
short = self.shortcut(
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1")
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18_vd():
model=ResNet(layers=18, is_3x3=True)
model = ResNet(layers=18, is_3x3=True)
return model
def ResNet34_vd():
model=ResNet(layers=34, is_3x3=True)
model = ResNet(layers=34, is_3x3=True)
return model
......
#!/bin/bash -ex
#Training details
export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_cudnn_batchnorm_spatial_persistent=1
DATA_DIR="Your image dataset path, e.g. ./data/ILSVRC2012/"
DATA_FORMAT="NHWC"
USE_AMP=true #whether to use amp
USE_DALI=true
USE_ADDTO=true
if ${USE_ADDTO} ;then
export FLAGS_max_inplace_grad_add=8
fi
if ${USE_DALI}; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi
python train.py \
--model=ResNet200_vd \
--data_dir=${DATA_DIR} \
--batch_size=64 \
--num_epochs=200 \
--total_images=1281167 \
--image_shape 4 224 224 \
--class_dim=1000 \
--print_step=10 \
--model_save_dir=output/ \
--lr_strategy=cosine_decay \
--use_amp=${USE_AMP} \
--scale_loss=128.0 \
--use_dynamic_loss_scaling=true \
--data_format=${DATA_FORMAT} \
--fuse_elewise_add_act_ops=true \
--fuse_bn_act_ops=true \
--fuse_bn_add_act_ops=true \
--enable_addto=${USE_ADDTO} \
--validate=true \
--is_profiler=false \
--profiler_path=profile/ \
--reader_thread=10 \
--reader_buf_size=4000 \
--use_dali=${USE_DALI} \
--lr=0.1 \
--l2_decay=1e-4 \
--use_label_smoothing=True \
--label_smoothing_epsilon=0.1
......@@ -95,13 +95,16 @@ For more pretrained model selection, please refer to [PretrainedModels](./paddle
- [Models API](./docs/models.md)
## Tutorials
Please refer to our official AI Studio account for more interactive tutorials: [PaddleNLP on AI Studio](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/574995)
* [What's Seq2Vec?](https://aistudio.baidu.com/aistudio/projectdetail/1294333) shows how to use LSTM to do sentiment analysis.
* [What's Seq2Vec?](https://aistudio.baidu.com/aistudio/projectdetail/1283423) shows how to use LSTM to do sentiment analysis.
* [Sentiment Analysis with ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1283423) shows how to exploit the pretrained ERNIE to make sentiment analysis better.
* [Sentiment Analysis with ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1294333) shows how to exploit the pretrained ERNIE to make sentiment analysis better.
* [Waybill Information Extraction with BiGRU-CRF Model](https://aistudio.baidu.com/aistudio/projectdetail/1317771) shows how to make use of bigru and crf to do information extraction.
......
# BERT Benchmark with Fleet API
BERT - Bidirectional Encoder Representations from Transformers [paper link](https://arxiv.org/abs/1810.04805)
PaddlePaddle implements BERT pre-training and downstream fine-tuning tasks. For pre-training, both single-machine and multi-machine versions are provided, together with a mixed-precision interface for acceleration that can be enabled as needed.
PaddlePaddle implements BERT pre-training and downstream fine-tuning tasks.
## Datasets
### Pre-training dataset
......@@ -10,7 +10,8 @@ PaddlePaddle实现了BERT的预训练模型(Pre-training)和下游任务(Fin
## Pre-training
### Environment variable setup
1. Install paddlenlp
pip install paddlenlp==2.0.0a2 -i https://pypi.org/simple
pip install paddlenlp==2.0.0b0 -i https://pypi.org/simple
2. Set the environment variable pointing to the pre-training data
```shell
export DATA_DIR=${HOME}/bert_data/wikicorpus_en
......@@ -54,26 +55,6 @@ python ./run_pretrain_single.py \
--max_steps 1000000
```
### Training speed comparison
The speed comparison is done with the bert-base model, comparing single-machine and multi-machine multi-card (4 machines, 32 GPUs) training. All GPU tests are run on Tesla V100-SXM2-16GB with the following configuration:
- InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family
- 48 CPU(s), Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
- Memory 500G
- Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64)
- CUDA Version: 10.2, Driver API Version: 10.2, Driver Version: 440.33.01
- cuDNN Version: 7.6
- PaddlePaddle version: paddlepaddle-gpu >= 2.0.0rc1
- PaddleNLP version: paddlenlp >= 2.0.0a2
Speed is measured as the number of samples the pre-training model processes per second, with
- batch_size=64
- max_seq_length=128
The detailed comparison is shown below:
| node num | gpu num/node | gpu num | batch_size/gpu | Throughput | Speedup |
|----------|--------------|---------|----------------|------------|---------|
## Fine-tuning
After BERT pre-training is finished, the pre-trained parameters can be used to fine-tune on specific NLP tasks. The following shows how to fine-tune a classification task using the open-source pre-trained model.
......
......@@ -172,13 +172,27 @@ def reset_program_state_dict(model, state_dict):
loc=0.0, scale=scale, size=p.shape).astype(dtype_str)
return new_state_dict
def build_compiled_program(main_program, loss):
def create_strategy():
"""
Create build strategy and exec strategy.
Args:
Returns:
build_strategy: build strategy
exec_strategy: exec strategy
"""
build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
build_strategy.enable_addto = args.enable_addto
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10000
build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_addto = args.enable_addto
return build_strategy, exec_strategy
def build_compiled_program(main_program, loss):
build_strategy, exec_strategy = create_strategy()
main_program = paddle.static.CompiledProgram(
main_program).with_data_parallel(
loss_name=loss.name,
......@@ -187,6 +201,33 @@ def build_compiled_program(main_program, loss):
return main_program
def dist_optimizer(args, optimizer):
"""
Create a distributed optimizer based on a normal optimizer
Args:
args:
optimizer: a normal optimizer
Returns:
optimizer: a distributed optimizer
"""
build_strategy, exec_strategy = create_strategy()
dist_strategy = fleet.DistributedStrategy()
dist_strategy.execution_strategy = exec_strategy
dist_strategy.build_strategy = build_strategy
dist_strategy.fuse_grad_size_in_MB = 16
if args.use_amp:
dist_strategy.amp = True
dist_strategy.amp_configs = {
'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
'init_loss_scaling': args.scale_loss,
}
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
return optimizer
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
......@@ -208,9 +249,12 @@ def do_train(args):
place = paddle.set_device(args.select_device)
fleet.init(is_collective=True)
worker_num = fleet.worker_num()
worker_index = fleet.worker_index()
# Create the random seed for the worker
set_seed(args.seed)
worker_init = WorkerInitObj(args.seed + fleet.worker_index())
worker_init = WorkerInitObj(args.seed + worker_index)
# Define the input data in the static mode
main_program = paddle.static.default_main_program()
......@@ -260,7 +304,7 @@ def do_train(args):
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
if args.use_amp:
if worker_num == 1 and args.use_amp:
amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
custom_white_list=['softmax', 'layer_norm', 'gelu'])
optimizer = paddle.fluid.contrib.mixed_precision.decorate(
......@@ -268,9 +312,10 @@ def do_train(args):
amp_list,
init_loss_scaling=args.scale_loss,
use_dynamic_loss_scaling=True)
# Use the fleet api to compile the distributed optimizer
strategy = fleet.DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
if worker_num > 1:
# Use the fleet api to compile the distributed optimizer
optimizer = dist_optimizer(args, optimizer)
optimizer.minimize(loss)
# Define the Executor for running the static model
......@@ -281,14 +326,14 @@ def do_train(args):
# Use the state dict to update the parameter
reset_state_dict = reset_program_state_dict(model, state_dict)
paddle.static.set_program_state(main_program, reset_state_dict)
# Construct the compiled program
main_program = build_compiled_program(main_program, loss)
if worker_num == 1:
# Construct the compiled program
main_program = build_compiled_program(main_program, loss)
pool = ThreadPoolExecutor(1)
global_step = 0
tic_train = time.time()
worker_num = fleet.worker_num()
worker_index = fleet.worker_index()
epoch = 0
while True:
files = [
......
......@@ -27,6 +27,12 @@ pool_size: 200000
sort_type: "global"
batch_size: 4096
infer_batch_size: 16
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and using multi-cards to train.
# Otherwise, the number of batches cannot be guaranteed.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -43,6 +43,12 @@ def create_data_loader(args):
mode=m, transform_func=transform_func) for m in ["train", "dev"]
]
if args.shuffle or args.shuffle_batch:
if args.shuffle_seed == "None" or args.shuffle_seed is None:
shuffle_seed = 0
else:
shuffle_seed = args.shuffle_seed
def _max_token_fn(current_idx, current_batch_size, tokens_sofar,
data_source):
return max(tokens_sofar,
......@@ -60,19 +66,17 @@ def create_data_loader(args):
min_max_filer, max_len=args.max_length))
sampler = SamplerHelper(dataset)
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
if args.sort_type == SortType.GLOBAL:
buffer_size = -1
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
trg_key = (lambda x, data_source: len(data_source[x][1]) + 1)
# Sort twice
sampler = sampler.sort(
key=trg_key, buffer_size=buffer_size).sort(
key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=trg_key).sort(key=src_key)
else:
sampler = sampler.shuffle()
if args.shuffle:
sampler = sampler.shuffle(seed=shuffle_seed)
max_key = (lambda x, data_source: max(len(data_source[x][0]), len(data_source[x][1])) + 1)
if args.sort_type == SortType.POOL:
buffer_size = args.pool_size
sampler = sampler.sort(key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=max_key, buffer_size=args.pool_size)
batch_sampler = sampler.batch(
batch_size=args.batch_size,
......@@ -80,6 +84,9 @@ def create_data_loader(args):
batch_size_fn=_max_token_fn,
key=_key)
if args.shuffle_batch:
batch_sampler.shuffle(seed=shuffle_seed)
if m == "train":
batch_sampler = batch_sampler.shard()
......
- [Embedding Model Summary](#embedding-模型汇总)
- [Chinese Word Embeddings](#中文词向量)
- [English Word Embeddings](#英文词向量)
- [GloVe](#glove)
- [FastText](#fasttext)
- [Model Information](#模型信息)
- [Acknowledgements](#致谢)
- [References](#参考论文)
# Embedding Model Summary
PaddleNLP provides a number of open-source pre-trained embedding models. Simply pass the name of a pre-trained model to `paddlenlp.embeddings.TokenEmbedding` to load the corresponding weights. The pre-trained embedding models supported by PaddleNLP are listed below; their names are used as the argument to `paddlenlp.embeddings.TokenEmbedding`. The naming convention is \${training model}.\${corpus}.\${embedding type}.\${co-occurrence type}.dim\${dimension}. Three training models are available: Word2Vec (w2v, trained with the skip-gram model), GloVe (glove) and FastText (fasttext).
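A minimal usage sketch (assuming paddlenlp 2.0+, where `TokenEmbedding` accepts an `embedding_name` and exposes `search()` and `cosine_sim()`; the model name below is one from the tables in this document):

```python
from paddlenlp.embeddings import TokenEmbedding

# Load one of the pre-trained embedding models listed below by name.
token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")

# Look up word vectors and compute a similarity score.
vectors = token_embedding.search(["中国", "北京"])   # one 300-dim vector per word
print(vectors.shape)
print(token_embedding.cosine_sim("中国", "北京"))
```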
......@@ -42,11 +51,91 @@ PaddleNLP提供多个开源的预训练Embedding模型,用户仅需在使用`p
## English Word Embeddings
To be updated.
### GloVe
| Corpus | 25-dim | 50-dim | 100-dim | 200-dim | 300-dim |
| ----------------- | ------ | ------ | ------ | ------ | ------ |
| Wiki2014 + GigaWord | N/A | glove.wiki2014-gigaword.target.word-word.dim50.en | glove.wiki2014-gigaword.target.word-word.dim100.en | glove.wiki2014-gigaword.target.word-word.dim200.en | glove.wiki2014-gigaword.target.word-word.dim300.en |
| Twitter | glove.twitter.target.word-word.dim25.en | glove.twitter.target.word-word.dim50.en | glove.twitter.target.word-word.dim100.en | glove.twitter.target.word-word.dim200.en | N/A |
### FastText
| Corpus | Name |
|------|------|
| Wiki2017 | fasttext.wiki-news.target.word-word.dim300.en |
| Crawl | fasttext.crawl.target.word-word.dim300.en |
## Model Information
| Model | File Size | Vocab Size |
|-----|---------|---------|
| w2v.baidu_encyclopedia.target.word-word.dim300 | 678.21 MB | 635965 |
| w2v.baidu_encyclopedia.target.word-character.char1-1.dim300 | 679.15 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-character.char1-2.dim300 | 679.30 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-character.char1-4.dim300 | 679.51 MB | 636038 |
| w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300 | 679.48 MB | 635977 |
| w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300 | 671.27 MB | 628669 |
| w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300 | 7.28 GB | 6969069 |
| w2v.baidu_encyclopedia.target.word-wordLR.dim300 | 678.22 MB | 635958 |
| w2v.baidu_encyclopedia.target.word-wordPosition.dim300 | 679.32 MB | 636038 |
| w2v.baidu_encyclopedia.target.bigram-char.dim300 | 679.29 MB | 635976 |
| w2v.baidu_encyclopedia.context.word-word.dim300 | 677.74 MB | 635952 |
| w2v.baidu_encyclopedia.context.word-character.char1-1.dim300 | 678.65 MB | 636200 |
| w2v.baidu_encyclopedia.context.word-character.char1-2.dim300 | 844.23 MB | 792631 |
| w2v.baidu_encyclopedia.context.word-character.char1-4.dim300 | 1.16 GB | 1117461 |
| w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300 | 7.25 GB | 6967598 |
| w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300 | 5.21 GB | 5000001 |
| w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300 | 7.26 GB | 6968998 |
| w2v.baidu_encyclopedia.context.word-wordLR.dim300 | 1.32 GB | 1271031 |
| w2v.baidu_encyclopedia.context.word-wordPosition.dim300 | 6.47 GB | 6293920 |
| w2v.wiki.target.bigram-char.dim300 | 375.98 MB | 352274 |
| w2v.wiki.target.word-char.dim300 | 375.52 MB | 352223 |
| w2v.wiki.target.word-word.dim300 | 374.95 MB | 352219 |
| w2v.wiki.target.word-bigram.dim300 | 375.72 MB | 352219 |
| w2v.people_daily.target.bigram-char.dim300 | 379.96 MB | 356055 |
| w2v.people_daily.target.word-char.dim300 | 379.45 MB | 355998 |
| w2v.people_daily.target.word-word.dim300 | 378.93 MB | 355989 |
| w2v.people_daily.target.word-bigram.dim300 | 379.68 MB | 355991 |
| w2v.weibo.target.bigram-char.dim300 | 208.24 MB | 195199 |
| w2v.weibo.target.word-char.dim300 | 208.03 MB | 195204 |
| w2v.weibo.target.word-word.dim300 | 207.94 MB | 195204 |
| w2v.weibo.target.word-bigram.dim300 | 208.19 MB | 195204 |
| w2v.sogou.target.bigram-char.dim300 | 389.81 MB | 365112 |
| w2v.sogou.target.word-char.dim300 | 389.89 MB | 365078 |
| w2v.sogou.target.word-word.dim300 | 388.66 MB | 364992 |
| w2v.sogou.target.word-bigram.dim300 | 388.66 MB | 364994 |
| w2v.zhihu.target.bigram-char.dim300 | 277.35 MB | 259755 |
| w2v.zhihu.target.word-char.dim300 | 277.40 MB | 259940 |
| w2v.zhihu.target.word-word.dim300 | 276.98 MB | 259871 |
| w2v.zhihu.target.word-bigram.dim300 | 277.53 MB | 259885 |
| w2v.financial.target.bigram-char.dim300 | 499.52 MB | 467163 |
| w2v.financial.target.word-char.dim300 | 499.17 MB | 467343 |
| w2v.financial.target.word-word.dim300 | 498.94 MB | 467324 |
| w2v.financial.target.word-bigram.dim300 | 499.54 MB | 467331 |
| w2v.literature.target.bigram-char.dim300 | 200.69 MB | 187975 |
| w2v.literature.target.word-char.dim300 | 200.44 MB | 187980 |
| w2v.literature.target.word-word.dim300 | 200.28 MB | 187961 |
| w2v.literature.target.word-bigram.dim300 | 200.59 MB | 187962 |
| w2v.sikuquanshu.target.word-word.dim300 | 20.70 MB | 19529 |
| w2v.sikuquanshu.target.word-bigram.dim300 | 20.77 MB | 19529 |
| w2v.mixed-large.target.word-char.dim300 | 1.35 GB | 1292552 |
| w2v.mixed-large.target.word-word.dim300 | 1.35 GB | 1292483 |
| glove.wiki2014-gigaword.target.word-word.dim50.en | 73.45 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim100.en | 143.30 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim200.en | 282.97 MB | 400002 |
| glove.wiki2014-gigaword.target.word-word.dim300.en | 422.83 MB | 400002 |
| glove.twitter.target.word-word.dim25.en | 116.92 MB | 1193516 |
| glove.twitter.target.word-word.dim50.en | 221.64 MB | 1193516 |
| glove.twitter.target.word-word.dim100.en | 431.08 MB | 1193516 |
| glove.twitter.target.word-word.dim200.en | 848.56 MB | 1193516 |
| fasttext.wiki-news.target.word-word.dim300.en | 541.63 MB | 999996 |
| fasttext.crawl.target.word-word.dim300.en | 1.19 GB | 2000002 |
## Acknowledgements
- Thanks to [Chinese-Word-Vectors](https://github.com/Embedding/Chinese-Word-Vectors) for providing the Chinese Word2Vec embeddings
- Thanks to [Chinese-Word-Vectors](https://github.com/Embedding/Chinese-Word-Vectors) for the pre-trained Chinese Word2Vec embeddings, the [GloVe Project](https://nlp.stanford.edu/projects/glove) for the pre-trained English GloVe embeddings, and the [FastText Project](https://fasttext.cc/docs/en/english-vectors.html) for the pre-trained English fastText embeddings
## References
- Li, Shen, et al. "Analogical reasoning on chinese morphological and semantic relations." arXiv preprint arXiv:1805.06504 (2018).
- Qiu, Yuanyuan, et al. "Revisiting correlations between intrinsic and extrinsic evaluations of word embeddings." Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data. Springer, Cham, 2018. 209-221.
- Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
- T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations
......@@ -39,17 +39,17 @@ DGU模型中的6个任务,分别采用不同的评估指标在test集上进行
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
```shell
pip install paddlenlp
pip install paddlenlp>=2.0.0b
```
* Environment requirements
Python 3.6+ is required; for other environment requirements, see the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/install/quick/zh/2.0rc-linux-docker)
Python 3.6+ is required
### Code structure
......
......@@ -18,7 +18,7 @@ PLATO-2的训练过程及其他细节详见 [Knover](https://github.com/PaddlePa
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
......@@ -28,13 +28,13 @@ PLATO-2的训练过程及其他细节详见 [Knover](https://github.com/PaddlePa
* Environment requirements
Python 3.6+ is required
Python 3.6+ is required
This project depends on sentencepiece and termcolor; install them before running the project
This project depends on sentencepiece and termcolor; install them before running the project
```shell
pip install sentencepiece termcolor
```
```shell
pip install sentencepiece termcolor
```
### Code structure
......
# BERT
## Model Introduction
[BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers) uses the [Transformer](https://arxiv.org/abs/1706.03762) encoder as its basic building block. It is pre-trained on large-scale unlabeled text with two objectives, Masked Language Model and Next Sentence Prediction, yielding a general-purpose semantic representation that fuses bidirectional context. Starting from this pre-trained representation and adding a simple task-specific output layer, fine-tuning adapts the model to downstream NLP tasks and usually outperforms models trained directly on the downstream task. BERT achieved SOTA results on the [GLUE benchmark](https://gluebenchmark.com/tasks).
This project is an open-source implementation of BERT on Paddle 2.0, including pre-training code and fine-tuning code for the [GLUE benchmark](https://gluebenchmark.com/tasks).
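Before the step-by-step instructions below, here is a minimal inference sketch that mirrors how `run_glue.py` in this project calls the tokenizer and model; the checkpoint name and `num_classes` are illustrative assumptions rather than anything prescribed by this project:

```python
import paddle
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer

# Illustrative checkpoint; any BERT weight name supported by paddlenlp works here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_classes=2)
model.eval()

# Same tokenize -> ids path used by run_glue.py: tokenizer(text) returns tokens.
tokens = ["[CLS]"] + tokenizer("paddlenlp makes bert easy to use .") + ["[SEP]"]
input_ids = paddle.to_tensor([tokenizer.convert_tokens_to_ids(tokens)])
segment_ids = paddle.zeros_like(input_ids)

with paddle.no_grad():
    logits = model(input_ids, segment_ids)  # shape: [1, num_classes]
print(logits.numpy())
```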
## Quick Start
### Installation
* PaddlePaddle installation
This project requires PaddlePaddle 2.0rc1 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start)
* PaddleNLP installation
```shell
pip install paddlenlp>=2.0.0b
```
### Data Preparation
#### Pre-training data preparation
`create_pretraining_data.py` builds the data needed by the pre-training program. It takes text files as input (newline-delimited, with blank-line separators; sample data is provided under the data directory), tokenizes them with the BERT tokenizer, generates positive/negative sentence-pair samples and masked tokens, and writes the result as an hdf5 file. Usage:
```shell
python create_pretraining_data.py \
--input_file=data/sample_text.txt \
--output_file=data/training_data.hdf5 \
--bert_model=bert-base-uncased \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
The parameters are described as follows:
- `input_file` specifies the input file; a directory may be given, in which case all `.txt` files in that directory are included.
- `output_file` specifies the output file.
- `bert_model` specifies which BERT model's tokenizer to use for tokenization.
- `max_seq_length` specifies the maximum sequence length; longer sequences are truncated and shorter ones are padded.
- `max_predictions_per_seq` is the maximum number of masked tokens per sequence.
- `masked_lm_prob` is the probability of masking each token.
- `random_seed` specifies the random seed.
- `dupe_factor` specifies how many times the input data is processed; each pass generates new random masks.
The data generation program above can also be used to process domain-specific data for continued pre-training. To use the English Wiki and BookCorpus data used for pre-training in the BERT paper, refer to [here](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) for processing; the resulting data can be fed directly into this project's pre-training program.
#### Fine-tuning data preparation
##### GLUE benchmark data
The datasets for the GLUE benchmark tasks are provided through paddlenlp's API, so no manual preparation is needed; they are downloaded automatically when fine-tuning with `run_glue.py`.
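As an illustration, the same dataset API used by `run_glue.py` can be called directly; the split names and printed fields below are assumptions based on that script:

```python
from paddlenlp.datasets import GlueSST2

# First use triggers an automatic download, as described above.
train_ds, dev_ds = GlueSST2.get_datasets(["train", "dev"])
print(train_ds.get_labels())  # label set of the task
print(train_ds[0])            # a raw (sentence, label) example
```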
### Running Pre-training
```shell
python -u ./run_pretrain.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--max_predictions_per_seq 20 \
--batch_size 32 \
--learning_rate 1e-4 \
--weight_decay 1e-2 \
--adam_epsilon 1e-6 \
--warmup_steps 10000 \
--num_train_epochs 3 \
--input_dir data/ \
--output_dir pretrained_models/ \
--logging_steps 1 \
--save_steps 20000 \
--max_steps 1000000 \
--n_gpu 1
```
The parameters are described as follows:
- `model_type` indicates the model type; set it to bert when using a BERT model.
- `model_name_or_path` indicates a model of a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, the corresponding directory path can be given here instead.
- `max_predictions_per_seq` is the maximum number of masked tokens per sequence, consistent with the setting used when creating the pre-training data.
- `batch_size` is the number of samples **per card** per iteration.
- `learning_rate` is the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
- `weight_decay` is the weight_decay coefficient used by the AdamW optimizer.
- `adam_epsilon` is the epsilon value used by the AdamW optimizer.
- `warmup_steps` is the number of steps of linear learning rate warmup (see the sketch after this list).
- `num_train_epochs` is the number of training epochs.
- `input_dir` is the input data directory; all files in it whose names contain "training" are used as training data.
- `output_dir` is the directory where models are saved.
- `logging_steps` is the logging interval.
- `save_steps` is the interval for saving and evaluating the model.
- `max_steps` is the maximum number of training steps. If training for `num_train_epochs` epochs would take more steps than this value, training stops early once `max_steps` is reached.
- `n_gpu` is the number of GPU cards to use. Set it to the desired number for multi-card training; if 0, the CPU is used.
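A minimal sketch of how `learning_rate`, `warmup_steps` and `max_steps` interact; the multiplier below (linear warmup followed by linear decay) illustrates the usual BERT schedule and is not necessarily the exact scheduler implemented in `run_pretrain.py`:

```python
# Illustrative values matching the command above.
base_lr, warmup_steps, max_steps = 1e-4, 10000, 1000000

def lr_multiplier(step, warmup=warmup_steps, total=max_steps):
    """Linear warmup to 1.0 at `warmup`, then linear decay to 0.0 at `total`."""
    if step < warmup:
        return step / max(1, warmup)
    return max(0.0, (total - step) / max(1, total - warmup))

for step in (0, 5000, 10000, 500000, 1000000):
    print(step, base_lr * lr_multiplier(step))
```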
### Running Fine-tuning
Taking the SST-2 task from GLUE as an example, fine-tuning is launched as follows:
```shell
python -u ./run_glue.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--task_name SST-2 \
--max_seq_length 128 \
--batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--logging_steps 1 \
--save_steps 500 \
--output_dir ./tmp/ \
--n_gpu 1
```
The parameters are described as follows:
- `model_type` indicates the model type; set it to bert when using a BERT model.
- `model_name_or_path` indicates a model of a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, the corresponding directory path can be given here instead.
- `task_name` is the fine-tuning task.
- `max_seq_length` is the maximum sequence length; longer sequences are truncated.
- `batch_size` is the number of samples **per card** per iteration.
- `learning_rate` is the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
- `num_train_epochs` is the number of training epochs.
- `logging_steps` is the logging interval.
- `save_steps` is the interval for saving and evaluating the model.
- `output_dir` is the path where models are saved.
- `n_gpu` is the number of GPU cards to use. Set it to the desired number for multi-card training; if 0, the CPU is used.
After fine-tuning `bert-base-uncased` on the GLUE tasks, the results on the dev sets are as follows:
| Task | Metric | Result |
|:-----:|:----------------------------:|:-----------------:|
| SST-2 | Accuracy | 0.92660 |
| QNLI | Accuracy | 0.91707 |
| CoLA  | Matthew's corr               | 0.59557           |
| MRPC | F1/Accuracy | 0.91667/0.88235 |
| STS-B | Pearson/Spearman corr        | 0.88847/0.88350   |
| QQP | Accuracy/F1 | 0.90581/0.87347 |
| MNLI | Matched acc/MisMatched acc | 0.84422/0.84825 |
| RTE | Accuracy | 0.711191 |
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers.tokenizer_utils import convert_to_unicode
import random
import collections
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions,
masked_lm_labels, is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def write_instance_to_example_file(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_file):
"""Create TF example files from `TrainingInstance`s."""
total_written = 0
features = collections.OrderedDict()
num_instances = len(instances)
features["input_ids"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["input_mask"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["segment_ids"] = np.zeros([num_instances, max_seq_length],
dtype="int32")
features["masked_lm_positions"] = np.zeros(
[num_instances, max_predictions_per_seq], dtype="int32")
features["masked_lm_ids"] = np.zeros(
[num_instances, max_predictions_per_seq], dtype="int32")
features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")
for inst_index, instance in enumerate(tqdm(instances)):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(
instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features["input_ids"][inst_index] = input_ids
features["input_mask"][inst_index] = input_mask
features["segment_ids"][inst_index] = segment_ids
features["masked_lm_positions"][inst_index] = masked_lm_positions
features["masked_lm_ids"][inst_index] = masked_lm_ids
features["next_sentence_labels"][inst_index] = next_sentence_label
total_written += 1
print("saving data")
f = h5py.File(output_file, 'w')
f.create_dataset("input_ids",
data=features["input_ids"],
dtype='i4',
compression='gzip')
f.create_dataset("input_mask",
data=features["input_mask"],
dtype='i1',
compression='gzip')
f.create_dataset("segment_ids",
data=features["segment_ids"],
dtype='i1',
compression='gzip')
f.create_dataset("masked_lm_positions",
data=features["masked_lm_positions"],
dtype='i4',
compression='gzip')
f.create_dataset("masked_lm_ids",
data=features["masked_lm_ids"],
dtype='i4',
compression='gzip')
f.create_dataset("next_sentence_labels",
data=features["next_sentence_labels"],
dtype='i1',
compression='gzip')
f.flush()
f.close()
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
print("creating instance from {}".format(input_file))
with open(input_file, "r") as reader:
while True:
line = convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
# tokens = tokenizer.tokenize(line)
tokens = tokenizer(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
# vocab_words = list(tokenizer.vocab.keys())
vocab_words = list(tokenizer.vocab.token_to_idx.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(all_documents, document_index,
max_seq_length, short_seq_prob,
masked_lm_prob,
max_predictions_per_seq,
vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(all_documents, document_index,
max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq,
vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(
0,
len(all_documents) - 1)
if random_document_index != document_index:
break
#If picked random document is the same as the current document
if random_document_index == document_index:
is_random_next = False
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq,
vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_file",
default=None,
type=str,
required=True,
help=
"The input train corpus. can be directory with .txt files or a path to a single file"
)
parser.add_argument(
"--output_file",
default=None,
type=str,
required=True,
help="The output file where created hdf5 formatted data will be written.")
parser.add_argument("--vocab_file",
default=None,
type=str,
required=False,
help="The vocabulary the BERT model will train on. "
"Use bert_model argument would ignore this. "
"The bert_model argument is recommended.")
parser.add_argument(
"--do_lower_case",
action='store_true',
default=True,
help=
"Whether to lower case the input text. True for uncased models, False for cased models. "
"Use bert_model argument would ignore this. The bert_model argument is recommended."
)
parser.add_argument(
"--bert_model",
default="bert-base-uncased",
type=str,
required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
"If provided, use the pre-trained model used tokenizer to create data "
"and ignore vocab_file and do_lower_case.")
## Other parameters
#int
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help=
"The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument(
"--dupe_factor",
default=10,
type=int,
help=
"Number of times to duplicate the input data (with different masks).")
parser.add_argument(
"--max_predictions_per_seq",
default=20,
type=int,
help="Maximum number of masked LM predictions per sequence.")
# floats
parser.add_argument("--masked_lm_prob",
default=0.15,
type=float,
help="Masked LM probability.")
parser.add_argument(
"--short_seq_prob",
default=0.1,
type=float,
help=
"Probability to create a sequence shorter than maximum sequence length")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
print(args)
if args.bert_model:
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
else:
assert args.vocab_file, (
"vocab_file must be set If bert_model is not provided.")
tokenizer = BertTokenizer(args.vocab_file,
do_lower_case=args.do_lower_case)
input_files = []
if os.path.isfile(args.input_file):
input_files.append(args.input_file)
elif os.path.isdir(args.input_file):
input_files = [
os.path.join(args.input_file, f)
for f in os.listdir(args.input_file)
if (os.path.isfile(os.path.join(args.input_file, f))
and f.endswith('.txt'))
]
else:
raise ValueError("{} is not a valid path".format(args.input_file))
rng = random.Random(args.random_seed)
instances = create_training_instances(input_files, tokenizer,
args.max_seq_length, args.dupe_factor,
args.short_seq_prob,
args.masked_lm_prob,
args.max_predictions_per_seq, rng)
output_file = args.output_file
write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_file)
if __name__ == "__main__":
main()
\ No newline at end of file
Zulfiqar A. Bhutta trained as a physician in Pakistan in the early stages of his career.
He holds titles across various organizations in diverse geographies.
Professor Bhutta is the Founding Director of the Center of Excellence in Women and Child Health & Institute for Global Child Health & Development, at the Aga Khan University South-Central Asia, East Africa & United Kingdom.
He is currently the Co-Director at the Centre for Global Child Health, at the Hospital for Sick Children and leads many projects as a Senior Scientist at the Research Institute in the Centre for Global Child Health at Sick Kids.
He holds a Professorship at the University of Toronto in the Department of Nutritional Sciences and the Division of Epidemiology, Dalla Lana School of Public Health.
Additionally, he holds concurrent professorship at the Department of Paediatrics, Aga Khan University in Karachi, Pakistan and at the Schools of Public Health of Johns Hopkins University, Tufts University, Boston University, University of Alberta and the London School of Hygiene & Tropical Medicine.
He is a designated Distinguished National Professor of the Government of Pakistan and was the Founding Chair of the National Research Ethics Committee of the Government of Pakistan from 2003-2014.
Dr. Bhutta received his MBBS from Khyber Medical College in Peshawar, Pakistan in 1977 at which time he was named "Best Graduate of the Year" and awarded the University Gold Medal for overall distinction.
His PhD work was completed at Karolinska Institute in Stockholm, Sweden in 1996.
He is a Fellow of the Royal College of Physicians (Edinburgh & London), the Royal College of Paediatrics and Child Health (London), American Academy of Paediatrics and the Pakistan Academy of Sciences.
Following the completion of his PhD Dr. Bhutta began working as House Surgeon in Obstetrics & Gynecology at the Khyber Teaching Hospital, Peshawar (April-November 1978).
He began work in paediatrics as a physician in November of 1978 in the Professorial Unit at the Institute of Child Health, Jinnah Postgraduate Medical Centre, Karachi (Pakistan).
Through 1980's he continued his work as a surgeon and paediatrician.
He undertook his first professor position in the Department of Paediatrics, The Aga Khan University Hospital, Karachi (Pakistan), from November 1987 to June 1992.
In 2005, Dr. Bhutta became the Chairman of the Department of Paediatrics & Child Health at the Aga Khan University & Medical Center, a position held until 2008.
Following his term as Chairman he became The Noordin Noormahomed Sheriff Professor & Founding Chair, Division of Women & Child Health, The Aga Khan University, a position he held for four years.
Dr. Bhutta currently holds the titles of co-director of the Centre for Global Child Health at the Hospital for Sick Children in Toronto, and founding director of the Centre of Excellence in Women and Child Health at the Aga Khan University.
In 2020, he was appointed founding director of the Institute for Global child Health & Development at the Aga Khan University and elected Fellow to the Royal Society, United Kingdom.
Outside of his professional responsibilities Dr. Bhutta serves on various local and international boards and committees, including a series of editorial boards.
In his various capacities Dr. Bhutta has produced a large collection of publications working with his teams at Sick Kids, AKU and international partners.
These include book reviews, chapters, 1.
"Haematological disorders" "Neonatal Jaundice" in Neonatal Vade‑Mecum, Fleming PJ, Speidel BD, Dunn PM Eds, Lloyd‑Luke Publishers, UK, 1986.
Revised 2nd Edition 1991.
2.
"Nutritional management of acute and persistent diarrhoea".
A M Molla, Bhutta Z A and  A Molla.
In McNeish A S, Mittal S K and Walker-Smith J A (eds).
Recent trends in diarrhoea and malnutrition, MAMC, Delhi, 1991, pp 37-51.
3.
"Paediatric Prescribing” in "Text book of Paediatrics for developing countries"            Arif MA, Hanif SM, Wasti SMK Eds, 1989, 2nd Edition 1996,  PPA, Karachi.
& Lahore 4.
"Innovations in neonatal care : Impact on neonatal survival in the developing world:.
Bhutta Z A  Zaidi S (Editor) 1992.
TWEL Publisher.
Karachi pp 121-131 5.
"Short course therapy in Pediatrics" Bhutta Z A& Teele D.  In Tice A D, Waldvogel F (Eds), Contemporary issues in Infectious Disease Epidemiology and Management, 1993 Gardiner Caldwell, Cheshire, pp 52 - 60.
6.
"Dietary management of persistent diarrhoea".
Bhutta Z A, Molla A M, Issani Z.
In Reflections on  Diarrhoeal Disease & Nutrition  of Children".
1993 Karachi, pp 97 - 103.
7.
"Prescribing practices amongst general practitioners (GPs) and consultant paediatricians in childhood diarrhoea.”  S.Q.
Nizami, I.A.
Khan, Bhutta Z A.
In "Reflections on Diarrhoeal Disease and Nutrition of Children".
1993 Karachi, pp  88-90.
8.
"The challenge of multidrug-resistant typhoid".
Bhutta Z A.
In Puri R K, Sachdev H P S, Choudhry P, Verma I C (Eds), Current concepts in Paediatrics, 1994.
Jaypee Publishers, New Delhi, pp 403.8.
9.
"Perinatal Care in Pakistan: Current status and trends".
In Proceedings of the Workshop in Reproductive Health.
College of Physicians and Surgeons, Pakistan, Karachi, 1995, pp 95-103.
10.
“A study of whole body protein kinetics in malnourished children with persistent diarrhoea” Bhutta Z A, Nizami SQ, Isani Z, Hardy S, Hendricks K, Young V.   Report of the second RCM coordinated Research Programme for application of stable isotope tracer methods to studies of energy metabolism in malnourished populations of developing countries.
NAHRES-30 1996 IAEA Vienna.
11.
"Pneumococcal infections in Pakistan: a country report".
In Adult Immunization in Asia, Fondation Mercel Merieux, Lyon, 1998. pp 79-82.
12.
“Factors affecting protein and aminoacid metabolism in childhood from developing countries".
In Child Nutrition: an international perspective.
Editors Solomons NW, Caballero B, Brown KH.
CRC Press 1998.
13.
"Protein Digestion and Bioavailability".
In Encyclopedia of Human Nutrition.
Editors: Sadler M, Strain JJ, Caballero B.
Academic Press (London), 1998 pp.1646-54.
14.
"Perinatal Care in Pakistan.
Reproductive Health: A manual for family practice and primary health care.
Bhutta Z A, Maqbool S.  College of Physicians and Surgeons, Pakistan, Karachi, 1999, pp 69-78.
15.
“Effective interventions to reduce neonatal mortality and morbidity from perinatal infection.
Bhutta ZA.
In Costello A, Manandhar D (eds).
"Improving Newborn Infant Health in Developing Countries’ 1999.
Imperial College Press, London pp.289-308.
16.
“Ambulatory management of typhoid fever”            “Risk factors and management of micronutrient deficiencies”            “Management of persistent diarrhoea in developing countries”.
In Manual of International Child Health, British Medical Journal, 2000 (in press).
17.
“The role of Cefixime in typhoid fever during childhood” in Cefixime, Adam D, Quintiliani R (Eds), Torre-Lazur-McCann, Tokyo, 2000; pp.107-112.
18.
"Micronutrients and Child Health in the Commonwealth”, Commonwealth Foundation" (UK) (2001).
19.
"Isotopic evaluation of breast milk intake, energy metabolism growth and body composition of exclusively breastfed infants in Pakistan".
Bhutta ZA, Nizami SQ, Weaver LT, Preston T. In Application of Stable Isotopes to evaluate Growth and Body Composition of Exclusively Breastfed infants, IAEA and WHO, NAHRES Report.
2000.
20.
“Typhoid Fever in Childhood: the south Asian experience”.
Ahmad K &Bhutta ZA.
In "Recent Advances in Paediatrics", Gupte S (Ed), 2000, India .
21.
“Neonatal Infections in developing countries” in  Carrera JM, Cabero L, Baraibar R (Eds).
The Perinatal Medicine of the new Millennium.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import sys
import random
import time
import math
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.metric import Metric, Accuracy, Precision, Recall
from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP, GlueMNLI, GlueQNLI, GlueRTE
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
TASK_CLASSES = {
"cola": (GlueCoLA, Mcc),
"sst-2": (GlueSST2, Accuracy),
"mrpc": (GlueMRPC, AccuracyAndF1),
"sts-b": (GlueSTSB, PearsonAndSpearman),
"qqp": (GlueQQP, AccuracyAndF1),
"mnli": (GlueMNLI, Accuracy),
"qnli": (GlueQNLI, Accuracy),
"rte": (GlueRTE, Accuracy),
}
MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
def parse_args():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train selected in the list: " +
", ".join(TASK_CLASSES.keys()), )
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " +
", ".join(MODEL_CLASSES.keys()), )
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])), )
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
"--learning_rate",
default=1e-4,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.", )
parser.add_argument(
"--logging_steps",
type=int,
default=100,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=100,
help="Save checkpoint every X updates steps.")
parser.add_argument(
"--batch_size",
default=32,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
"--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
"--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion"
)
parser.add_argument(
"--warmup_proportion",
default=0.,
type=float,
help="Linear warmup proportion over total steps.")
parser.add_argument(
"--adam_epsilon",
default=1e-6,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument(
"--seed", default=42, type=int, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
default=1,
type=int,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def set_seed(args):
random.seed(args.seed + paddle.distributed.get_rank())
np.random.seed(args.seed + paddle.distributed.get_rank())
paddle.seed(args.seed + paddle.distributed.get_rank())
def evaluate(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
correct = metric.compute(logits, labels)
metric.update(correct)
res = metric.accumulate()
if isinstance(metric, AccuracyAndF1):
logger.info(
"eval loss: %f, acc: %s, precision: %s, recall: %s, f1: %s, acc and f1: %s."
% (loss.numpy(), res[0], res[1], res[2], res[3], res[4]))
elif isinstance(metric, Mcc):
logger.info("eval loss: %f, mcc: %s." % (loss.numpy(), res[0]))
elif isinstance(metric, PearsonAndSpearman):
logger.info(
"eval loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s."
% (loss.numpy(), res[0], res[1], res[2]))
else:
logger.info("eval loss: %f, acc: %s." % (loss.numpy(), res))
model.train()
def convert_example(example,
tokenizer,
label_list,
max_seq_length=512,
is_test=False):
"""convert a glue example into necessary features"""
def _truncate_seqs(seqs, max_seq_length):
if len(seqs) == 1: # single sentence
# Account for [CLS] and [SEP] with "- 2"
seqs[0] = seqs[0][0:(max_seq_length - 2)]
else: # Sentence pair
# Account for [CLS], [SEP], [SEP] with "- 3"
tokens_a, tokens_b = seqs
max_seq_length -= 3
while True: # Truncate with longest_first strategy
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_seq_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
return seqs
def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
segment_ids = sum(
([i] * (len(seq) + len(sep))
for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
if isinstance(seq_mask, int):
seq_mask = [[seq_mask] * len(seq) for seq in seqs]
if isinstance(separator_mask, int):
separator_mask = [[separator_mask] * len(sep) for sep in separators]
p_mask = sum((s_mask + mask
for sep, seq, s_mask, mask in zip(
separators, seqs, seq_mask, separator_mask)), [])
return concat, segment_ids, p_mask
if not is_test:
# `label_list == None` is for regression task
label_dtype = "int64" if label_list else "float32"
# Get the label
label = example[-1]
example = example[:-1]
# Create label maps if classification task
if label_list:
label_map = {}
for (i, l) in enumerate(label_list):
label_map[l] = i
label = label_map[label]
label = np.array([label], dtype=label_dtype)
# Tokenize raw text
tokens_raw = [tokenizer(l) for l in example]
    # Truncate to max_seq_length
tokens_trun = _truncate_seqs(tokens_raw, max_seq_length)
    # Concatenate the sequences with special tokens
tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
len(tokens_trun))
# Convert the token to ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
valid_length = len(input_ids)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
# input_mask = [1] * len(input_ids)
if not is_test:
return input_ids, segment_ids, valid_length, label
else:
return input_ids, segment_ids, valid_length
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
args.task_name = args.task_name.lower()
dataset_class, metric_class = TASK_CLASSES[args.task_name]
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
train_dataset = dataset_class.get_datasets(["train"])
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_dataset.get_labels(),
max_seq_length=args.max_seq_length)
train_dataset = train_dataset.apply(trans_func, lazy=True)
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=args.batch_size, shuffle=True)
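    # Collate samples into a batch; the third field produced by convert_example
    # (the valid sequence length) is dropped here (i != 2) since the model does not use it.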
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Stack(), # length
Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label
): [data for i, data in enumerate(fn(samples)) if i != 2]
train_data_loader = DataLoader(
dataset=train_dataset,
batch_sampler=train_batch_sampler,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
if args.task_name == "mnli":
dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
["dev_matched", "dev_mismatched"])
dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
dev_dataset_mismatched = dev_dataset_mismatched.apply(
trans_func, lazy=True)
dev_batch_sampler_matched = paddle.io.BatchSampler(
dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
dev_data_loader_matched = DataLoader(
dataset=dev_dataset_matched,
batch_sampler=dev_batch_sampler_matched,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
dev_batch_sampler_mismatched = paddle.io.BatchSampler(
dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
dev_data_loader_mismatched = DataLoader(
dataset=dev_dataset_mismatched,
batch_sampler=dev_batch_sampler_mismatched,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
else:
dev_dataset = dataset_class.get_datasets(["dev"])
dev_dataset = dev_dataset.apply(trans_func, lazy=True)
dev_batch_sampler = paddle.io.BatchSampler(
dev_dataset, batch_size=args.batch_size, shuffle=False)
dev_data_loader = DataLoader(
dataset=dev_dataset,
batch_sampler=dev_batch_sampler,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
    num_classes = 1 if train_dataset.get_labels() is None else len(
train_dataset.get_labels())
model = model_class.from_pretrained(
args.model_name_or_path, num_classes=num_classes)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
int(math.floor(num_training_steps * args.warmup_proportion)))
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=warmup_steps,
num_training_steps=num_training_steps : float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
beta1=0.9,
beta2=0.999,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
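        # Apply weight decay only to parameters whose names contain neither
        # "bias" nor "norm" (i.e. skip biases and LayerNorm weights).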
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
) else paddle.nn.loss.MSELoss()
metric = metric_class()
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.logging_steps == 0:
logger.info(
"global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
% (global_step, num_training_steps, epoch, step,
paddle.distributed.get_rank(), loss, optimizer.get_lr(),
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
if global_step % args.save_steps == 0:
tic_eval = time.time()
if args.task_name == "mnli":
evaluate(model, loss_fct, metric, dev_data_loader_matched)
evaluate(model, loss_fct, metric,
dev_data_loader_mismatched)
logger.info("eval done total : %s s" %
(time.time() - tic_eval))
else:
evaluate(model, loss_fct, metric, dev_data_loader)
logger.info("eval done total : %s s" %
(time.time() - tic_eval))
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(
args.output_dir, "%s_ft_model_%d.pdparams" %
(args.task_name, global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
def print_arguments(args):
"""print arguments"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
args = parse_args()
print_arguments(args)
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import itertools
import logging
import os
import random
import time
import h5py
from functools import partial
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import paddle
import paddle.distributed as dist
from paddle.io import DataLoader, Dataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
MODEL_CLASSES = {
"bert": (BertForPretraining, BertTokenizer),
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " +
", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])),
)
parser.add_argument(
"--input_dir",
default=None,
type=str,
required=True,
help="The input directory where the data will be read from.",
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help=
"The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
)
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm",
default=1.0,
type=float,
help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help=
"If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps",
type=int,
default=500,
help="Log every X updates steps.")
parser.add_argument("--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.")
parser.add_argument("--seed",
type=int,
default=42,
help="random seed for initialization")
parser.add_argument("--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def set_seed(args):
random.seed(args.seed + paddle.distributed.get_rank())
np.random.seed(args.seed + paddle.distributed.get_rank())
paddle.seed(args.seed + paddle.distributed.get_rank())
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args,
worker_init):
train_data = PretrainingDataset(input_file=input_file,
max_pred_length=max_pred_length)
# files have been sharded, no need to dispatch again
train_batch_sampler = paddle.io.BatchSampler(train_data,
batch_size=args.batch_size,
shuffle=True)
    # DataLoader cannot be pickled because it holds a device place.
    # If it could be pickled, a global function (instead of a lambda) and a
    # ProcessPoolExecutor (instead of ThreadPoolExecutor) could be used to prefetch.
def _collate_data(data, stack_fn=Stack()):
num_fields = len(data[0])
out = [None] * num_fields
# input_ids, segment_ids, input_mask, masked_lm_positions,
# masked_lm_labels, next_sentence_labels, mask_token_num
for i in (0, 1, 2, 5):
out[i] = stack_fn([x[i] for x in data])
batch_size, seq_length = out[0].shape
size = num_mask = sum(len(x[3]) for x in data)
# Padding for divisibility by 8 for fp16 or int8 usage
if size % 8 != 0:
size += 8 - (size % 8)
# masked_lm_positions
# Organize as a 1D tensor for gather or use gather_nd
out[3] = np.full(size, 0, dtype=np.int64)
# masked_lm_labels
out[4] = np.full([size, 1], -1, dtype=np.int64)
mask_token_num = 0
for i, x in enumerate(data):
for j, pos in enumerate(x[3]):
out[3][mask_token_num] = i * seq_length + pos
out[4][mask_token_num] = x[4][j]
mask_token_num += 1
# mask_token_num
out.append(np.asarray([mask_token_num], dtype=np.float32))
return out
train_data_loader = DataLoader(dataset=train_data,
batch_sampler=train_batch_sampler,
collate_fn=_collate_data,
num_workers=0,
worker_init_fn=worker_init,
return_list=True)
return train_data_loader, input_file
class PretrainingDataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
keys = [
'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
'masked_lm_ids', 'next_sentence_labels'
]
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.inputs[0])
def __getitem__(self, index):
[
input_ids, input_mask, segment_ids, masked_lm_positions,
masked_lm_ids, next_sentence_labels
] = [
input[index].astype(np.int64)
if indice < 5 else np.asarray(input[index].astype(np.int64))
for indice, input in enumerate(self.inputs)
]
# TODO: whether to use reversed mask by changing 1s and 0s to be
# consistent with nv bert
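        # Convert the 0/1 padding mask into an additive attention bias of shape
        # [1, 1, seq_len]: 0 for real tokens, -1e9 for padding positions.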
input_mask = (1 - np.reshape(input_mask.astype(np.float32),
[1, 1, input_mask.shape[0]])) * -1e9
index = self.max_pred_length
        # Store the number of masked tokens in `index`.
        # Note: numpy.nonzero returns a tuple of index arrays, unlike torch.nonzero.
padded_mask_indices = (masked_lm_positions == 0).nonzero()[0]
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
mask_token_num = index
else:
index = 0
mask_token_num = 0
# masked_lm_labels = np.full(input_ids.shape, -1, dtype=np.int64)
# masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
masked_lm_labels = masked_lm_ids[:index]
masked_lm_positions = masked_lm_positions[:index]
# softmax_with_cross_entropy enforce last dim size equal 1
masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1)
next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1)
return [
input_ids, segment_ids, input_mask, masked_lm_positions,
masked_lm_labels, next_sentence_labels
]
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = BertForPretraining(
BertModel(**model_class.pretrained_init_configuration[
args.model_name_or_path]))
criterion = BertPretrainingCriterion(
getattr(model,
BertForPretraining.base_model_prefix).config["vocab_size"])
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
    # If the default last_epoch were used, the lr of the first iteration would be 0.
# Use `last_epoch = 0` to be consistent with nv bert.
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))),
last_epoch=0)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
pool = ThreadPoolExecutor(1)
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f))
and "training" in f
]
files.sort()
num_files = len(files)
random.Random(args.seed + epoch).shuffle(files)
f_start_id = 0
shared_file_list = {}
if paddle.distributed.get_world_size() > num_files:
remainder = paddle.distributed.get_world_size() % num_files
data_file = files[
(f_start_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank() + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * paddle.distributed.get_world_size()
+ paddle.distributed.get_rank()) % num_files]
previous_file = data_file
train_data_loader, _ = create_pretraining_dataset(
data_file, args.max_predictions_per_seq, shared_file_list, args,
worker_init)
# TODO(guosheng): better way to process single file
single_file = True if f_start_id + 1 == len(files) else False
for f_id in range(f_start_id, len(files)):
if not single_file and f_id == f_start_id:
continue
if paddle.distributed.get_world_size() > num_files:
data_file = files[(f_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank() +
remainder * f_id) % num_files]
else:
data_file = files[(f_id * paddle.distributed.get_world_size() +
paddle.distributed.get_rank()) % num_files]
previous_file = data_file
dataset_future = pool.submit(create_pretraining_dataset, data_file,
args.max_predictions_per_seq,
shared_file_list, args, worker_init)
for step, batch in enumerate(train_data_loader):
global_step += 1
(input_ids, segment_ids, input_mask, masked_lm_positions,
masked_lm_labels, next_sentence_labels,
masked_lm_scale) = batch
prediction_scores, seq_relationship_score = model(
input_ids=input_ids,
token_type_ids=segment_ids,
attention_mask=input_mask,
masked_positions=masked_lm_positions)
loss = criterion(prediction_scores, seq_relationship_score,
masked_lm_labels, next_sentence_labels,
masked_lm_scale)
if global_step % args.logging_steps == 0:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
logger.info(
"global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.save_steps == 0:
if (not args.n_gpu > 1
) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
paddle.save(
optimizer.state_dict(),
os.path.join(output_dir, "model_state.pdopt"))
if global_step >= args.max_steps:
del train_data_loader
return
del train_data_loader
train_data_loader, data_file = dataset_future.result(timeout=None)
if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
......@@ -18,15 +18,17 @@ ELMo (Embeddings from Language Models) is one of the important general-purpose semantic representation models
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* Environment dependencies
Python 3.6+ is required, and sklearn and gensim must be installed. For other environment requirements, please refer to the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html).
Python 3.6+ is required.
```shell
pip install sklearn gensim
```
This project depends on sklearn and gensim; please install them before running it.
```shell
pip install sklearn gensim
```
### Code structure
......
......@@ -5,8 +5,6 @@
## 1. Task description
This example implements an LSTM-based language model. Given an input word sequence (word-segmented for Chinese, tokenized for English), it computes the sequence's ppl (language-model perplexity, which reflects how fluent a sentence is). For an introduction to RNN-based language models, see [this paper](https://arxiv.org/abs/1409.2329). Compared with traditional methods, RNN-based methods handle sparse (rare) words better.
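As a rough illustration (not part of this project's code), perplexity is simply the exponential of the average per-token cross-entropy, so a minimal numpy sketch of the metric is:

```python
import numpy as np

def perplexity(token_nll):
    """Perplexity from per-token negative log-likelihoods (natural log)."""
    # ppl = exp(mean cross-entropy); lower means the model finds the text more fluent.
    return float(np.exp(np.mean(token_nll)))

# Example: an average NLL of 4.6 nats/token gives ppl ~ 99.5.
print(perplexity([4.2, 4.8, 4.9, 4.5]))
```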
**The language model currently requires PaddlePaddle 2.0 or later, or an appropriate develop version.**
## 2. Results
......@@ -27,6 +25,22 @@
## 1. Running the model for the first time
### Installation
* PaddlePaddle installation
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
```shell
pip install paddlenlp>=2.0.0b
```
* Environment dependencies
Python 3.6+ is required.
### Data preparation
To make testing easier for developers, a built-in data download script automatically downloads the PTB dataset by default.
......
......@@ -89,4 +89,3 @@ class UpdateModel(paddle.callbacks.Callback):
    # This callback resets the model hidden states and updates the learning rate before each epoch begins
def on_epoch_begin(self, epoch=None, logs=None):
self.model.network.reset_states()
# Language Model
## Transformer-XL
The brief directory structure and contents of this example are as follows:
```text
.
├── eval.py # prediction script
├── reader.py # data reading interface
├── README.md # documentation
├── train.py # training script
└── configs # configuration files
```
## Model overview
This project is a PaddlePaddle implementation of the Transformer-XL language model, covering model training, prediction, and so on.
## Quick start
### Installation
1. Install PaddlePaddle
This project requires PaddlePaddle 2.0rc or later (or an appropriate develop version). Please follow the [installation guide](https://www.paddlepaddle.org.cn/install/quick) to install it.
2. Download the code
Clone the repository to your local machine.
3. Environment dependencies
This model uses PaddlePaddle. For environment dependencies, please first refer to the environment-dependency section of the PaddlePaddle [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html).
In addition, the following packages also need to be installed:
* attrdict
* pyyaml
### Data preparation
The public datasets enwik8, text8, and wt103 are commonly used for language-model benchmarking. The data can be obtained and processed as follows:
```shell
bash gen_data.sh
```
The required data will be generated under ./gen_data/ in the current directory.
### Single-machine training
### Single GPU
Taking the provided enwik8 data as an example, run the following command to train the model:
```sh
# setting visible devices for training
export CUDA_VISIBLE_DEVICES=0
python train.py --config ./configs/enwik8.yaml
```
The relevant parameters, such as `batch_size` and `epoch`, can be set in the enwik8.yaml file; a loading sketch is shown below.
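The following is a minimal sketch of how the config is consumed, mirroring train.py in this directory (assuming it is run from this directory; the overridden values are only for illustration):

```python
import yaml
from attrdict import AttrDict

from train import do_train  # train.py in this directory

with open("./configs/enwik8.yaml", "rt") as f:
    args = AttrDict(yaml.safe_load(f))

# Illustrative overrides before launching training.
args.batch_size = 16
args.epoch = 30

do_train(args)
```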
### Multi-GPU (single machine)
Similarly, run the following command to train with eight GPUs:
```sh
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train.py --config ./configs/enwik8.yaml
```
### Inference
Taking the enwik8 data as an example, after training finishes, run the following command to make predictions:
```sh
# setting visible devices for prediction
export CUDA_VISIBLE_DEVICES=0
python eval.py --config ./configs/enwik8.yaml
```
After inference completes, the results on the validation and test sets will be displayed (loss plus bpc for enwik8/text8, or ppl for wt103), as sketched below.
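The reported numbers are simple transforms of the average per-token loss (see `_logger` in eval.py); a small sketch of the conversion, using an illustrative loss value:

```python
import numpy as np

avg_loss = 0.80  # average per-token loss in nats (illustrative value)

bpc = avg_loss / np.log(2)  # bits per character, reported for enwik8/text8
ppl = np.exp(avg_loss)      # perplexity, reported for wt103

print("bpc: %.3f, ppl: %.2f" % (bpc, ppl))  # bpc: 1.154, ppl: 2.23
```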
## References
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/enwik8/"
# The name of dataset
dataset: "enwik8"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 16
eval_batch_size: 2
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: False
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 512
# Dimension of heads.
d_head: 64
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2048
# Number of head used in multi-head attention.
n_head: 8
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 12
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 512
# Memory length. The length of the retained previous hidden states.
mem_len: 512
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 400000
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 128
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/text8/"
# The name of dataset
dataset: "text8"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 15
eval_batch_size: 5
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: False
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 512
# Dimension of heads.
d_head: 64
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2048
# Number of head used in multi-head attention.
n_head: 8
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 12
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 512
# Memory length. The length of the retained previous hidden states.
mem_len: 512
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 400000
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 128
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# Path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# Path of trained parameter, to make prediction
init_from_params: "./trained_models/step_final/"
# The directory for saving model
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The path to data files
data: "./gen_data/wikitext-103/"
# The name of dataset
dataset: "wt103"
# Whether to use cuda
use_gpu: True
# Args for reader, see reader.py for details
token_delimiter: None
batch_size: 32
eval_batch_size: 5
# Hyperparams for training:
# The number of epochs for training.
epoch: 30
# The hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate will be applied to the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.00025
# The hyper parameters for Adam optimizer.
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The hyper parameters for Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# The parameters for learning rate scheduling.
warmup_steps: 0
# The parameters for CosineAnnealingDecay. Minimum learning rate.
eta_min: 0.0
# The parameters for ReduceLROnPlateau.
# The ratio by which the learning rate will be reduced.
decay_rate: 0.5
# When the loss doesn't improve for this number of epochs, the learning rate will be reduced.
patience: 0
# The lower bound of the learning rate after reduction.
min_lr: 0.0
# Hyperparams for model:
# Whether to use adaptive softmax.
adaptive: True
# Size of dictionary. This can be obtained automatically.
ntokens: 10000
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 410
# Dimension of heads.
d_head: 41
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 2100
# Number of head used in multi-head attention.
n_head: 10
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 16
# Dropout rates.
dropout: 0.1
# Attention dropout
attn_dropout: 0.0
# Attention type for decoder.
# 0 for relative partial MHA (in Transformer-XL).
# 1 for relative MHA (in Shaw et al).
attn_type: 0
# Apply layer normalization before or after sublayers.
normalize_before: False
# Whether to tie weight or not.
tie_weight: True
# The length of the extended context.
ext_len: 0
# The divisor value for the adaptive softmax and adaptive input.
div_val: 1
# Target length. The number of tokens to predict.
tgt_len: 150
# Memory length. The length of the retained previous hidden states.
mem_len: 150
# Target length for evaluation. That is, the number of tokens to predict for evaluation.
eval_tgt_len: 150
# Use the same attention length for all tokens.
same_length: False
# Use the same positional encoding after clamp len.
clamp_len: -1
# The number of samples in sample softmax. -1 means do not use sampled softmax.
sample_softmax: -1
# Max step for training.
max_step: 200000
# Evaluation mode: "valid", "test", or both ("all").
mode: "all"
# Maximum evaluation step.
max_eval_steps: -1
import os
import time
import yaml
import logging
import argparse
import numpy as np
from pprint import pprint
from attrdict import AttrDict
import paddle
from reader import get_lm_vocab, get_lm_data_loader
from mem_transformer import MemTransformerLM
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config",
default="./configs/enwik8.yaml",
type=str,
help="Path of the config file. ")
args = parser.parse_args()
return args
def do_eval(args):
assert args.ext_len >= 0, 'Extended context length must be no less than 0'
def _evaluate(loader):
total_len, total_loss = 0, 0.
eval_mems = tuple()
for i, (src, target, seq_len) in enumerate(loader):
if args.max_eval_steps > 0 and i >= args.max_eval_steps:
break
ret = mem_transformer(src, target, *eval_mems)
loss, eval_mems = ret[0], ret[1:]
seq_len = seq_len.numpy()
eval_cur_loss = seq_len * loss.numpy()
total_loss += eval_cur_loss
total_len += seq_len
return total_loss / total_len
def _logger(loss):
if args.dataset in ['enwik8', 'text8']:
logger_info = "loss: %f, bpc: %f" % \
(loss, loss / np.log(2))
else:
logger_info = "loss: %f, ppl: %.2f" % \
(loss, np.exp(loss))
return logger_info
vocab = get_lm_vocab(args)
eval_loader = get_lm_data_loader(args, vocab, "valid")
test_loader = get_lm_data_loader(args, vocab, "test")
cutoffs, tie_projs = [], [False]
if args.adaptive:
assert args.dataset in ['wt103', 'lm1b']
if args.dataset == 'wt103':
cutoffs = [20000, 40000, 200000]
tie_projs += [True] * len(cutoffs)
elif args.dataset == 'lm1b':
cutoffs = [60000, 100000, 640000]
tie_projs += [False] * len(cutoffs)
mem_transformer = MemTransformerLM(
args.ntokens,
args.n_layer,
args.n_head,
args.d_model,
args.d_head,
args.d_inner_hid,
args.dropout,
args.attn_dropout,
tie_weight=args.tie_weight,
d_embed=args.d_model,
div_val=args.div_val,
tie_projs=tie_projs,
normalize_before=args.normalize_before,
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len,
cutoffs=cutoffs,
same_length=args.same_length,
attn_type=args.attn_type,
clamp_len=args.clamp_len,
sample_softmax=args.sample_softmax)
assert args.init_from_params, (
"Please set init_from_params to load the infer model.")
model_dict = paddle.load(
os.path.join(args.init_from_params, "mem_transformer.pdparams"))
mem_transformer.load_dict(model_dict)
logger.info(
"Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".
format(args.batch_size, args.tgt_len, args.ext_len, args.mem_len,
args.clamp_len))
mem_transformer.reset_length(args.tgt_len, args.ext_len, args.mem_len)
test_loss = None
valid_loss = None
if args.mode == 'all':
test_loss = _evaluate(test_loader)
valid_loss = _evaluate(eval_loader)
elif args.mode == 'valid':
valid_loss = _evaluate(eval_loader)
elif args.mode == 'test':
test_loss = _evaluate(test_loader)
logger_info = ''
if valid_loss is not None:
logger_info = logger_info + _logger(valid_loss)
if test_loss is not None:
logger_info = logger_info + _logger(test_loss)
logger.info(logger_info)
if __name__ == "__main__":
ARGS = parse_args()
yaml_file = ARGS.config
with open(yaml_file, 'rt') as f:
args = AttrDict(yaml.safe_load(f))
pprint(args)
do_eval(args)
echo "Downloading dataset..."
CUR_DIR=$PWD
mkdir -p gen_data
cd ./gen_data/
if [ ! -d "wikitext-103" ]; then
echo "Downloading wikitext-103..."
wget -O wikitext-103-v1.zip https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
echo "Unzip wikitext-103..."
unzip wikitext-103-v1.zip
cd wikitext-103
# Rename
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd -
fi
if [ ! -d 'enwik8' ]; then
mkdir -p enwik8
cd enwik8
echo "Downloading enwik8..."
wget -O enwik8.zip http://mattmahoney.net/dc/enwik8.zip
wget -O prep_enwik8.py https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
python3 prep_enwik8.py
rm -f prep_enwik8.py
cd -
fi
if [ ! -d 'text8' ]; then
mkdir -p text8
cd text8
echo "Downloading text8..."
wget -O text8.zip http://mattmahoney.net/dc/text8.zip
python ${CUR_DIR}/utils/preprocess_text8.py 5000000
cd -
fi
if [ ! -d 'one-billion-words' ]; then
mkdir -p one-billion-words
cd one-billion-words
echo "Downloading one-billion-words..."
wget -O 1-billion-word-language-modeling-benchmark-r13output.tar.gz http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
dir="./1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
cat ${dir}/news.en.heldout-00000-of-00050 > valid.txt
cat ${dir}/news.en.heldout-00000-of-00050 > test.txt
wget -O 1b_word_vocab.txt https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
cd -
fi
echo "All done. "
import os
import numpy as np
from paddlenlp.data import Vocab
import paddle
from paddle.io import IterableDataset, DataLoader
import paddle.distributed as dist
class LMDataset(IterableDataset):
def __init__(self, mode, vocab, path, dataset_name, batch_size, bptt,
ext_len, nranks, rank):
assert (mode in ["train", "valid", "test"]
), "Parameter mode must be one of [train, valid, test]."
super(LMDataset, self).__init__()
self.vocab = vocab
self.dataset_name = dataset_name
if self.dataset_name in ["wt103"]:
self.data = self.read_raw_data(
filename=os.path.join(path, mode + ".txt"), ordered=True)
elif self.dataset_name in ["enwik8", "text8"]:
self.data = self.read_raw_data(
filename=os.path.join(path, mode + ".txt"),
ordered=True,
add_eos=False)
else:
raise ValueError("Not supported dataset yet. ")
self.rank = rank
self.batch_size = batch_size
batch_size *= nranks
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.num_step = len(self.data) // batch_size
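        # Drop tail tokens that do not fill a whole global batch, then reshape
        # to [batch_size * nranks, num_step] so each row is a contiguous stream.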
data = self.data[:self.num_step * batch_size]
self.data = data.reshape([batch_size, -1])
# Number of samples
self.num_samples = (self.num_step + self.bptt - 1) // self.bptt
def __len__(self):
return self.num_samples
def __iter__(self):
for i in range(0, self.data.shape[1] - 1, self.bptt):
seq_len = min(self.bptt, self.data.shape[1] - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
src = self.data[:, beg_idx:end_idx]
target = self.data[:, i + 1:i + 1 + seq_len]
            # NOTE: `seq_len` will be transferred to numpy immediately
            # after being returned by DataLoader. Hence, `seq_len` can be
            # yielded as an `int`, and the returned tensor `seq_len` will
            # have an empty shape [].
            # However, if `seq_len` needs to be used as input to some
            # PaddlePaddle op, it must be returned as `[seq_len]`, whose
            # shape is [1], because some ops cannot take shape [] as input.
yield [
src[self.rank * self.batch_size:(self.rank + 1) *
self.batch_size], target[self.rank * self.batch_size:(
self.rank + 1) * self.batch_size], seq_len
]
def read_raw_data(self,
filename,
ordered=False,
lower_case=True,
delimiter=None,
add_eos=True,
add_double_eos=False):
        assert os.path.exists(filename), "%s does not exist. " % filename
data = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
tokens = LMDataset.tokenize(
line=line, delimiter=delimiter, lower_case=lower_case)
if add_double_eos: # for lm1b
tokens = [self.vocab._identifiers_to_tokens['bos_token']
] + tokens + [
self.vocab._identifiers_to_tokens['bos_token']
]
elif add_eos:
tokens = tokens + [
self.vocab._identifiers_to_tokens['eos_token']
]
data.append(
np.asarray(self.get_indices(tokens)).astype("int64"))
if ordered:
data = np.concatenate(data)
return data
def get_indices(self, tokens):
return self.vocab.to_indices(tokens)
@classmethod
def get_vocab(cls,
files,
max_size=None,
min_freq=0,
lower_case=True,
delimiter=None,
unk_token=None,
pad_token=None,
bos_token=None,
eos_token=None,
**kwargs):
return Vocab.build_vocab(
cls.data_iterator(
files=files, delimiter=delimiter, lower_case=lower_case),
max_size=max_size,
min_freq=min_freq,
unk_token=unk_token,
pad_token=pad_token,
bos_token=bos_token,
eos_token=eos_token)
@classmethod
def tokenize(cls, line, delimiter=None, lower_case=True):
line = line.strip()
if lower_case:
line = line.lower()
tokens = list(line) if delimiter == "" else line.split(delimiter)
return tokens
@classmethod
def data_iterator(cls, files, delimiter=None, lower_case=True):
if isinstance(files, str):
files = [files]
elif not isinstance(files, (list, tuple)):
raise ValueError(
"The parameter files must be a str or a list/tuple.")
for fl in files:
            assert os.path.exists(fl), "%s does not exist. " % fl
with open(fl, 'r', encoding='utf-8') as f:
for line in f:
tokens = cls.tokenize(
line=line, delimiter=delimiter, lower_case=lower_case)
yield tokens
def get_lm_data_loader(args, vocab, mode="train"):
lm_dataset = LMDataset(
mode=mode,
vocab=vocab,
path=args.data,
dataset_name=args.dataset,
batch_size=args.batch_size if mode == "train" else args.eval_batch_size,
bptt=args.tgt_len,
ext_len=args.ext_len,
nranks=dist.get_world_size() if mode == "train" else 1,
rank=dist.get_rank() if mode == "train" else 0)
data_loader = DataLoader(
dataset=lm_dataset, batch_size=None, num_workers=0, return_list=True)
return data_loader
def get_lm_vocab(args):
kwargs = {"unk_token": "<unk>"}
if args.token_delimiter == "None":
kwargs["delimiter"] = None
else:
kwargs["delimiter"] = args.token_delimiter
if args.dataset == "wt103":
kwargs["eos_token"] = "<eos>"
kwargs["lower_case"] = False
if args.dataset in ["enwik8", "text8"]:
files = [
os.path.join(args.data, "train.txt"),
os.path.join(args.data, "valid.txt"),
os.path.join(args.data, "test.txt")
]
elif args.dataset == "wt103":
files = [os.path.join(args.data, "train.txt")]
else:
raise ValueError("Not supported dataset yet. ")
vocab = LMDataset.get_vocab(files, **kwargs)
args.ntokens = len(vocab)
print("Finish processing vocabulary, and the size of vocabulary is {}".
format(args.ntokens))
return vocab
import os
import time
import yaml
import logging
import argparse
import numpy as np
from pprint import pprint
from attrdict import AttrDict
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from mem_transformer import MemTransformerLM
from reader import get_lm_vocab, get_lm_data_loader
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config",
default="./configs/enwik8.yaml",
type=str,
help="Path of the config file. ")
args = parser.parse_args()
return args
def do_train(args):
if args.use_gpu:
rank = dist.get_rank()
trainer_count = dist.get_world_size()
else:
rank = 0
trainer_count = 1
if trainer_count > 1:
dist.init_parallel_env()
random_seed = eval(str(args.random_seed))
if random_seed is not None:
paddle.seed(random_seed)
vocab = get_lm_vocab(args)
train_loader = get_lm_data_loader(args, vocab, "train")
eval_loader = get_lm_data_loader(args, vocab, "valid")
cutoffs, tie_projs = [], [False]
if args.adaptive:
assert args.dataset in ['wt103', 'lm1b']
if args.dataset == 'wt103':
cutoffs = [20000, 40000, 200000]
tie_projs += [True] * len(cutoffs)
elif args.dataset == 'lm1b':
cutoffs = [60000, 100000, 640000]
tie_projs += [False] * len(cutoffs)
mem_transformer = MemTransformerLM(
args.ntokens,
args.n_layer,
args.n_head,
args.d_model,
args.d_head,
args.d_inner_hid,
args.dropout,
args.attn_dropout,
tie_weight=args.tie_weight,
d_embed=args.d_model,
div_val=args.div_val,
tie_projs=tie_projs,
normalize_before=args.normalize_before,
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len,
cutoffs=cutoffs,
same_length=args.same_length,
attn_type=args.attn_type,
clamp_len=args.clamp_len,
sample_softmax=args.sample_softmax)
if args.scheduler == 'cosine':
scheduler = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=args.learning_rate,
T_max=args.max_step,
eta_min=args.eta_min)
elif args.scheduler == 'noam':
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=args.d_model,
warmup_steps=args.warmup_steps,
learning_rate=args.learning_rate)
elif args.scheduler == 'dev_perf':
# fluid api
scheduler = paddle.fluid.dygraph.ReduceLROnPlateau(
learning_rate=args.learning_rate,
decay_rate=args.decay_rate,
patience=args.patience,
            min_lr=args.min_lr)
elif args.scheduler == 'constant':
scheduler = args.learning_rate
clip = paddle.nn.ClipGradByGlobalNorm(args.clip)
if args.optim.lower() == 'momentum':
optimizer = paddle.optimizer.Momentum(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
momentum=args.mom,
grad_clip=clip)
elif args.optim.lower() == 'adam':
optimizer = paddle.optimizer.Adam(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
beta1=args.beta1,
beta2=args.beta2,
epsilon=eval(args.eps),
grad_clip=clip)
elif args.optim.lower() == 'adagrad':
optimizer = paddle.optimizer.Adagrad(
learning_rate=scheduler,
parameters=mem_transformer.parameters(),
grad_clip=clip)
# Init from some checkpoint, to resume the previous training
if args.init_from_checkpoint:
model_dict = paddle.load(
os.path.join(args.init_from_checkpoint, "mem_transformer.pdparams"))
opt_dict = paddle.load(
os.path.join(args.init_from_checkpoint, "mem_transformer.pdopt"))
mem_transformer.set_state_dict(model_dict)
optimizer.set_state_dict(opt_dict)
print("loaded from checkpoint.")
# Init from some pretrain models, to better solve the current task
if args.init_from_pretrain_model:
model_dict = paddle.load(
os.path.join(args.init_from_pretrain_model,
"mem_transformer.pdparams"))
mem_transformer.set_state_dict(model_dict)
print("loaded from pre-trained model.")
if trainer_count > 1:
mem_transformer = paddle.DataParallel(mem_transformer)
step_idx = 0
train_loss = 0.0
log_start_time = time.time()
for pass_id in range(args.epoch):
batch_id = 0
mems = tuple()
for input_data in train_loader:
(src, target, seq_len) = input_data
ret = mem_transformer(src, target, *mems)
loss = ret[0]
mems = ret[1:]
train_loss += loss.numpy()
loss.backward()
optimizer.step()
optimizer.clear_grad()
if step_idx > 0 and step_idx % args.print_step == 0 and rank == 0:
cur_loss = train_loss / args.print_step
elapsed = time.time() - log_start_time
if args.scheduler == "constant":
lr = optimizer.get_lr()
else:
lr = scheduler.get_lr()
logger_info = "step_idx: %d, epoch: %d, batch: %d, learning rate: %.8f, " \
"speed: %f ms/batch, loss: %f" % \
(step_idx, pass_id, batch_id, lr,
elapsed * 1000.0 / args.print_step, cur_loss)
if args.dataset in ["enwik8", "text8"]:
logger_info = logger_info + ", bpc: %f" % (cur_loss /
np.log(2))
else:
logger_info = logger_info + ", ppl: %f" % (np.exp(cur_loss))
logger.info(logger_info)
train_loss = 0.0
log_start_time = time.time()
if step_idx % args.save_step == 0 and step_idx != 0:
# Do validation.
mem_transformer.eval()
# TODO(FrostML): simplify this.
if args.mem_len == 0:
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len + args.tgt_len -
args.eval_tgt_len,
mem_len=args.mem_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len + args.tgt_len -
args.eval_tgt_len,
mem_len=args.mem_len)
else:
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len + args.tgt_len -
args.eval_tgt_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.eval_tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len + args.tgt_len -
args.eval_tgt_len)
total_len, total_loss = 0, 0.
eval_mems = tuple()
with paddle.no_grad():
for i, (src, target, seq_len) in enumerate(eval_loader):
if args.max_eval_steps > 0 and i >= args.max_eval_steps:
break
ret = mem_transformer(src, target, *eval_mems)
loss, eval_mems = ret[0], ret[1:]
seq_len = seq_len.numpy()
eval_cur_loss = seq_len * loss.numpy()
total_loss += eval_cur_loss
total_len += seq_len
eval_loss = total_loss / total_len
logger_info = "Validation, step_idx: %d, validation loss: %f" % \
(step_idx, eval_loss)
if args.dataset in ['enwik8', 'text8']:
logger_info = logger_info + ", bpc: %f" % (eval_loss /
np.log(2))
else:
logger_info = logger_info + ", ppl: %f" % (np.exp(eval_loss)
)
logger.info(logger_info)
if args.save_model and rank == 0:
model_dir = os.path.join(args.save_model,
"step_" + str(step_idx))
if not os.path.exists(model_dir):
os.makedirs(model_dir)
paddle.save(
mem_transformer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdparams"))
paddle.save(
optimizer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdopt"))
if args.scheduler == 'dev_perf':
scheduler.step(eval_loss)
# TODO(FrostML): simplify this.
if dist.get_world_size() == 1:
mem_transformer.reset_length(
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len)
else:
mem_transformer._layers.reset_length(
tgt_len=args.tgt_len,
ext_len=args.ext_len,
mem_len=args.mem_len)
mem_transformer.train()
step_idx += 1
batch_id += 1
if args.scheduler in ['cosine', 'dev_perf']:
if step_idx < args.warmup_steps:
curr_lr = args.learning_rate * step_idx / args.warmup_steps
scheduler.base_lr = curr_lr
else:
if args.scheduler == 'cosine':
scheduler.step()
elif args.scheduler == 'constant':
if step_idx < args.warmup_steps:
curr_lr = args.learning_rate * step_idx / args.warmup_steps
optimizer.set_lr(curr_lr)
elif args.scheduler == 'noam':
scheduler.step()
if step_idx >= args.max_step:
break
if args.save_model and rank == 0:
model_dir = os.path.join(args.save_model, "step_final")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
paddle.save(mem_transformer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdparams"))
paddle.save(optimizer.state_dict(),
os.path.join(model_dir, "mem_transformer.pdopt"))
if __name__ == "__main__":
ARGS = parse_args()
yaml_file = ARGS.config
with open(yaml_file, 'rt') as f:
args = AttrDict(yaml.safe_load(f))
pprint(args)
do_train(args)
import sys
import zipfile
import argparse
if __name__ == "__main__":
    zipfile.ZipFile("text8.zip").extractall()
    data = open("text8", "r", encoding="utf-8").read()
num_test_char = int(sys.argv[1])
train_data = data[:-2 * num_test_char]
valid_data = data[-2 * num_test_char:-num_test_char]
test_data = data[-num_test_char:]
for files, data in [("train.txt", train_data), ("valid.txt", valid_data),
("test.txt", test_data)]:
data_str = " ".join(["_" if c == " " else c for c in data.strip()])
with open(files, "w") as f:
f.write(data_str)
with open(files + ".raw", "w", encoding="utf-8") as fw:
fw.write(data)
......@@ -27,7 +27,7 @@ The DuReader-robust dataset is a single-passage, extractive reading-comprehension dataset. Specifically
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......
......@@ -181,7 +181,7 @@ def do_train(args):
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
(len(train_data_loader)*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
......
......@@ -41,7 +41,7 @@
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......
......@@ -27,7 +27,7 @@ SQuAD v2.0
* PaddlePaddle installation
This project requires PaddlePaddle 2.0 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project requires PaddlePaddle 2.0-rc1 or later. Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
* PaddleNLP installation
......@@ -56,7 +56,7 @@ python -u ./run_squad.py \
--batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--logging_steps 1000 \
--logging_steps 100 \
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
......
......@@ -19,7 +19,7 @@ Sequence to Sequence (Seq2Seq), using an encoder-decoder
This directory contains a classic Seq2Seq example: machine translation with an attention-based translation model. The Seq2Seq translation model imitates how a human translator works: first parse the source-language sentence and understand its meaning, then write the target-language sentence according to that meaning. For more on the principles and mathematical formulation of machine translation, we recommend the PaddlePaddle [machine translation tutorial](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/nlp_case/machine_translation/README.cn.html).
Running the example model in this directory requires PaddlePaddle 2.0-rc. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
Running the example model in this directory requires PaddlePaddle 2.0-rc1 or later. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
## Model overview
......
......@@ -24,9 +24,15 @@ use_gpu: True
# Args for reader, see reader.py for details
pool_size: 200000
sort_type: "pool"
sort_type: "global"
batch_size: 4096
infer_batch_size: 8
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and training with multiple cards.
# Otherwise, the number of batches on different cards cannot be guaranteed to be consistent.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -24,9 +24,15 @@ use_gpu: True
# Args for reader, see reader.py for details
pool_size: 200000
sort_type: "pool"
sort_type: "global"
batch_size: 4096
infer_batch_size: 8
shuffle_batch: True
# Data shuffle only works when sort_type is pool or none
shuffle: True
# shuffle_seed must be set when shuffle is True and training with multiple cards.
# Otherwise, the number of batches on different cards cannot be guaranteed to be consistent.
shuffle_seed: 128
# Hyparams for training:
# The number of epoches for training
......
......@@ -43,6 +43,12 @@ def create_data_loader(args):
mode=m, transform_func=transform_func) for m in ["train", "dev"]
]
if args.shuffle or args.shuffle_batch:
if args.shuffle_seed == "None" or args.shuffle_seed is None:
shuffle_seed = 0
else:
shuffle_seed = args.shuffle_seed
def _max_token_fn(current_idx, current_batch_size, tokens_sofar,
data_source):
return max(tokens_sofar,
......@@ -60,19 +66,17 @@ def create_data_loader(args):
min_max_filer, max_len=args.max_length))
sampler = SamplerHelper(dataset)
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
if args.sort_type == SortType.GLOBAL:
buffer_size = -1
src_key = (lambda x, data_source: len(data_source[x][0]) + 1)
trg_key = (lambda x, data_source: len(data_source[x][1]) + 1)
# Sort twice
sampler = sampler.sort(
key=trg_key, buffer_size=buffer_size).sort(
key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=trg_key).sort(key=src_key)
else:
sampler = sampler.shuffle()
if args.shuffle:
sampler = sampler.shuffle(seed=shuffle_seed)
max_key = (lambda x, data_source: max(len(data_source[x][0]), len(data_source[x][1])) + 1)
if args.sort_type == SortType.POOL:
buffer_size = args.pool_size
sampler = sampler.sort(key=src_key, buffer_size=buffer_size)
sampler = sampler.sort(key=max_key, buffer_size=args.pool_size)
batch_sampler = sampler.batch(
batch_size=args.batch_size,
......@@ -80,6 +84,9 @@ def create_data_loader(args):
batch_size_fn=_max_token_fn,
key=_key)
if args.shuffle_batch:
batch_sampler.shuffle(seed=shuffle_seed)
if m == "train":
batch_sampler = batch_sampler.shard()
......
......@@ -4,7 +4,7 @@ The BERT-base model is a general-purpose semantic representation model with strong transferability, but
## Compression results
Compression is applied to the fine-tuned `bert-base-uncased` results on the GLUE dev sets. The accuracy of the compressed model versus the uncompressed model on the GLUE dev sets is shown in the table below. The compressed model is about 2x faster than before compression, and the number of parameters is reduced by 26% (from 110M to 81M).
Compression is applied to the fine-tuned `bert-base-uncased` results on the GLUE dev sets. The accuracy of the compressed model versus the uncompressed model on the GLUE dev sets is shown in the table below:
| Task | Metric | Result | Result with PaddleSlim |
|:-----:|:----------------------------:|:-----------------:|:----------------------:|
......@@ -17,6 +17,7 @@ The BERT-base model is a general-purpose semantic representation model with strong transferability, but
| MNLI | Matched acc/MisMatched acc | 0.84422/0.84825 | 0.84687/0.85242 |
| RTE | Accuracy | 0.711191 | 0.718412 |
The compressed model is about 59% faster than before compression (test environment: T4, FP32, batch_size=16), and the number of parameters is reduced by 26% (from 110M to 81M).
## Quick start
This tutorial uses the GLUE/SST-2 dataset as an example.
......@@ -86,7 +87,7 @@ python -u ./run_glue_ofa.py --model_type bert \
- `n_gpu` is the number of GPUs to use. To train with multiple GPUs, set it to the desired number; if it is 0, the CPU is used.
- `width_mult_list` is the range of width choices for each Transformer block during compression training.
The dev results after compression training are shown in the "Result with PaddleSlim" column of the compression results table; the speed is 2x that of the original model.
The dev results after compression training are shown in the "Result with PaddleSlim" column of the compression results table; the speed is 59% faster than the original model.
## Compression principle
......
......@@ -124,3 +124,9 @@ python -u ./predict.py \
year={2020}
}
```
## Online tutorial
We provide an online tutorial for poetry generation; feel free to try it out:
* [Generating poetry with the PaddleNLP pre-trained model ERNIE-GEN](https://aistudio.baidu.com/aistudio/projectdetail/1339888)
Running the example model in this directory requires PaddlePaddle 2.0-rc. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
Running the example model in this directory requires PaddlePaddle 2.0-rc1 or later. If your installed PaddlePaddle version is lower than this requirement, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
# Variational Autoencoder (VAE) for Text Generation
The brief directory structure and contents of this example model are as follows:
......@@ -15,7 +15,7 @@
```
## Introduction
The implementation in this directory shows how to build a VAE for text generation with Paddle 2.0-rc, using LSTMs as the encoder and decoder. It is trained on the official PTB data and on the Yahoo dataset, respectively.
The implementation in this directory shows how to build a VAE for text generation with Paddle, using LSTMs as the encoder and decoder. It is trained on the official PTB data and on the Yahoo dataset, respectively.
For a detailed introduction to VAEs, see: [(Bowman et al., 2015) Generating Sentences from a Continuous Space](https://arxiv.org/pdf/1511.06349.pdf)
......