Release ERNIE 2.0

ERNIE 2.0 is a continual pre-training framework for language understanding in which pre-training tasks can be incrementally built and learned through multi-task learning

Release ERNIE 2.0
ERNIE 2.0 is a continual pre-training framework for language understanding in which pre-training tasks can be incrementally built and learned through multi-task learning
5c3b8cd3 · tianxin · 03504515 · 5c3b8cd3 · 5c3b8cd3 · 5c3b8cd3
89 changed files
--- a/.gitignore
+++ b/.gitignore
 *.pyc
+*.un~
--- a/.metas/ernie2.0_arch.png
+++ b/.metas/ernie2.0_arch.png
--- a/.metas/ernie2.0_model.png
+++ b/.metas/ernie2.0_model.png
--- a/ERNIE/.run_ce.sh
+++ b/ERNIE/.run_ce.sh
--- a/ERNIE/README.md
+++ b/ERNIE/README.md
+<div align="center">
+    <h1>
+        <font color="red">
+        ERNIE 项目已经迁移至 <a href="../README.zh.md">这里</a>
+        </font>
+    </h1>
+</div>
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
+&nbsp;
 ## ERNIE: **E**nhanced **R**epresentation through k**N**owledge **I**nt**E**gration
 **** **2019-04-10 更新**: update ERNIE_stable-1.0.1.tar.gz, 将模型参数、配置 ernie_config.json、vocab.txt 打包发布 ****
@@ -170,7 +197,7 @@ nlpcc-dbqa是由国际自然语言处理和中文计算会议NLPCC于2016年举
 | [模型](https://ernie.bj.bcebos.com/ERNIE_stable.tgz) | 包含预训练模型参数 |
 | [模型(含配置文件及词典)](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz)) | 包含预训练模型参数、词典 vocab.txt、模型配置 ernie_config.json|
-2) [任务数据下载](https://ernie.bj.bcebos.com/task_data.tgz)
+2) [任务数据下载](https://ernie.bj.bcebos.com/task_data_zh.tgz)
 ### 安装
 本项目依赖于 Paddle Fluid 1.3.1，请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。

--- a/README.md
+++ b/README.md
--- a/README.zh.md
+++ b/README.zh.md
--- a/ERNIE/__init__.py
+++ b/ERNIE/__init__.py
--- a/ERNIE/_ce.py
+++ b/ERNIE/_ce.py
--- a/ERNIE/batching.py
+++ b/ERNIE/batching.py
--- a/classify_infer.py
+++ b/classify_infer.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference by """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+# NOTE(paddle-dev): All of these flags should be
+# set before `import paddle`. Otherwise, it would
+# not take any effect.
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0'  # enable gc
+import paddle.fluid as fluid
+from paddle.fluid.core import PaddleBuf
+from paddle.fluid.core import PaddleDType
+from paddle.fluid.core import PaddleTensor
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import create_paddle_predictor
+from reader.task_reader import ClassifyReader
+from model.ernie import ErnieConfig
+from finetune.classifier import create_model
+from utils.args import ArgumentGroup, print_arguments
+from utils.init import init_pretraining_params
+from finetune_args import parser
+# yapf: disable
+# NOTE: this rebinds `parser`, so the object imported from `finetune_args`
+# above is replaced by a fresh parser that only knows the options below.
+# NOTE(review): the module docstring is passed as the first positional
+# argument of ArgumentParser -- confirm that is the intended slot.
+parser = argparse.ArgumentParser(__doc__)
+# Model options: config file, checkpoint to load, export directory, fp16 flag
+# and the number of output labels.
+model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.")
+model_g.add_arg("ernie_config_path",            str,  None,  "Path to the json file for bert model config.")
+model_g.add_arg("init_checkpoint",              str,  None,  "Init checkpoint to resume training from.")
+model_g.add_arg("save_inference_model_path",    str,  "inference_model",  "If set, save the inference model to this path.")
+model_g.add_arg("use_fp16",                     bool, False, "Whether to resume parameters from fp16 checkpoint.")
+model_g.add_arg("num_labels",                   int,  2,     "num labels for classify")
+# Data options: input file, vocabulary, label map and batching parameters.
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
+data_g.add_arg("predict_set",         str,  None,  "Predict set file")
+data_g.add_arg("vocab_path",          str,  None,  "Vocabulary path.")
+data_g.add_arg("label_map_config",    str,  None,  "Label_map_config json file.")
+data_g.add_arg("max_seq_len",         int,  128,   "Number of words of the longest seqence.")
+data_g.add_arg("batch_size",          int,  32,    "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("do_lower_case",       bool, True,
+               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
+# Run-type options: device selection and the prediction switch.
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("use_cuda",          bool,   True,  "If set, use GPU for training.")
+run_type_g.add_arg("do_prediction",     bool,   True,  "Whether to do prediction on test set.")
+# Arguments are parsed when the module is loaded, so `args` is a
+# module-level global.
+args = parser.parse_args()
+# yapf: enable.
+def main(args):
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+    reader = ClassifyReader(
+        vocab_path=args.vocab_path,
+        label_map_config=args.label_map_config,
+        max_seq_len=args.max_seq_len,
+        do_lower_case=args.do_lower_case,
+        in_tokens=False,
+        is_inference=True)
+    predict_prog = fluid.Program()
+    predict_startup = fluid.Program()
+    with fluid.program_guard(predict_prog, predict_startup):
+        with fluid.unique_name.guard():
+            predict_pyreader, probs, feed_target_names = create_model(
+                args,
+                pyreader_name='predict_reader',
+                ernie_config=ernie_config,
+                is_classify=True,
+                is_prediction=True)
+    predict_prog = predict_prog.clone(for_test=True)
+    if args.use_cuda:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    place = fluid.CUDAPlace(0) if args.use_cuda == True else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(predict_startup)
+    if args.init_checkpoint:
+        init_pretraining_params(exe, args.init_checkpoint, predict_prog)
+    else:
+        raise ValueError("args 'init_checkpoint' should be set for prediction!")
+    assert args.save_inference_model_path, "args save_inference_model_path should be set for prediction"
+    _, ckpt_dir = os.path.split(args.init_checkpoint.rstrip('/'))
+    dir_name = ckpt_dir + '_inference_model'
+    model_path = os.path.join(args.save_inference_model_path, dir_name)
+    print("save inference model to %s" % model_path)
+    fluid.io.save_inference_model(
+        model_path,
+        feed_target_names, [probs],
+        exe,
+        main_program=predict_prog)
+    # Set config
+    #config = AnalysisConfig(args.model_dir)
+    #config = AnalysisConfig(os.path.join(model_path, "__model__"), os.path.join(model_path, ""))
+    config = AnalysisConfig(model_path)
+    if not args.use_cuda:
+        print("disable gpu")
+        config.disable_gpu()
+    # Create PaddlePredictor
+    predictor = create_paddle_predictor(config)
+    predict_data_generator = reader.data_generator(
+        input_file=args.predict_set,
+        batch_size=args.batch_size,
+        epoch=1,
+        shuffle=False)
+    print("-------------- prediction results --------------")
+    np.set_printoptions(precision=4, suppress=True)
+    index = 0
+    total_time = 0
+    for sample in predict_data_generator():
+        src_ids    = sample[0]
+        sent_ids   = sample[1]
+        pos_ids    = sample[2]
+        task_ids   = sample[3]
+        input_mask = sample[4]
+        inputs = [array2tensor(ndarray) for ndarray in [src_ids, sent_ids, pos_ids, input_mask]]
+        begin_time = time.time()
+        outputs = predictor.run(inputs)
+        end_time = time.time()
+        total_time += end_time - begin_time
+        # parse outputs
+        output = outputs[0]
+        print(output.name)
+        output_data = output.data.float_data()
+        #assert len(output_data) == args.num_labels * args.batch_size
+        batch_result  = np.array(output_data).reshape((-1, args.num_labels))
+        for single_example_probs in batch_result:
+            print("{} example\t{}".format(index, single_example_probs))
+            index += 1
+    print("qps:{}\ttotal_time:{}\ttotal_example:{}\tbatch_size:{}".format(index/total_time, total_time, index, args.batch_size))
+def array2tensor(ndarray):
+    """ convert numpy array to PaddleTensor"""
+    assert isinstance(ndarray, np.ndarray), "input type must be np.ndarray"
+    tensor = PaddleTensor()
+    tensor.name = "data"
+    tensor.shape = ndarray.shape
+    if "float" in str(ndarray.dtype):
+        tensor.dtype = PaddleDType.FLOAT32
+    elif "int" in str(ndarray.dtype):
+        tensor.dtype = PaddleDType.INT64
+    else:
+        raise ValueError("{} type ndarray is unsupported".format(tensor.dtype))
+    tensor.data = PaddleBuf(ndarray.flatten().tolist())
+    return tensor
+# Script entry point: echo the parsed arguments, then run export + inference.
+if __name__ == '__main__':
+    print_arguments(args)
+    main(args)
--- a/ERNIE/config/ernie_config.json
+++ b/ERNIE/config/ernie_config.json
--- a/ERNIE/config/vocab.txt
+++ b/ERNIE/config/vocab.txt
--- a/config/vocab_en.txt
+++ b/config/vocab_en.txt
--- a/ERNIE/data/demo_train_set.gz
+++ b/ERNIE/data/demo_train_set.gz
--- a/ERNIE/data/demo_valid_set.gz
+++ b/ERNIE/data/demo_valid_set.gz
--- a/ERNIE/data/train_filelist
+++ b/ERNIE/data/train_filelist
--- a/ERNIE/data/valid_filelist
+++ b/ERNIE/data/valid_filelist
--- a/ERNIE/ernie_encoder.py
+++ b/ERNIE/ernie_encoder.py
@@ -55,19 +55,21 @@ def create_model(args, pyreader_name, ernie_config):
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
+                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-        dtypes=['int64', 'int64', 'int64', 'float', 'int64'],
+                [-1, args.max_seq_len, 1], [-1, 1]],
-        lod_levels=[0, 0, 0, 0, 0],
+        dtypes=['int64', 'int64', 'int64', 'int64', 'float', 'int64'],
+        lod_levels=[0, 0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)
-    (src_ids, sent_ids, pos_ids, input_mask,
+    (src_ids, sent_ids, pos_ids, task_ids, input_mask,
     seq_lens) = fluid.layers.read_file(pyreader)
    ernie = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
+        task_ids=task_ids,
        input_mask=input_mask,
        config=ernie_config)
@@ -154,8 +156,8 @@ def main(args):
            cls_emb, unpad_top_layer_emb = exe.run(
                program=infer_program,
                fetch_list=[
-                    graph_vars["cls_embeddings"].name, graph_vars[
+                    graph_vars["cls_embeddings"].name,
-                        "top_layer_embeddings"].name
+                    graph_vars["top_layer_embeddings"].name
                ],
                return_numpy=False)
            # batch_size * embedding_size

--- a/ERNIE/finetune/__init__.py
+++ b/ERNIE/finetune/__init__.py
--- a/ERNIE/finetune/classifier.py
+++ b/ERNIE/finetune/classifier.py
@@ -20,30 +20,55 @@ from __future__ import print_function
 import time
 import numpy as np
+from scipy.stats import pearsonr, spearmanr
 from six.moves import xrange
 import paddle.fluid as fluid
 from model.ernie import ErnieModel
-def create_model(args, pyreader_name, ernie_config, is_prediction=False):
+def create_model(args,
+                 pyreader_name,
+                 ernie_config,
+                 is_prediction=False,
+                 task_name="",
+                 is_classify=False,
+                 is_regression=False,
+                 ernie_version="1.0"):
+    if is_classify:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1],
+                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, 1]],
+                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
-        dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
+            dtypes=[
-        lod_levels=[0, 0, 0, 0, 0, 0],
+                'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
-        name=pyreader_name,
+            ],
+            lod_levels=[0, 0, 0, 0, 0, 0, 0],
+            name=task_name + "_" + pyreader_name,
+            use_double_buffer=True)
+    elif is_regression:
+        pyreader = fluid.layers.py_reader(
+            capacity=50,
+            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
+            dtypes=[
+                'int64', 'int64', 'int64', 'int64', 'float32', 'float32',
+                'int64'
+            ],
+            lod_levels=[0, 0, 0, 0, 0, 0, 0],
+            name=task_name + "_" + pyreader_name,
            use_double_buffer=True)
-    (src_ids, sent_ids, pos_ids, input_mask, labels,
+    (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
     qids) = fluid.layers.read_file(pyreader)
    ernie = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
+        task_ids=task_ids,
        input_mask=input_mask,
        config=ernie_config,
        use_fp16=args.use_fp16)
@@ -57,28 +82,29 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
        input=cls_feats,
        size=args.num_labels,
        param_attr=fluid.ParamAttr(
-            name="cls_out_w",
+            name=task_name + "_cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
-            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+            name=task_name + "_cls_out_b",
+            initializer=fluid.initializer.Constant(0.)))
    if is_prediction:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
        ]
+        if ernie_version == "2.0":
+            feed_targets_name += [task_ids.name]
        return pyreader, probs, feed_targets_name
+    assert is_classify != is_regression, 'is_classify or is_regression must be true and only one of them can be true'
+    num_seqs = fluid.layers.create_tensor(dtype='int64')
+    if is_classify:
        ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
            logits=logits, label=labels, return_softmax=True)
        loss = fluid.layers.mean(x=ce_loss)
+        accuracy = fluid.layers.accuracy(
-    if args.use_fp16 and args.loss_scaling > 1.0:
+            input=probs, label=labels, total=num_seqs)
-        loss *= args.loss_scaling
-    num_seqs = fluid.layers.create_tensor(dtype='int64')
-    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
        graph_vars = {
            "loss": loss,
            "probs": probs,
@@ -87,9 +113,19 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
            "num_seqs": num_seqs,
            "qids": qids
        }
+    elif is_regression:
-    for k, v in graph_vars.items():
+        cost = fluid.layers.square_error_cost(input=logits, label=labels)
-        v.persistable = True
+        loss = fluid.layers.mean(x=cost)
+        graph_vars = {
+            "loss": loss,
+            "probs": logits,
+            "labels": labels,
+            "num_seqs": num_seqs,
+            "qids": qids
+        }
+    else:
+        raise ValueError(
+            'unsupported fine tune mode. only supported classify/regression')
    return pyreader, graph_vars
@@ -144,7 +180,15 @@ def evaluate_map(preds):
    return total_map / qnum
-def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
+def evaluate_classify(exe,
+                      test_program,
+                      test_pyreader,
+                      graph_vars,
+                      eval_phase,
+                      use_multi_gpu_test=False,
+                      metric='simple_accuracy',
+                      is_classify=False,
+                      is_regression=False):
    train_fetch_list = [
        graph_vars["loss"].name, graph_vars["accuracy"].name,
        graph_vars["num_seqs"].name
@@ -161,7 +205,7 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs, total_label_pos_num, total_pred_pos_num, total_correct_num = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
-    qids, labels, scores = [], [], []
+    qids, labels, scores, preds = [], [], [], []
    time_begin = time.time()
    fetch_list = [
@@ -171,6 +215,10 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
    ]
    while True:
        try:
+            if use_multi_gpu_test:
+                np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
+                    fetch_list=fetch_list)
+            else:
                np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
                    program=test_program, fetch_list=fetch_list)
            total_cost += np.sum(np_loss * np_num_seqs)
@@ -182,6 +230,7 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
            qids.extend(np_qids.reshape(-1).tolist())
            scores.extend(np_probs[:, 1].reshape(-1).tolist())
            np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
+            preds.extend(np_preds)
            total_label_pos_num += np.sum(np_labels)
            total_pred_pos_num += np.sum(np_preds)
            total_correct_num += np.sum(np.dot(np_preds, np_labels))
@@ -189,25 +238,221 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
            test_pyreader.reset()
            break
    time_end = time.time()
+    cost = total_cost / total_num_seqs
+    elapsed_time = time_end - time_begin
+    evaluate_info = ""
+    if metric == 'acc_and_f1':
+        ret = acc_and_f1(preds, labels)
+        evaluate_info = "[%s evaluation] ave loss: %f, ave_acc: %f, f1: %f, data_num: %d, elapsed time: %f s" \
+            % (eval_phase, cost, ret['acc'], ret['f1'], total_num_seqs, elapsed_time)
+    elif metric == 'matthews_corrcoef':
+        ret = matthews_corrcoef(preds, labels)
+        evaluate_info = "[%s evaluation] ave loss: %f, matthews_corrcoef: %f, data_num: %d, elapsed time: %f s" \
+            % (eval_phase, cost, ret, total_num_seqs, elapsed_time)
+    elif metric == 'pearson_and_spearman':
+        ret = pearson_and_spearman(scores, labels)
+        evaluate_info = "[%s evaluation] ave loss: %f, pearson:%f, spearman:%f, corr:%f, data_num: %d, elapsed time: %f s" \
+            % (eval_phase, cost, ret['pearson'], ret['spearman'], ret['corr'], total_num_seqs, elapsed_time)
+    elif metric == 'simple_accuracy':
+        ret = simple_accuracy(preds, labels)
+        evaluate_info = "[%s evaluation] ave loss: %f, acc:%f, data_num: %d, elapsed time: %f s" \
+            % (eval_phase, cost, ret, total_num_seqs, elapsed_time)
+    elif metric == "acc_and_f1_and_mrr":
+        ret_a = acc_and_f1(preds, labels)
+        preds = sorted(
+            zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
+        ret_b = evaluate_mrr(preds)
+        evaluate_info = "[%s evaluation] ave loss: %f, acc: %f, f1: %f, mrr: %f, data_num: %d, elapsed time: %f s" \
+            % (eval_phase, cost, ret_a['acc'], ret_a['f1'], ret_b, total_num_seqs, elapsed_time)
+    else:
+        raise ValueError('unsupported metric {}'.format(metric))
+    return evaluate_info
+def evaluate_regression(exe,
+                        test_program,
+                        test_pyreader,
+                        graph_vars,
+                        eval_phase,
+                        use_multi_gpu_test=False,
+                        metric='pearson_and_spearman'):
+    """Run one training fetch or a full evaluation pass for a regression task.
+
+    When ``eval_phase`` is "train", fetches the loss (and the learning rate
+    if ``graph_vars`` has one) once and returns them as a dict. Otherwise
+    drains ``test_pyreader``, collects predictions and labels, and returns a
+    formatted summary string for the requested metric.
+
+    Raises:
+        ValueError: If ``metric`` is not 'pearson_and_spearman'.
+    """
+    if eval_phase == "train":
+        train_fetch_list = [graph_vars["loss"].name]
+        if "learning_rate" in graph_vars:
+            train_fetch_list.append(graph_vars["learning_rate"].name)
+        outputs = exe.run(fetch_list=train_fetch_list)
+        ret = {"loss": np.mean(outputs[0])}
+        if "learning_rate" in graph_vars:
+            ret["learning_rate"] = float(outputs[1][0])
+        return ret
+    test_pyreader.start()
+    # NOTE(review): total_cost and total_num_seqs are initialised but never
+    # updated or read in this function.
+    total_cost, total_num_seqs = 0.0, 0.0
+    qids, labels, scores = [], [], []
+    fetch_list = [
+        graph_vars["loss"].name, graph_vars["probs"].name,
+        graph_vars["labels"].name, graph_vars["qids"].name
+    ]
+    time_begin = time.time()
+    while True:
+        try:
+            # With use_multi_gpu_test the executor is called without an
+            # explicit program; otherwise test_program is passed.
+            if use_multi_gpu_test:
+                np_loss, np_probs, np_labels, np_qids = exe.run(
+                    fetch_list=fetch_list)
+            else:
+                np_loss, np_probs, np_labels, np_qids = exe.run(
+                    program=test_program, fetch_list=fetch_list)
+            labels.extend(np_labels.reshape((-1)).tolist())
+            if np_qids is None:
+                np_qids = np.array([])
+            qids.extend(np_qids.reshape(-1).tolist())
+            scores.extend(np_probs.reshape(-1).tolist())
+        except fluid.core.EOFException:
+            # The reader is exhausted: reset it and stop.
+            test_pyreader.reset()
+            break
+    time_end = time.time()
-    if len(qids) == 0:
+    elapsed_time = time_end - time_begin
-        print(
-            "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s"
+    if metric == 'pearson_and_spearman':
-            % (eval_phase, total_cost / total_num_seqs, total_acc /
+        ret = pearson_and_spearman(scores, labels)
-               total_num_seqs, total_num_seqs, time_end - time_begin))
+        # NOTE(review): the "ave loss" field is filled with the constant 0.0;
+        # the fetched np_loss is not accumulated anywhere.
+        evaluate_info = "[%s evaluation] ave loss: %f, pearson:%f, spearman:%f, corr:%f, elapsed time: %f s" \
+            % (eval_phase, 0.0, ret['pearson'], ret['spearmanr'], ret['corr'], elapsed_time)
+    else:
+        raise ValueError('unsupported metric {}'.format(metric))
+    return evaluate_info
+def evaluate(exe,
+             test_program,
+             test_pyreader,
+             graph_vars,
+             eval_phase,
+             use_multi_gpu_test=False,
+             metric='simple_accuracy',
+             is_classify=False,
+             is_regression=False):
+    """Dispatch to the classification or regression evaluation routine.
+
+    ``is_classify`` selects ``evaluate_classify``; every other call goes to
+    ``evaluate_regression``. ``is_regression`` is accepted but never read
+    here.
+    """
+    if is_classify:
+        return evaluate_classify(
+            exe,
+            test_program,
+            test_pyreader,
+            graph_vars,
+            eval_phase,
+            use_multi_gpu_test=use_multi_gpu_test,
+            metric=metric)
    else:
-        r = total_correct_num / total_label_pos_num
+        return evaluate_regression(
-        p = total_correct_num / total_pred_pos_num
+            exe,
-        f = 2 * p * r / (p + r)
+            test_program,
+            test_pyreader,
+            graph_vars,
+            eval_phase,
+            use_multi_gpu_test=use_multi_gpu_test,
+            metric=metric)
+def matthews_corrcoef(preds, labels):
+    preds = np.array(preds)
+    labels = np.array(labels)
+    tp = np.sum((labels == 1) & (preds == 1))
+    tn = np.sum((labels == 0) & (preds == 0))
+    fp = np.sum((labels == 0) & (preds == 1))
+    fn = np.sum((labels == 1) & (preds == 0))
+    mcc = ((tp * tn) - (fp * fn)) / np.sqrt(
+        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+    return mcc
+def f1_score(preds, labels):
+    preds = np.array(preds)
+    labels = np.array(labels)
+    tp = np.sum((labels == 1) & (preds == 1))
+    tn = np.sum((labels == 0) & (preds == 0))
+    fp = np.sum((labels == 0) & (preds == 1))
+    fn = np.sum((labels == 1) & (preds == 0))
+    p = tp / (tp + fp)
+    r = tp / (tp + fn)
+    f1 = (2 * p * r) / (p + r + 1e-8)
+    return f1
+def pearson_and_spearman(preds, labels):
+    preds = np.array(preds)
+    labels = np.array(labels)
+    pearson_corr = pearsonr(preds, labels)[0]
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,
+    }
-        assert len(qids) == len(labels) == len(scores)
-        preds = sorted(
+def acc_and_f1(preds, labels):
+    """Return accuracy, F1 and their mean for the given predictions.
+
+    The result is a dict with keys "acc", "f1" and "acc_and_f1", computed
+    via ``simple_accuracy`` and ``f1_score``.
+    """
-            zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
+    preds = np.array(preds)
-        mrr = evaluate_mrr(preds)
+    labels = np.array(labels)
-        map = evaluate_map(preds)
+    acc = simple_accuracy(preds, labels)
-        print(
+    f1 = f1_score(preds, labels)
-            "[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s"
+    return {
-            % (eval_phase, total_cost / total_num_seqs,
+        "acc": acc,
-               total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs,
+        "f1": f1,
-               time_end - time_begin))
+        "acc_and_f1": (acc + f1) / 2,
+    }
+def simple_accuracy(preds, labels):
+    preds = np.array(preds)
+    labels = np.array(labels)
+    return (preds == labels).mean()
+def predict(exe,
+            test_program,
+            test_pyreader,
+            graph_vars,
+            dev_count=1,
+            is_classify=False,
+            is_regression=False):
+    test_pyreader.start()
+    qids, scores, probs = [], [], []
+    preds = []
+    fetch_list = [graph_vars["probs"].name, graph_vars["qids"].name]
+    while True:
+        try:
+            if dev_count == 1:
+                np_probs, np_qids = exe.run(program=test_program,
+                                            fetch_list=fetch_list)
+            else:
+                np_probs, np_qids = exe.run(fetch_list=fetch_list)
+            if np_qids is None:
+                np_qids = np.array([])
+            qids.extend(np_qids.reshape(-1).tolist())
+            if is_classify:
+                np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
+                preds.extend(np_preds)
+            elif is_regression:
+                preds.extend(np_probs.reshape(-1))
+            probs.append(np_probs)
+        except fluid.core.EOFException:
+            test_pyreader.reset()
+            break
+    probs = np.concatenate(probs, axis=0).reshape([len(preds), -1])
+    return qids, preds, probs
--- a/finetune/mrc.py
+++ b/finetune/mrc.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model for classifier."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import time
+import numpy as np
+import os
+import math
+import json
+import collections
+import six
+from scipy.stats import pearsonr, spearmanr
+from six.moves import xrange
+import paddle.fluid as fluid
+from utils.cmrc2018_eval import eval_file
+from model.ernie import ErnieModel
+import tokenization
+def create_model(args, pyreader_name, ernie_config, is_training):
+    """Build the machine-reading-comprehension graph on top of ERNIE.
+
+    Creates a py_reader with eight input slots, runs the ERNIE encoder, adds
+    a two-unit projection over the sequence output that is split into start
+    and end logits, and averages the two cross-entropy losses.
+
+    Args:
+        args: Provides ``max_seq_len``, ``use_fp16`` and ``loss_scaling``.
+        pyreader_name: Name given to the py_reader.
+        ernie_config: Configuration passed through to ``ErnieModel``.
+        is_training: Not read anywhere in this function body.
+
+    Returns:
+        ``(pyreader, graph_vars)`` where ``graph_vars`` maps "loss",
+        "num_seqs", "unique_id", "start_logits" and "end_logits" to graph
+        variables, all marked persistable.
+    """
+    pyreader = fluid.layers.py_reader(
+        capacity=50,
+        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]],
+        dtypes=[
+            'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64',
+            'int64'
+        ],
+        lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
+        name=pyreader_name,
+        use_double_buffer=True)
+    # The unpacking order must match the slot order declared above.
+    (src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions,
+     end_positions, unique_id) = fluid.layers.read_file(pyreader)
+    ernie = ErnieModel(
+        src_ids=src_ids,
+        position_ids=pos_ids,
+        sentence_ids=sent_ids,
+        task_ids=task_ids,
+        input_mask=input_mask,
+        config=ernie_config,
+        use_fp16=args.use_fp16)
+    enc_out = ernie.get_sequence_output()
+    enc_out = fluid.layers.dropout(
+        x=enc_out, dropout_prob=0.1, dropout_implementation="upscale_in_train")
+    # Two output units per position: one for the start logit, one for the end.
+    logits = fluid.layers.fc(
+        input=enc_out,
+        size=2,
+        num_flatten_dims=2,
+        param_attr=fluid.ParamAttr(
+            name="cls_mrc_out_w",
+            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+        bias_attr=fluid.ParamAttr(
+            name="cls_mrc_out_b", initializer=fluid.initializer.Constant(0.)))
+    # Move the size-2 axis to the front so it can be unstacked along axis 0.
+    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
+    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
+    # num_seqs is obtained by summing a batch-sized tensor filled with ones.
+    batch_ones = fluid.layers.fill_constant_batch_size_like(
+        input=start_logits, dtype='int64', shape=[1], value=1)
+    num_seqs = fluid.layers.reduce_sum(input=batch_ones)
+    def compute_loss(logits, positions):
+        """Return the mean softmax cross-entropy of logits against positions."""
+        loss = fluid.layers.softmax_with_cross_entropy(
+            logits=logits, label=positions)
+        loss = fluid.layers.mean(x=loss)
+        return loss
+    start_loss = compute_loss(start_logits, start_positions)
+    end_loss = compute_loss(end_logits, end_positions)
+    loss = (start_loss + end_loss) / 2.0
+    # The loss is multiplied by loss_scaling only for fp16 runs with a
+    # scaling factor above 1.
+    if args.use_fp16 and args.loss_scaling > 1.0:
+        loss *= args.loss_scaling
+    graph_vars = {
+        "loss": loss,
+        "num_seqs": num_seqs,
+        "unique_id": unique_id,
+        "start_logits": start_logits,
+        "end_logits": end_logits
+    }
+    for k, v in graph_vars.items():
+        v.persistable = True
+    return pyreader, graph_vars
+def evaluate(exe,
+             test_program,
+             test_pyreader,
+             graph_vars,
+             eval_phase,
+             tag_num=None,
+             dev_count=1,
+             examples=None,
+             features=None,
+             args=None):
+    """Fetch training metrics, or run a full evaluation pass and score it.
+
+    When `eval_phase` is "train", a single `exe.run` fetches the loss (and the
+    learning rate when `graph_vars` contains one) and the values are returned
+    in a dict. For any other phase the function drains `test_pyreader`, writes
+    the prediction and n-best JSON files into `args.checkpoints`, scores the
+    predictions with `eval_file` and prints the metrics; nothing is returned.
+
+    `tag_num` and `dev_count` are accepted but not used by this function.
+    `args`, `examples` and `features` are required for every non-train phase.
+
+    Raises:
+        ValueError: if a non-train `eval_phase` contains neither "dev" nor
+            "test", so no reference data file can be selected for scoring.
+            The prediction files have already been written at that point.
+    """
+    if eval_phase == "train":
+        train_fetch_list = [graph_vars["loss"].name]
+        if "learning_rate" in graph_vars:
+            train_fetch_list.append(graph_vars["learning_rate"].name)
+        outputs = exe.run(fetch_list=train_fetch_list)
+        ret = {"loss": np.mean(outputs[0])}
+        if "learning_rate" in graph_vars:
+            ret["learning_rate"] = float(outputs[1][0])
+        return ret
+    output_dir = args.checkpoints
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    output_prediction_file = os.path.join(output_dir,
+                                          eval_phase + "_predictions.json")
+    output_nbest_file = os.path.join(output_dir,
+                                     eval_phase + "_nbest_predictions.json")
+    RawResult = collections.namedtuple(
+        "RawResult", ["unique_id", "start_logits", "end_logits"])
+    test_pyreader.start()
+    all_results = []
+    time_begin = time.time()
+    fetch_list = [
+        graph_vars["unique_id"].name, graph_vars["start_logits"].name,
+        graph_vars["end_logits"].name, graph_vars["num_seqs"].name
+    ]
+    # Keep running batches until the reader signals that it is exhausted.
+    while True:
+        try:
+            np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = exe.run(
+                program=test_program, fetch_list=fetch_list)
+            for idx in range(np_unique_ids.shape[0]):
+                if len(all_results) % 1000 == 0:
+                    print("Processing example: %d" % len(all_results))
+                unique_id = int(np_unique_ids[idx])
+                start_logits = [float(x) for x in np_start_logits[idx].flat]
+                end_logits = [float(x) for x in np_end_logits[idx].flat]
+                all_results.append(
+                    RawResult(
+                        unique_id=unique_id,
+                        start_logits=start_logits,
+                        end_logits=end_logits))
+        except fluid.core.EOFException:
+            test_pyreader.reset()
+            break
+    write_predictions(examples, features, all_results, args.n_best_size,
+                      args.max_answer_length, args.do_lower_case,
+                      output_prediction_file, output_nbest_file)
+    # Pick the reference file to score against from the phase name.
+    if "dev" in eval_phase:
+        data_file = args.dev_set
+    elif "test" in eval_phase:
+        data_file = args.test_set
+    else:
+        # Previously this fell through and failed with an unbound
+        # `data_file`; report the actual problem instead.
+        raise ValueError(
+            "Cannot select a reference data file for eval_phase '%s': "
+            "expected the phase name to contain 'dev' or 'test'." %
+            eval_phase)
+    em, f1, avg, total = eval_file(data_file, output_prediction_file)
+    time_end = time.time()
+    elapsed_time = time_end - time_begin
+    print(
+        "[%s evaluation] em: %f, f1: %f, avg: %f, questions: %d, elapsed time: %f"
+        % (eval_phase, em, f1, avg, total, elapsed_time))
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file):
+    """Write the best answer and the n-best answers per example as JSON.
+
+    Args:
+        all_examples: sequence of examples exposing `doc_tokens` and `qas_id`;
+            an example's position in the sequence is its example index.
+        all_features: features exposing `example_index`, `unique_id`,
+            `tokens`, `token_to_orig_map` and `token_is_max_context`.
+        all_results: results exposing `unique_id`, `start_logits` and
+            `end_logits`; matched to features through `unique_id`.
+        n_best_size: number of top start/end indexes considered per feature
+            and maximum number of answers kept per example.
+        max_answer_length: candidate spans longer than this many tokens are
+            discarded.
+        do_lower_case: forwarded to `get_final_text`.
+        output_prediction_file: path of the JSON file mapping each `qas_id`
+            to its best answer text.
+        output_nbest_file: path of the JSON file mapping each `qas_id` to its
+            list of n-best answers with probabilities and logits.
+    """
+    print("Writing predictions to: %s" % (output_prediction_file))
+    print("Writing nbest to: %s" % (output_nbest_file))
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", [
+            "feature_index", "start_index", "end_index", "start_logit",
+            "end_logit"
+        ])
+    # Defined once here rather than once per example.
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_logit", "end_logit"])
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+        prelim_predictions = []
+        # Collect every valid (start, end) candidate span over all features
+        # of this example.
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+        # Highest combined start+end logit first.
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+        seen_predictions = set()
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
+                                                              )]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
+                                                                 1)]
+                tok_text = " ".join(tok_tokens)
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                # Original tokens are joined without a separator.
+                orig_text = "".join(orig_tokens)
+                final_text = get_final_text(tok_text, orig_text, do_lower_case)
+                # Keep only the best-scoring occurrence of each answer text.
+                if final_text in seen_predictions:
+                    continue
+                seen_predictions.add(final_text)
+            else:
+                final_text = ""
+                seen_predictions.add(final_text)
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(
+                    text="empty", start_logit=0.0, end_logit=0.0))
+        total_scores = [entry.start_logit + entry.end_logit for entry in nbest]
+        probs = _compute_softmax(total_scores)
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+        assert len(nbest_json) >= 1
+        all_predictions[example.qas_id] = nbest_json[0]["text"]
+        all_nbest_json[example.qas_id] = nbest_json
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+def get_final_text(pred_text, orig_text, do_lower_case):
+    """Map a normalized predicted span back onto the original text.
+
+    Returns the slice of `orig_text` that lines up with `pred_text`, or
+    `orig_text` unchanged whenever the alignment heuristic below fails.
+    """
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+    def _strip_spaces(text):
+        # Returns the text without spaces plus a map from each position in
+        # the stripped text to its position in the unstripped text.
+        kept = [(pos, ch) for (pos, ch) in enumerate(text) if ch != " "]
+        ns_to_s = collections.OrderedDict(
+            (ns_pos, pos) for (ns_pos, (pos, _)) in enumerate(kept))
+        return ("".join(ch for (_, ch) in kept), ns_to_s)
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))
+    span_start = tok_text.find(pred_text)
+    if span_start == -1:
+        return orig_text
+    span_end = span_start + len(pred_text) - 1
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+    if len(orig_ns_text) != len(tok_ns_text):
+        return orig_text
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = dict(
+        (s_pos, ns_pos) for (ns_pos, s_pos) in six.iteritems(tok_ns_to_s_map))
+    def _to_orig(tok_pos):
+        # Tokenized-text position -> original-text position, None if unmapped.
+        ns_pos = tok_s_to_ns_map.get(tok_pos)
+        if ns_pos is None:
+            return None
+        return orig_ns_to_s_map.get(ns_pos)
+    orig_start = _to_orig(span_start)
+    if orig_start is None:
+        return orig_text
+    orig_end = _to_orig(span_end)
+    if orig_end is None:
+        return orig_text
+    return orig_text[orig_start:(orig_end + 1)]
+def _get_best_indexes(logits, n_best_size):
+    """Return the positions of the `n_best_size` largest logits, best first."""
+    ranked = sorted(
+        enumerate(logits), key=lambda pair: pair[1], reverse=True)
+    # Filtering on rank (instead of slicing) keeps the result empty for a
+    # non-positive `n_best_size`.
+    return [
+        index for (rank, (index, _)) in enumerate(ranked)
+        if rank < n_best_size
+    ]
+def _compute_softmax(scores):
+    """Turn a list of raw scores into softmax probabilities.
+
+    An empty input yields an empty list. The maximum score is subtracted
+    before exponentiating so the largest exponent is exp(0).
+    """
+    if not scores:
+        return []
+    peak = max(scores)
+    weights = [math.exp(value - peak) for value in scores]
+    # Accumulate left to right in plain float arithmetic.
+    normalizer = 0.0
+    for weight in weights:
+        normalizer += weight
+    return [weight / normalizer for weight in weights]
--- a/ERNIE/finetune/sequence_label.py
+++ b/ERNIE/finetune/sequence_label.py
@@ -35,24 +35,29 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
        capacity=50,
        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, 1]],
+                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
-        dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
+        dtypes=[
-        lod_levels=[0, 0, 0, 0, 0, 0],
+            'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
+        ],
+        lod_levels=[0, 0, 0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)
-    (src_ids, sent_ids, pos_ids, input_mask, labels,
+    (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
     seq_lens) = fluid.layers.read_file(pyreader)
    ernie = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
+        task_ids=task_ids,
        input_mask=input_mask,
        config=ernie_config,
        use_fp16=args.use_fp16)
    enc_out = ernie.get_sequence_output()
+    enc_out = fluid.layers.dropout(
+        x=enc_out, dropout_prob=0.1, dropout_implementation="upscale_in_train")
    logits = fluid.layers.fc(
        input=enc_out,
        size=args.num_labels,
@@ -75,6 +80,8 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
            logits, axis=2),
        label=labels,
        return_softmax=True)
+    input_mask = fluid.layers.flatten(input_mask, axis=2)
+    ce_loss = ce_loss * input_mask
    loss = fluid.layers.mean(x=ce_loss)
    if args.use_fp16 and args.loss_scaling > 1.0:
@@ -218,15 +225,15 @@ def evaluate(exe,
        num_label, num_infer, num_correct = chunk_eval(
            np_labels, np_infers, np_lens, tag_num, dev_count)
        precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
-        outputs = {
+        rets = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "loss": np.mean(np_loss)
        }
        if "learning_rate" in graph_vars:
-            outputs["lr"] = float(outputs[4][0])
+            rets["lr"] = float(outputs[4][0])
-        return outputs
+        return rets
    else:
        total_label, total_infer, total_correct = 0.0, 0.0, 0.0

--- a/ERNIE/finetune_args.py
+++ b/ERNIE/finetune_args.py
@@ -32,6 +32,10 @@ model_g.add_arg("init_pretraining_params",  str,  None,
                 "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
 model_g.add_arg("checkpoints",              str,  "checkpoints",  "Path to save checkpoints.")
+model_g.add_arg("is_classify",    bool, True,  "is_classify")
+model_g.add_arg("is_regression",  bool, False, "is_regression")
+model_g.add_arg("task_id",           int,    0,       "task id")
 train_g = ArgumentGroup(parser, "training", "training options.")
 train_g.add_arg("epoch",             int,    3,       "Number of epoches for fine-tuning.")
 train_g.add_arg("learning_rate",     float,  5e-5,    "Learning rate used to train with warmup.")
@@ -45,26 +49,39 @@ train_g.add_arg("validation_steps",  int,    1000,    "The steps interval to eva
 train_g.add_arg("use_fp16",          bool,   False,   "Whether to use fp16 mixed precision training.")
 train_g.add_arg("loss_scaling",      float,  1.0,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+train_g.add_arg("test_save",            str,    "test_result",       "test_save")
+train_g.add_arg("metric",               str,    "simple_accuracy",   "metric")
 log_g = ArgumentGroup(parser,     "logging", "logging related.")
 log_g.add_arg("skip_steps",          int,    10,    "The steps interval to print loss.")
 log_g.add_arg("verbose",             bool,   False, "Whether to output verbose log.")
 data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("tokenizer",           str, "FullTokenizer",
+              "ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer")
 data_g.add_arg("train_set",           str,  None,  "Path to training data.")
 data_g.add_arg("test_set",            str,  None,  "Path to test data.")
 data_g.add_arg("dev_set",             str,  None,  "Path to validation data.")
 data_g.add_arg("vocab_path",          str,  None,  "Vocabulary path.")
 data_g.add_arg("max_seq_len",         int,  512,   "Number of words of the longest seqence.")
 data_g.add_arg("batch_size",          int,  32,    "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("predict_batch_size",  int,  None,    "Total examples' number in batch for predict. see also --in_tokens.")
 data_g.add_arg("in_tokens",           bool, False,
              "If set, the batch size will be the maximum number of tokens in one batch. "
              "Otherwise, it will be the maximum number of examples in one batch.")
 data_g.add_arg("do_lower_case",       bool, True,
               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
-data_g.add_arg("random_seed",         int,  0,     "Random seed.")
+data_g.add_arg("random_seed",         int,  None,     "Random seed.")
 data_g.add_arg("label_map_config",    str,  None,  "label_map_path.")
 data_g.add_arg("num_labels",          int,  2,     "label number")
+data_g.add_arg("diagnostic",          str,  None,  "GLUE Diagnostic Dataset")
+data_g.add_arg("diagnostic_save",     str,  None,  "GLUE Diagnostic save f")
+data_g.add_arg("max_query_length",          int,   64,    "Max query length.")
+data_g.add_arg("max_answer_length",         int,   100,    "Max answer length.")
+data_g.add_arg("doc_stride",                int,   128,
+               "When splitting up a long document into chunks, how much stride to take between chunks.")
+data_g.add_arg("n_best_size",               int,   20,
+               "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda",                     bool,   True,  "If set, use GPU for training.")
@@ -73,8 +90,10 @@ run_type_g.add_arg("num_iteration_per_drop_scope", int,    10,    "Iteration int
 run_type_g.add_arg("do_train",                     bool,   True,  "Whether to perform training.")
 run_type_g.add_arg("do_val",                       bool,   True,  "Whether to perform evaluation on dev data set.")
 run_type_g.add_arg("do_test",                      bool,   True,  "Whether to perform evaluation on test data set.")
+run_type_g.add_arg("use_multi_gpu_test",           bool,   False, "Whether to perform evaluation using multiple gpu cards")
 run_type_g.add_arg("metrics",                      bool,   True,  "Whether to perform evaluation on test data set.")
 run_type_g.add_arg("shuffle",                      bool,   True,  "")
+run_type_g.add_arg("for_cn",                       bool,   True,  "model train for cn or for other langs.")
 parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
 # yapf: enable
--- a/ERNIE/model/__init__.py
+++ b/ERNIE/model/__init__.py
--- a/model/ernie.py
+++ b/model/ernie.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import json
+import six
+import paddle.fluid as fluid
+from model.transformer_encoder import encoder, pre_process_layer
+class ErnieConfig(object):
+    """Dictionary-style, read-only access to an ERNIE JSON config file."""
+    def __init__(self, config_path):
+        self._config_dict = self._parse(config_path)
+    def _parse(self, config_path):
+        """Load `config_path` as JSON; any failure is re-raised as IOError."""
+        try:
+            with open(config_path) as config_file:
+                return json.load(config_file)
+        except Exception:
+            raise IOError("Error in parsing Ernie model config file '%s'" %
+                          config_path)
+    def __getitem__(self, key):
+        # Missing keys yield None instead of raising KeyError.
+        return self._config_dict.get(key)
+    def print_config(self):
+        """Print every config entry in sorted order, followed by a rule."""
+        entries = sorted(six.iteritems(self._config_dict))
+        for (name, setting) in entries:
+            print('%s: %s' % (name, setting))
+        print('------------------------------------------------')
+class ErnieModel(object):
+    """ERNIE encoder graph plus pooled, masked-LM and task output heads.
+
+    The constructor reads its hyper-parameters from `config` and immediately
+    builds the embedding layers and the transformer encoder for the given
+    input variables.
+    """
+    def __init__(self,
+                 src_ids,
+                 position_ids,
+                 sentence_ids,
+                 task_ids,
+                 input_mask,
+                 config,
+                 weight_sharing=True,
+                 use_fp16=False):
+        """Read the hyper-parameters from `config` and build the encoder.
+
+        Args:
+            src_ids: token id input, looked up in the word embedding.
+            position_ids: position id input.
+            sentence_ids: sentence (segment) id input.
+            task_ids: task id input; only used when `config['use_task_id']`
+                is truthy.
+            input_mask: mask input used to build the self-attention bias.
+            config: mapping-like object indexed by hyper-parameter name.
+            weight_sharing: when True, the masked-LM output projection reuses
+                the word embedding matrix.
+            use_fp16: when True, the encoder runs in float16; embeddings are
+                always created as float32.
+        """
+        self._emb_size = config['hidden_size']
+        self._n_layer = config['num_hidden_layers']
+        self._n_head = config['num_attention_heads']
+        self._voc_size = config['vocab_size']
+        self._max_position_seq_len = config['max_position_embeddings']
+        # Fall back to `type_vocab_size` when `sent_type_vocab_size` is
+        # absent or falsy.
+        if config['sent_type_vocab_size']:
+            self._sent_types = config['sent_type_vocab_size']
+        else:
+            self._sent_types = config['type_vocab_size']
+        self._use_task_id = config['use_task_id']
+        if self._use_task_id:
+            self._task_types = config['task_type_vocab_size']
+        self._hidden_act = config['hidden_act']
+        self._prepostprocess_dropout = config['hidden_dropout_prob']
+        self._attention_dropout = config['attention_probs_dropout_prob']
+        self._weight_sharing = weight_sharing
+        self._word_emb_name = "word_embedding"
+        self._pos_emb_name = "pos_embedding"
+        self._sent_emb_name = "sent_embedding"
+        self._task_emb_name = "task_embedding"
+        self._dtype = "float16" if use_fp16 else "float32"
+        self._emb_dtype = "float32"
+        # Initialize all weights by truncated normal initializer, and all biases
+        # will be initialized by constant zero by default.
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=config['initializer_range'])
+        self._build_model(src_ids, position_ids, sentence_ids, task_ids,
+                          input_mask)
+    def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
+                     input_mask):
+        """Sum the embeddings, build the attention bias and run the encoder.
+
+        The encoder output is stored in `self._enc_out`.
+        """
+        # padding id in vocabulary must be set to 0
+        emb_out = fluid.layers.embedding(
+            input=src_ids,
+            size=[self._voc_size, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=self._word_emb_name, initializer=self._param_initializer),
+            is_sparse=False)
+        position_emb_out = fluid.layers.embedding(
+            input=position_ids,
+            size=[self._max_position_seq_len, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=self._pos_emb_name, initializer=self._param_initializer))
+        sent_emb_out = fluid.layers.embedding(
+            sentence_ids,
+            size=[self._sent_types, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=self._sent_emb_name, initializer=self._param_initializer))
+        emb_out = emb_out + position_emb_out
+        emb_out = emb_out + sent_emb_out
+        if self._use_task_id:
+            task_emb_out = fluid.layers.embedding(
+                task_ids,
+                size=[self._task_types, self._emb_size],
+                dtype=self._emb_dtype,
+                param_attr=fluid.ParamAttr(
+                    name=self._task_emb_name,
+                    initializer=self._param_initializer))
+            emb_out = emb_out + task_emb_out
+        emb_out = pre_process_layer(
+            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
+        # Compare by value: `is` on a string literal only works through
+        # interpreter-specific string interning.
+        if self._dtype == "float16":
+            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
+            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
+        # Pairwise mask product, then (x - 1) * 10000: a product of 1 becomes
+        # a bias of 0 and a product of 0 becomes -10000.
+        self_attn_mask = fluid.layers.matmul(
+            x=input_mask, y=input_mask, transpose_y=True)
+        self_attn_mask = fluid.layers.scale(
+            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+        # Replicate the same bias for every attention head.
+        n_head_self_attn_mask = fluid.layers.stack(
+            x=[self_attn_mask] * self._n_head, axis=1)
+        n_head_self_attn_mask.stop_gradient = True
+        self._enc_out = encoder(
+            enc_input=emb_out,
+            attn_bias=n_head_self_attn_mask,
+            n_layer=self._n_layer,
+            n_head=self._n_head,
+            d_key=self._emb_size // self._n_head,
+            d_value=self._emb_size // self._n_head,
+            d_model=self._emb_size,
+            d_inner_hid=self._emb_size * 4,
+            prepostprocess_dropout=self._prepostprocess_dropout,
+            attention_dropout=self._attention_dropout,
+            relu_dropout=0,
+            hidden_act=self._hidden_act,
+            preprocess_cmd="",
+            postprocess_cmd="dan",
+            param_initializer=self._param_initializer,
+            name='encoder')
+    def get_sequence_output(self):
+        """Return the encoder output for the whole sequence."""
+        return self._enc_out
+    def get_pooled_output(self):
+        """Get the first feature of each sequence for classification"""
+        next_sent_feat = fluid.layers.slice(
+            input=self._enc_out, axes=[1], starts=[0], ends=[1])
+        if self._dtype == "float16":
+            next_sent_feat = fluid.layers.cast(
+                x=next_sent_feat, dtype=self._emb_dtype)
+        next_sent_feat = fluid.layers.fc(
+            input=next_sent_feat,
+            size=self._emb_size,
+            act="tanh",
+            param_attr=fluid.ParamAttr(
+                name="pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr="pooled_fc.b_0")
+        return next_sent_feat
+    def get_lm_output(self, mask_label, mask_pos):
+        """Return the mean masked-LM loss for pretraining.
+
+        Also stores the pooled output in `self.next_sent_feat`, which
+        `get_task_output` reads.
+        """
+        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+        # extract the first token feature in each sentence
+        self.next_sent_feat = self.get_pooled_output()
+        reshaped_emb_out = fluid.layers.reshape(
+            x=self._enc_out, shape=[-1, self._emb_size])
+        # extract masked tokens' feature
+        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
+        if self._dtype == "float16":
+            mask_feat = fluid.layers.cast(x=mask_feat, dtype=self._emb_dtype)
+        # transform: fc
+        mask_trans_feat = fluid.layers.fc(
+            input=mask_feat,
+            size=self._emb_size,
+            act=self._hidden_act,
+            param_attr=fluid.ParamAttr(
+                name='mask_lm_trans_fc.w_0',
+                initializer=self._param_initializer),
+            bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
+        # transform: layer norm
+        # NOTE(review): the layer-norm bias is initialized to 1.0 here, not
+        # 0.0 - confirm this is intentional before changing it.
+        mask_trans_feat = fluid.layers.layer_norm(
+            mask_trans_feat,
+            begin_norm_axis=len(mask_trans_feat.shape) - 1,
+            param_attr=fluid.ParamAttr(
+                name='mask_lm_trans_layer_norm_scale',
+                initializer=fluid.initializer.Constant(1.)),
+            bias_attr=fluid.ParamAttr(
+                name='mask_lm_trans_layer_norm_bias',
+                initializer=fluid.initializer.Constant(1.)))
+        mask_lm_out_bias_attr = fluid.ParamAttr(
+            name="mask_lm_out_fc.b_0",
+            initializer=fluid.initializer.Constant(value=0.0))
+        if self._weight_sharing:
+            # Project with the transposed word embedding matrix and add a
+            # separately created output bias.
+            fc_out = fluid.layers.matmul(
+                x=mask_trans_feat,
+                y=fluid.default_main_program().global_block().var(
+                    self._word_emb_name),
+                transpose_y=True)
+            fc_out += fluid.layers.create_parameter(
+                shape=[self._voc_size],
+                dtype=self._emb_dtype,
+                attr=mask_lm_out_bias_attr,
+                is_bias=True)
+        else:
+            fc_out = fluid.layers.fc(input=mask_trans_feat,
+                                     size=self._voc_size,
+                                     param_attr=fluid.ParamAttr(
+                                         name="mask_lm_out_fc.w_0",
+                                         initializer=self._param_initializer),
+                                     bias_attr=mask_lm_out_bias_attr)
+        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+            logits=fc_out, label=mask_label)
+        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
+        return mean_mask_lm_loss
+    def get_task_output(self, task, task_labels):
+        """Return (mean loss, accuracy) of a classification head for `task`.
+
+        `task` must provide "num_labels" and "task_name"; the task name
+        prefixes the head's parameter names. Reads `self.next_sent_feat`,
+        which is only assigned by `get_lm_output`, so that method has to be
+        called first.
+        """
+        task_fc_out = fluid.layers.fc(input=self.next_sent_feat,
+                                      size=task["num_labels"],
+                                      param_attr=fluid.ParamAttr(
+                                          name=task["task_name"] + "_fc.w_0",
+                                          initializer=self._param_initializer),
+                                      bias_attr=task["task_name"] + "_fc.b_0")
+        task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
+            logits=task_fc_out, label=task_labels, return_softmax=True)
+        task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
+        mean_task_loss = fluid.layers.mean(task_loss)
+        return mean_task_loss, task_acc
--- a/ERNIE/model/ernie.py
+++ b/ERNIE/model/ernie.py
--- a/ERNIE/model/transformer_encoder.py
+++ b/ERNIE/model/transformer_encoder.py
--- a/ERNIE/optimization.py
+++ b/ERNIE/optimization.py
@@ -59,7 +59,12 @@ def optimization(loss,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
-                 loss_scaling=1.0):
+                 use_dynamic_loss_scaling=False,
+                 init_loss_scaling=1.0,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=2,
+                 incr_ratio=2.0,
+                 decr_ratio=0.8):
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
@@ -73,16 +78,18 @@ def optimization(loss,
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
-        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
+        scheduled_lr = fluid.layers.create_global_var(
-        scheduled_lr = learning_rate
+            name=fluid.unique_name.generate("learning_rate"),
+            shape=[1],
-    clip_norm_thres = 1.0
+            value=learning_rate,
-    # When using mixed precision training, scale the gradient clip threshold
+            dtype='float32',
-    # by loss_scaling
+            persistable=True)
-    if use_fp16 and loss_scaling > 1.0:
+        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-        clip_norm_thres *= loss_scaling
+        optimizer._learning_rate_map[fluid.default_main_program(
+        )] = scheduled_lr
    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
+        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
@@ -95,8 +102,17 @@ def optimization(loss,
    param_list = dict()
+    loss_scaling = fluid.layers.create_global_var(
+        name=fluid.unique_name.generate("loss_scaling"),
+        shape=[1],
+        value=init_loss_scaling,
+        dtype='float32',
+        persistable=True)
    if use_fp16:
+        loss *= loss_scaling
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)
@@ -104,6 +120,11 @@ def optimization(loss,
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
+        if use_dynamic_loss_scaling:
+            apply_dynamic_loss_scaling(
+                loss_scaling, master_param_grads, incr_every_n_steps,
+                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
        optimizer.apply_gradients(master_param_grads)
        if weight_decay > 0:
@@ -136,4 +157,4 @@ def optimization(loss,
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
-    return scheduled_lr
+    return scheduled_lr, loss_scaling
--- a/ERNIE/predict_classifier.py
+++ b/ERNIE/predict_classifier.py
@@ -46,6 +46,7 @@ model_g.add_arg("init_checkpoint",              str,  None,  "Init checkpoint to
 model_g.add_arg("save_inference_model_path",    str,  "inference_model",  "If set, save the inference model to this path.")
 model_g.add_arg("use_fp16",                     bool, False, "Whether to resume parameters from fp16 checkpoint.")
 model_g.add_arg("num_labels",                   int,  2,     "num labels for classify")
+model_g.add_arg("ernie_version",                str,  "1.0", "ernie_version")
 data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
 data_g.add_arg("predict_set",         str,  None,  "Predict set file")
@@ -83,7 +84,9 @@ def main(args):
                args,
                pyreader_name='predict_reader',
                ernie_config=ernie_config,
-                is_prediction=True)
+                is_classify=True,
+                is_prediction=True,
+                ernie_version=args.ernie_version)
    predict_prog = predict_prog.clone(for_test=True)
@@ -122,6 +125,8 @@ def main(args):
    sent_ids = feed_target_names[1]
    pos_ids = feed_target_names[2]
    input_mask = feed_target_names[3]
+    if args.ernie_version == "2.0":
+        task_ids = feed_target_names[4]
    predict_data_generator = reader.data_generator(
        input_file=args.predict_set,
@@ -136,7 +141,9 @@ def main(args):
        src_ids_data = sample[0]
        sent_ids_data = sample[1]
        pos_ids_data = sample[2]
-        input_mask_data = sample[3]
+        task_ids_data = sample[3]
+        input_mask_data = sample[4]
+        if args.ernie_version == "1.0":
            output = exe.run(
                infer_program,
                feed={src_ids: src_ids_data,
@@ -144,6 +151,18 @@ def main(args):
                      pos_ids: pos_ids_data,
                      input_mask: input_mask_data},
                fetch_list=probs)
+        elif args.ernie_version == "2.0":
+            output = exe.run(
+                infer_program,
+                feed={src_ids: src_ids_data,
+                      sent_ids: sent_ids_data,
+                      pos_ids: pos_ids_data,
+                      task_ids: task_ids_data,
+                      input_mask: input_mask_data},
+                fetch_list=probs)
+        else:
+            raise ValueError("ernie_version must be 1.0 or 2.0")
        for single_result in output[0]:
            print("example_index:{}\t{}".format(index, single_result))
            index += 1

--- a/ERNIE/pretrain_args.py
+++ b/ERNIE/pretrain_args.py
--- a/ERNIE/reader/__init__.py
+++ b/ERNIE/reader/__init__.py
--- a/ERNIE/reader/pretraining.py
+++ b/ERNIE/reader/pretraining.py
--- a/ERNIE/reader/task_reader.py
+++ b/ERNIE/reader/task_reader.py
--- a/run_classifier.py
+++ b/run_classifier.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on classification tasks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import time
+import multiprocessing
+# NOTE(paddle-dev): All of these flags should be
+# set before `import paddle`. Otherwise, they
+# will not take effect.
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0'  # enable gc
+import paddle.fluid as fluid
+import reader.task_reader as task_reader
+from model.ernie import ErnieConfig
+from finetune.classifier import create_model, evaluate, predict
+from optimization import optimization
+from utils.args import print_arguments, check_cuda
+from utils.init import init_pretraining_params, init_checkpoint
+from utils.cards import get_cards
+from finetune_args import parser
+args = parser.parse_args()
+def main(args):
+    """Fine-tune and/or evaluate an ERNIE classification/regression model.
+
+    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
+    builds the train and test programs, loads the initial parameters, runs
+    the training loop with periodic checkpointing and evaluation, and
+    finally evaluates the dev, test and (optional) diagnostic sets.
+
+    Args:
+        args: Parsed command-line arguments (see ``finetune_args.parser``).
+
+    Raises:
+        ValueError: If none of ``do_train``/``do_val``/``do_test`` is set, or
+            if only evaluation is requested without ``init_checkpoint``.
+    """
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+    if args.use_cuda:
+        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    exe = fluid.Executor(place)
+    reader = task_reader.ClassifyReader(
+        vocab_path=args.vocab_path,
+        label_map_config=args.label_map_config,
+        max_seq_len=args.max_seq_len,
+        do_lower_case=args.do_lower_case,
+        in_tokens=args.in_tokens,
+        random_seed=args.random_seed,
+        tokenizer=args.tokenizer,
+        is_classify=args.is_classify,
+        is_regression=args.is_regression,
+        for_cn=args.for_cn,
+        task_id=args.task_id)
+    if not (args.do_train or args.do_val or args.do_test):
+        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
+                         "least one of them must be True.")
+    if args.do_test:
+        assert args.test_save is not None
+    startup_prog = fluid.Program()
+    if args.random_seed is not None:
+        startup_prog.random_seed = args.random_seed
+    # Fall back to the training batch size when no prediction size is given.
+    if args.predict_batch_size is None:
+        args.predict_batch_size = args.batch_size
+    if args.do_train:
+        train_data_generator = reader.data_generator(
+            input_file=args.train_set,
+            batch_size=args.batch_size,
+            epoch=args.epoch,
+            dev_count=dev_count,
+            shuffle=True,
+            phase="train")
+        num_train_examples = reader.get_num_examples(args.train_set)
+        # With in_tokens, batch_size counts tokens, so convert it to a number
+        # of sequences before deriving the step count.
+        if args.in_tokens:
+            max_train_steps = args.epoch * num_train_examples // (
+                args.batch_size // args.max_seq_len) // dev_count
+        else:
+            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
+        warmup_steps = int(max_train_steps * args.warmup_proportion)
+        print("Device count: %d" % dev_count)
+        print("Num train examples: %d" % num_train_examples)
+        print("Max train steps: %d" % max_train_steps)
+        print("Num warmup steps: %d" % warmup_steps)
+        train_program = fluid.Program()
+        if args.random_seed is not None and args.enable_ce:
+            train_program.random_seed = args.random_seed
+        with fluid.program_guard(train_program, startup_prog):
+            with fluid.unique_name.guard():
+                train_pyreader, graph_vars = create_model(
+                    args,
+                    pyreader_name='train_reader',
+                    ernie_config=ernie_config,
+                    is_classify=args.is_classify,
+                    is_regression=args.is_regression)
+                scheduled_lr, loss_scaling = optimization(
+                    loss=graph_vars["loss"],
+                    warmup_steps=warmup_steps,
+                    num_train_steps=max_train_steps,
+                    learning_rate=args.learning_rate,
+                    train_program=train_program,
+                    startup_prog=startup_prog,
+                    weight_decay=args.weight_decay,
+                    scheduler=args.lr_scheduler,
+                    use_fp16=args.use_fp16)
+        if args.verbose:
+            if args.in_tokens:
+                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+                    program=train_program,
+                    batch_size=args.batch_size // args.max_seq_len)
+            else:
+                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+                    program=train_program, batch_size=args.batch_size)
+            print("Theoretical memory usage in training: %.3f - %.3f %s" %
+                  (lower_mem, upper_mem, unit))
+    if args.do_val or args.do_test:
+        test_prog = fluid.Program()
+        with fluid.program_guard(test_prog, startup_prog):
+            with fluid.unique_name.guard():
+                test_pyreader, graph_vars = create_model(
+                    args,
+                    pyreader_name='test_reader',
+                    ernie_config=ernie_config,
+                    is_classify=args.is_classify,
+                    is_regression=args.is_regression)
+        test_prog = test_prog.clone(for_test=True)
+    nccl2_num_trainers = 1
+    nccl2_trainer_id = 0
+    exe.run(startup_prog)
+    if args.do_train:
+        if args.init_checkpoint and args.init_pretraining_params:
+            print(
+                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
+                "both are set! Only arg 'init_checkpoint' is made valid.")
+        if args.init_checkpoint:
+            init_checkpoint(
+                exe,
+                args.init_checkpoint,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+        elif args.init_pretraining_params:
+            init_pretraining_params(
+                exe,
+                args.init_pretraining_params,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+    elif args.do_val or args.do_test:
+        if not args.init_checkpoint:
+            raise ValueError("args 'init_checkpoint' should be set if "
+                             "only doing validation or testing!")
+        init_checkpoint(
+            exe,
+            args.init_checkpoint,
+            main_program=startup_prog,
+            use_fp16=args.use_fp16)
+    if args.do_train:
+        exec_strategy = fluid.ExecutionStrategy()
+        if args.use_fast_executor:
+            exec_strategy.use_experimental_executor = True
+        exec_strategy.num_threads = dev_count
+        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_cuda,
+            loss_name=graph_vars["loss"].name,
+            exec_strategy=exec_strategy,
+            main_program=train_program,
+            num_trainers=nccl2_num_trainers,
+            trainer_id=nccl2_trainer_id)
+        train_pyreader.decorate_tensor_provider(train_data_generator)
+    else:
+        train_exe = None
+    test_exe = exe
+    if args.do_val or args.do_test:
+        if args.use_multi_gpu_test:
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=args.use_cuda,
+                main_program=test_prog,
+                share_vars_from=train_exe)
+    # Defined up front so the final evaluation below still has values to
+    # report when training is skipped (do_val / do_test only).
+    steps = 0
+    current_epoch = 0
+    if args.do_train:
+        train_pyreader.start()
+        if warmup_steps > 0:
+            graph_vars["learning_rate"] = scheduled_lr
+        ce_info = []
+        time_begin = time.time()
+        last_epoch = 0
+        while True:
+            try:
+                steps += 1
+                # Only fetch and report metrics every `skip_steps` steps.
+                if steps % args.skip_steps != 0:
+                    train_exe.run(fetch_list=[])
+                else:
+                    outputs = evaluate(
+                        train_exe,
+                        train_program,
+                        train_pyreader,
+                        graph_vars,
+                        "train",
+                        metric=args.metric,
+                        is_classify=args.is_classify,
+                        is_regression=args.is_regression)
+                    if args.verbose:
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
+                        )
+                        verbose += "learning rate: %f" % (
+                            outputs["learning_rate"]
+                            if warmup_steps > 0 else args.learning_rate)
+                        print(verbose)
+                    current_example, current_epoch = reader.get_train_progress()
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    if args.is_classify:
+                        print(
+                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+                            "ave acc: %f, speed: %f steps/s" %
+                            (current_epoch, current_example, num_train_examples,
+                             steps, outputs["loss"], outputs["accuracy"],
+                             args.skip_steps / used_time))
+                        ce_info.append(
+                            [outputs["loss"], outputs["accuracy"], used_time])
+                    if args.is_regression:
+                        print(
+                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+                            " speed: %f steps/s" %
+                            (current_epoch, current_example, num_train_examples,
+                             steps, outputs["loss"],
+                             args.skip_steps / used_time))
+                    time_begin = time.time()
+                if steps % args.save_steps == 0:
+                    save_path = os.path.join(args.checkpoints,
+                                             "step_" + str(steps))
+                    fluid.io.save_persistables(exe, save_path, train_program)
+                # Evaluate on the step schedule and whenever a new epoch is
+                # observed.
+                if steps % args.validation_steps == 0 or last_epoch != current_epoch:
+                    # evaluate dev set
+                    if args.do_val:
+                        evaluate_wrapper(args, reader, exe, test_prog,
+                                         test_pyreader, graph_vars,
+                                         current_epoch, steps)
+                    if args.do_test:
+                        predict_wrapper(args, reader, exe, test_prog,
+                                        test_pyreader, graph_vars,
+                                        current_epoch, steps)
+                if last_epoch != current_epoch:
+                    last_epoch = current_epoch
+            except fluid.core.EOFException:
+                # The reader is exhausted: save a final checkpoint and stop.
+                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+                fluid.io.save_persistables(exe, save_path, train_program)
+                train_pyreader.reset()
+                break
+        if args.enable_ce:
+            card_num = get_cards()
+            ce_loss = 0
+            ce_acc = 0
+            ce_time = 0
+            try:
+                # Report the second-to-last logged record.
+                ce_loss = ce_info[-2][0]
+                ce_acc = ce_info[-2][1]
+                ce_time = ce_info[-2][2]
+            except IndexError:
+                # Fewer than two records were logged; keep the zero defaults.
+                print("ce info error")
+            print("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
+            print("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
+            print("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))
+    # final eval on dev set
+    if args.do_val:
+        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
+                         graph_vars, current_epoch, steps)
+    # final eval on test set
+    if args.do_test:
+        predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
+                        current_epoch, steps)
+    # final eval on diagnostic, hack for glue-ax
+    # NOTE(review): test_prog / test_pyreader only exist when do_val or
+    # do_test is set; confirm `diagnostic` is never used without one of them.
+    if args.diagnostic:
+        test_pyreader.decorate_tensor_provider(
+            reader.data_generator(
+                args.diagnostic,
+                batch_size=args.batch_size,
+                epoch=1,
+                dev_count=1,
+                shuffle=False))
+        print("Final diagnostic")
+        qids, preds, probs = predict(
+            test_exe,
+            test_prog,
+            test_pyreader,
+            graph_vars,
+            is_classify=args.is_classify,
+            is_regression=args.is_regression)
+        assert len(qids) == len(preds), '{} v.s. {}'.format(
+            len(qids), len(preds))
+        with open(args.diagnostic_save, 'w') as f:
+            for qid, s, p in zip(qids, preds, probs):
+                f.write('{}\t{}\t{}\n'.format(qid, s, p))
+        print("Done final diagnostic, saving to {}".format(
+            args.diagnostic_save))
+def evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
+                     epoch, steps):
+    """Evaluate every dev set listed in ``args.dev_set`` and print the result.
+
+    ``args.dev_set`` is a comma-separated list of files. For each file the
+    test reader is re-bound to a single-pass, unshuffled generator, the
+    ``evaluate`` result is obtained for the "dev" phase, and it is printed
+    together with the file name, ``epoch`` and ``steps``.
+    """
+    suffix_template = ', file: {}, epoch: {}, steps: {}'
+    for dev_file in args.dev_set.split(','):
+        dev_batches = reader.data_generator(
+            dev_file,
+            batch_size=args.predict_batch_size,
+            epoch=1,
+            dev_count=1,
+            shuffle=False)
+        test_pyreader.decorate_tensor_provider(dev_batches)
+        print("validation result of dataset {}:".format(dev_file))
+        summary = evaluate(
+            exe,
+            test_prog,
+            test_pyreader,
+            graph_vars,
+            "dev",
+            metric=args.metric,
+            is_classify=args.is_classify,
+            is_regression=args.is_regression)
+        # The evaluate result is concatenated with the run context.
+        print(summary + suffix_template.format(dev_file, epoch, steps))
+def predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
+                    epoch, steps):
+    """Run prediction on every test set and write the results to disk.
+
+    ``args.test_set`` and ``args.test_save`` are comma-separated lists that
+    are paired positionally. Each output file is named
+    ``<save path>.<epoch>.<steps>`` and receives one tab-separated line per
+    (qid, pred, prob) triple returned by ``predict``.
+    """
+    test_sets = args.test_set.split(',')
+    save_dirs = args.test_save.split(',')
+    assert len(test_sets) == len(save_dirs)
+    for test_f, save_f in zip(test_sets, save_dirs):
+        test_pyreader.decorate_tensor_provider(
+            reader.data_generator(
+                test_f,
+                batch_size=args.predict_batch_size,
+                epoch=1,
+                dev_count=1,
+                shuffle=False))
+        save_path = save_f + '.' + str(epoch) + '.' + str(steps)
+        print("testing {}, save to {}".format(test_f, save_path))
+        qids, preds, probs = predict(
+            exe,
+            test_prog,
+            test_pyreader,
+            graph_vars,
+            is_classify=args.is_classify,
+            is_regression=args.is_regression)
+        save_dir = os.path.dirname(save_path)
+        # dirname is '' for a bare file name and os.makedirs('') raises, so
+        # only create the directory when the path actually has one.
+        if save_dir and not os.path.exists(save_dir):
+            os.makedirs(save_dir)
+        with open(save_path, 'w') as f:
+            for qid, pred, prob in zip(qids, preds, probs):
+                f.write('{}\t{}\t{}\n'.format(qid, pred, prob))
+if __name__ == '__main__':
+    # Script entry point: print the parsed arguments, run the CUDA check
+    # helper on the `use_cuda` flag, then start the fine-tuning run.
+    print_arguments(args)
+    check_cuda(args.use_cuda)
+    main(args)
--- a/ERNIE/run_classifier.py
+++ b/ERNIE/run_classifier.py
@@ -21,15 +21,19 @@ import os
 import time
 import multiprocessing
+# NOTE(paddle-dev): All of these flags should be
+# set before `import paddle`. Otherwise, they
+# will not take effect.
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0'  # enable gc
 import paddle.fluid as fluid
 import reader.task_reader as task_reader
 from model.ernie import ErnieConfig
-from finetune.classifier import create_model, evaluate
+from finetune.mrc import create_model, evaluate
 from optimization import optimization
-from utils.args import print_arguments, check_cuda
+from utils.args import print_arguments
 from utils.init import init_pretraining_params, init_checkpoint
-from utils.cards import get_cards
 from finetune_args import parser
 args = parser.parse_args()
@@ -47,13 +51,20 @@ def main(args):
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
-    reader = task_reader.ClassifyReader(
+    reader = task_reader.MRCReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
-        random_seed=args.random_seed)
+        random_seed=args.random_seed,
+        tokenizer=args.tokenizer,
+        is_classify=args.is_classify,
+        is_regression=args.is_regression,
+        for_cn=args.for_cn,
+        task_id=args.task_id,
+        doc_stride=args.doc_stride,
+        max_query_length=args.max_query_length)
    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
@@ -63,15 +74,18 @@ def main(args):
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
+    if args.predict_batch_size == None:
+        args.predict_batch_size = args.batch_size
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
-            shuffle=args.shuffle,
+            dev_count=dev_count,
+            shuffle=True,
            phase="train")
-        num_train_examples = reader.get_num_examples(args.train_set)
+        num_train_examples = reader.get_num_examples("train")
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
@@ -86,16 +100,15 @@ def main(args):
        print("Num warmup steps: %d" % warmup_steps)
        train_program = fluid.Program()
-        if args.random_seed is not None and args.enable_ce:
-            train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
-                    ernie_config=ernie_config)
+                    ernie_config=ernie_config,
-                scheduled_lr = optimization(
+                    is_training=True)
+                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
@@ -104,17 +117,15 @@ def main(args):
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
-                    use_fp16=args.use_fp16,
+                    use_fp16=args.use_fp16)
-                    loss_scaling=args.loss_scaling)
+                """
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name,
-                        graph_vars["probs"].name,
-                        graph_vars["accuracy"].name,
                        graph_vars["num_seqs"].name,
                    ])
+                """
        if args.verbose:
            if args.in_tokens:
@@ -131,13 +142,16 @@ def main(args):
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
-                test_pyreader, graph_vars = create_model(
+                test_pyreader, test_graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
-                    ernie_config=ernie_config)
+                    ernie_config=ernie_config,
+                    is_training=False)
        test_prog = test_prog.clone(for_test=True)
+    nccl2_num_trainers = 1
+    nccl2_trainer_id = 0
    exe.run(startup_prog)
    if args.do_train:
@@ -178,7 +192,9 @@ def main(args):
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
-            main_program=train_program)
+            main_program=train_program,
+            num_trainers=nccl2_num_trainers,
+            trainer_id=nccl2_trainer_id)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
@@ -190,7 +206,6 @@ def main(args):
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr
-        ce_info = []
        time_begin = time.time()
        while True:
            try:
@@ -213,11 +228,9 @@ def main(args):
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
-                          "ave acc: %f, speed: %f steps/s" %
+                          "speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
-                           steps, outputs["loss"], outputs["accuracy"],
+                           steps, outputs["loss"], args.skip_steps / used_time))
-                           args.skip_steps / used_time))
-                    ce_info.append([outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()
                if steps % args.save_steps == 0:
@@ -226,74 +239,95 @@ def main(args):
                    fluid.io.save_persistables(exe, save_path, train_program)
                if steps % args.validation_steps == 0:
-                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
-                                shuffle=False))
+                                dev_count=1,
-                        evaluate(exe, test_prog, test_pyreader, graph_vars,
+                                shuffle=False,
-                                 "dev")
+                                phase="dev"))
-                    # evaluate test set
+                        evaluate(
+                            exe,
+                            test_prog,
+                            test_pyreader,
+                            test_graph_vars,
+                            str(steps) + "_dev",
+                            examples=reader.get_examples("dev"),
+                            features=reader.get_features("dev"),
+                            args=args)
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
-                                shuffle=False))
+                                dev_count=1,
-                        evaluate(exe, test_prog, test_pyreader, graph_vars,
+                                shuffle=False,
-                                 "test")
+                                phase="test"))
+                        evaluate(
+                            exe,
+                            test_prog,
+                            test_pyreader,
+                            test_graph_vars,
+                            str(steps) + "_test",
+                            examples=reader.get_examples("test"),
+                            features=reader.get_features("test"),
+                            args=args)
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
-        if args.enable_ce:
-            card_num = get_cards()
-            ce_loss = 0
-            ce_acc = 0
-            ce_time = 0
-            try:
-                ce_loss = ce_info[-2][0]
-                ce_acc = ce_info[-2][1]
-                ce_time = ce_info[-2][2]
-            except:
-                print("ce info error")
-            print("kpis\ttrain_duration_card%s\t%s" %
-                (card_num, ce_time))
-            print("kpis\ttrain_loss_card%s\t%f" %
-                (card_num, ce_loss))
-            print("kpis\ttrain_acc_card%s\t%f" %
-                (card_num, ce_acc))
    # final eval on dev set
    if args.do_val:
+        print("Final validation result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
-                shuffle=False))
+                dev_count=1,
-        print("Final validation result:")
+                shuffle=False,
-        evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
+                phase="dev"))
+        evaluate(
+            exe,
+            test_prog,
+            test_pyreader,
+            test_graph_vars,
+            "dev",
+            examples=reader.get_examples("dev"),
+            features=reader.get_features("dev"),
+            args=args)
    # final eval on test set
    if args.do_test:
+        print("Final test result:")
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
-                shuffle=False))
+                dev_count=1,
-        print("Final test result:")
+                shuffle=False,
-        evaluate(exe, test_prog, test_pyreader, graph_vars, "test")
+                phase="test"))
+        evaluate(
+            exe,
+            test_prog,
+            test_pyreader,
+            test_graph_vars,
+            "test",
+            examples=reader.get_examples("test"),
+            features=reader.get_features("test"),
+            args=args)
 if __name__ == '__main__':
-    print_arguments(args)
+    while True:
-    check_cuda(args.use_cuda)
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
            main(args)
--- a/ERNIE/run_sequence_labeling.py
+++ b/ERNIE/run_sequence_labeling.py
@@ -21,6 +21,11 @@ import os
 import time
 import multiprocessing
+# NOTE(paddle-dev): All of these flags should be
+# set before `import paddle`. Otherwise, they
+# will not take effect.
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0'  # enable gc
 import paddle.fluid as fluid
 import reader.task_reader as task_reader
@@ -52,7 +57,8 @@ def main(args):
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
-        random_seed=args.random_seed)
+        random_seed=args.random_seed,
+        task_id=args.task_id)
    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
@@ -92,7 +98,7 @@ def main(args):
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
-                scheduled_lr = optimization(
+                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
@@ -101,8 +107,7 @@ def main(args):
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
-                    use_fp16=args.use_fp16,
+                    use_fp16=args.use_fp16)
-                    loss_scaling=args.loss_scaling)
                fluid.memory_optimize(
                    input_program=train_program,

--- a/script/en_glue/ernie_base/CoLA/task.sh
+++ b/script/en_glue/ernie_base/CoLA/task.sh
+#!/bin/bash
+R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
+export FLAGS_sync_nccl_allreduce=1
+export FLAGS_eager_delete_tensor_gb=0.0
+if [[ -f ./model_conf ]];then
+    source ./model_conf
+else
+    export CUDA_VISIBLE_DEVICES=0
+fi
+mkdir -p log/
+timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
+lr=3e-5
+batch_size=64
+epoch=3
+for i in {1..5};do
+python -u run_classifier.py                                                          \
+       --use_cuda true                                                               \
+       --for_cn  False                                                               \
+       --use_fast_executor ${e_executor:-"true"}                                     \
+       --tokenizer ${TOKENIZER:-"FullTokenizer"}                                     \
+       --use_fp16 ${USE_FP16:-"false"}                                               \
+       --do_train true                                                               \
+       --do_val true                                                                 \
+       --do_test true                                                                \
+       --batch_size $batch_size                                                      \
+       --init_pretraining_params ${MODEL_PATH}/params                                \
+       --verbose true                                                                \
+       --train_set ${TASK_DATA_PATH}/CoLA/train.tsv                                  \
+       --dev_set   ${TASK_DATA_PATH}/CoLA/dev.tsv                                    \
+       --test_set  ${TASK_DATA_PATH}/CoLA/test.tsv                                   \
+       --vocab_path script/en_glue/ernie_base/vocab.txt                              \
+       --checkpoints ./checkpoints                                                   \
+       --save_steps 1000                                                             \
+       --weight_decay  0.0                                                           \
+       --warmup_proportion 0.1                                                       \
+       --validation_steps 1000000000                                                 \
+       --epoch $epoch                                                                \
+       --max_seq_len 128                                                             \
+       --ernie_config_path script/en_glue/ernie_base/ernie_config.json               \
+       --learning_rate $lr                                                           \
+       --skip_steps 10                                                               \
+       --num_iteration_per_drop_scope 1                                              \
+       --num_labels 2                                                                \
+       --metric 'matthews_corrcoef'                                                  \
+       --test_save output/test_out.$i.$lr.$batch_size.$epoch.$timestamp.tsv          \
+       --random_seed 1 2>&1 | tee  log/job.$i.$lr.$batch_size.$epoch.$timestamp.log  \
+done
--- a/script/en_glue/ernie_base/MNLI/task.sh
+++ b/script/en_glue/ernie_base/MNLI/task.sh
+#!/bin/bash
+R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
+export FLAGS_eager_delete_tensor_gb=0.0
+export FLAGS_sync_nccl_allreduce=1
+if [[ -f ./model_conf ]];then
+    source ./model_conf
+else
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+fi
+mkdir -p log/
+lr=3e-5
+batch_size=64
+epoch=3
+for i in {1..5};do
+timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
+python -u run_classifier.py                                                             \
+       --use_cuda true                                                                  \
+       --use_fast_executor ${e_executor:-"true"}                                        \
+       --tokenizer ${TOKENIZER:-"FullTokenizer"}                                        \
+       --use_fp16 ${USE_FP16:-"false"}                                                  \
+       --do_train true                                                                  \
+       --do_val true                                                                    \
+       --do_test true                                                                   \
+       --batch_size $batch_size                                                         \
+       --init_pretraining_params ${MODEL_PATH}/params                                   \
+       --verbose true                                                                   \
+       --train_set ${TASK_DATA_PATH}/MNLI/train.tsv                                     \
+       --dev_set   ${TASK_DATA_PATH}/MNLI/m/dev.tsv,${TASK_DATA_PATH}/MNLI/mm/dev.tsv   \
+       --test_set  ${TASK_DATA_PATH}/MNLI/m/test.tsv,${TASK_DATA_PATH}/MNLI/mm/test.tsv \
+       --vocab_path script/en_glue/ernie_base/vocab.txt                                 \
+       --checkpoints ./checkpoints                                                      \
+       --save_steps 25000                                                               \
+       --weight_decay 0.0                                                               \
+       --warmup_proportion 0.1                                                          \
+       --validation_steps 1000000000000                                                 \
+       --epoch $epoch                                                                   \
+       --max_seq_len 128                                                                \
+       --ernie_config_path script/en_glue/ernie_base/ernie_config.json                  \
+       --learning_rate $lr                                                              \
+       --skip_steps 10                                                                  \
+       --num_iteration_per_drop_scope 1                                                 \
+       --num_labels 3                                                                   \
+       --for_cn False                                                                   \
+       --test_save output/test_out.$i.m.tsv,output/test_out.$i.mm.tsv                   \
+       --diagnostic ${TASK_DATA_PATH}/diagnostic.tsv                                    \
+       --diagnostic_save output/test_out.$i.$lr.$batch_size.$epoch.$timestamp.m.diagnostic.tsv \
+       --random_seed 1 2>&1 | tee  log/job.$i.$lr.$batch_size.$epoch.$timestamp.log            \
+done
--- a/script/en_glue/ernie_base/MRPC/task.sh
+++ b/script/en_glue/ernie_base/MRPC/task.sh
+#!/bin/bash
+R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
+export FLAGS_eager_delete_tensor_gb=0.0
+export FLAGS_sync_nccl_allreduce=1
+if [[ -f ./model_conf ]];then
+    source ./model_conf
+else
+    export CUDA_VISIBLE_DEVICES=0,1
+fi
+mkdir -p log/
+lr=3e-5
+batch_size=16
+epoch=4
+for i in {1..5};do
+    timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
+    python -u run_classifier.py                                              \
+           --use_cuda true                                                   \
+           --for_cn  False                                                   \
+           --use_fast_executor ${e_executor:-"true"}                         \
+           --tokenizer ${TOKENIZER:-"FullTokenizer"}                         \
+           --use_fp16 ${USE_FP16:-"false"}                                   \
+           --do_train true                                                   \
+           --do_val true                                                     \
+           --do_test true                                                    \
+           --batch_size 16                                                   \
+           --init_pretraining_params ${MODEL_PATH}/params                    \
+           --verbose true                                                    \
+           --train_set ${TASK_DATA_PATH}/MRPC/train.tsv                      \
+           --dev_set   ${TASK_DATA_PATH}/MRPC/dev.tsv                        \
+           --test_set  ${TASK_DATA_PATH}/MRPC/test.tsv                       \
+           --vocab_path script/en_glue/ernie_base/vocab.txt                  \
+           --checkpoints ./checkpoints                                       \
+           --save_steps 1000                                                 \
+           --weight_decay  0.0                                               \
+           --warmup_proportion 0.1                                           \
+           --validation_steps 1000000                                        \
+           --epoch 4                                                         \
+           --max_seq_len 128                                                 \
+           --ernie_config_path script/en_glue/ernie_base/ernie_config.json   \
+           --learning_rate 3e-5                                              \
+           --skip_steps 10                                                   \
+           --num_iteration_per_drop_scope 1                                  \
+           --num_labels 2                                                    \
+           --metric 'acc_and_f1'                                             \
+           --for_cn  False                                                   \
+           --test_save output/test_out.$i.$lr.$batch_size.$epoch.tsv         \
+           --random_seed 1 2>&1 | tee  log/job.$i.$lr.$batch_size.$epoch.log \
+done
--- a/script/en_glue/ernie_base/QNLI/task.sh
+++ b/script/en_glue/ernie_base/QNLI/task.sh
--- a/script/en_glue/ernie_base/QQP/task.sh
+++ b/script/en_glue/ernie_base/QQP/task.sh
--- a/script/en_glue/ernie_base/RTE/task.sh
+++ b/script/en_glue/ernie_base/RTE/task.sh
--- a/script/en_glue/ernie_base/SST-2/task.sh
+++ b/script/en_glue/ernie_base/SST-2/task.sh
--- a/script/en_glue/ernie_base/STS-B/task.sh
+++ b/script/en_glue/ernie_base/STS-B/task.sh
--- a/script/en_glue/ernie_base/WNLI/task.sh
+++ b/script/en_glue/ernie_base/WNLI/task.sh
--- a/script/en_glue/ernie_base/ernie_config.json
+++ b/script/en_glue/ernie_base/ernie_config.json
+{
+  "attention_probs_dropout_prob": 0.1, 
+  "hidden_act": "gelu", 
+  "hidden_dropout_prob": 0.1, 
+  "hidden_size": 768, 
+  "initializer_range": 0.02, 
+  "max_position_embeddings": 512, 
+  "num_attention_heads": 12, 
+  "num_hidden_layers": 12, 
+  "sent_type_vocab_size": 4, 
+  "task_type_vocab_size": 16, 
+  "vocab_size": 30522
+}
--- a/script/en_glue/ernie_base/vocab.txt
+++ b/script/en_glue/ernie_base/vocab.txt
--- a/script/en_glue/ernie_large/CoLA/task.sh
+++ b/script/en_glue/ernie_large/CoLA/task.sh
--- a/script/en_glue/ernie_large/MNLI/task.sh
+++ b/script/en_glue/ernie_large/MNLI/task.sh
--- a/script/en_glue/ernie_large/MRPC/task.sh
+++ b/script/en_glue/ernie_large/MRPC/task.sh
--- a/script/en_glue/ernie_large/QNLI/task.sh
+++ b/script/en_glue/ernie_large/QNLI/task.sh
--- a/script/en_glue/ernie_large/QQP/task.sh
+++ b/script/en_glue/ernie_large/QQP/task.sh
--- a/script/en_glue/ernie_large/RTE/task.sh
+++ b/script/en_glue/ernie_large/RTE/task.sh
--- a/script/en_glue/ernie_large/SST-2/task.sh
+++ b/script/en_glue/ernie_large/SST-2/task.sh
--- a/script/en_glue/ernie_large/STS-B/task.sh
+++ b/script/en_glue/ernie_large/STS-B/task.sh
--- a/script/en_glue/ernie_large/WNLI/task.sh
+++ b/script/en_glue/ernie_large/WNLI/task.sh
--- a/script/en_glue/ernie_large/ernie_config.json
+++ b/script/en_glue/ernie_large/ernie_config.json
+{
+  "attention_probs_dropout_prob": 0.1, 
+  "hidden_act": "gelu", 
+  "hidden_dropout_prob": 0.1, 
+  "hidden_size": 1024, 
+  "initializer_range": 0.02, 
+  "max_position_embeddings": 512, 
+  "num_attention_heads": 16, 
+  "num_hidden_layers": 24, 
+  "sent_type_vocab_size": 4, 
+  "task_type_vocab_size": 16, 
+  "vocab_size": 30522
+}
--- a/script/en_glue/ernie_large/vocab.txt
+++ b/script/en_glue/ernie_large/vocab.txt
--- a/script/en_glue/preprocess/cvt.sh
+++ b/script/en_glue/preprocess/cvt.sh
--- a/script/en_glue/preprocess/mnli.py
+++ b/script/en_glue/preprocess/mnli.py
--- a/script/en_glue/preprocess/qnli.py
+++ b/script/en_glue/preprocess/qnli.py
--- a/script/zh_task/ernie_base/run_ChnSentiCorp.sh
+++ b/script/zh_task/ernie_base/run_ChnSentiCorp.sh
--- a/script/zh_task/ernie_base/run_bq.sh
+++ b/script/zh_task/ernie_base/run_bq.sh
--- a/script/zh_task/ernie_base/run_cmrc2018.sh
+++ b/script/zh_task/ernie_base/run_cmrc2018.sh
--- a/script/zh_task/ernie_base/run_dbqa.sh
+++ b/script/zh_task/ernie_base/run_dbqa.sh
--- a/script/zh_task/ernie_base/run_drcd.sh
+++ b/script/zh_task/ernie_base/run_drcd.sh
--- a/script/zh_task/ernie_base/run_lcqmc.sh
+++ b/script/zh_task/ernie_base/run_lcqmc.sh
--- a/script/zh_task/ernie_base/run_msra_ner.sh
+++ b/script/zh_task/ernie_base/run_msra_ner.sh
--- a/script/zh_task/ernie_base/run_thuc.sh
+++ b/script/zh_task/ernie_base/run_thuc.sh
--- a/script/zh_task/ernie_base/run_xnli.sh
+++ b/script/zh_task/ernie_base/run_xnli.sh
--- a/ERNIE/script/run_ChnSentiCorp.sh
+++ b/ERNIE/script/run_ChnSentiCorp.sh
@@ -12,8 +12,7 @@ python -u run_classifier.py \
                   --batch_size 24 \
                   --init_pretraining_params ${MODEL_PATH}/params \
                   --train_set ${TASK_DATA_PATH}/chnsenticorp/train.tsv \
-                   --dev_set ${TASK_DATA_PATH}/chnsenticorp/dev.tsv \
+                   --dev_set ${TASK_DATA_PATH}/chnsenticorp/dev.tsv,${TASK_DATA_PATH}/chnsenticorp/test.tsv \
-                   --test_set ${TASK_DATA_PATH}/chnsenticorp/test.tsv \
                   --vocab_path config/vocab.txt \
                   --checkpoints ./checkpoints \
                   --save_steps 1000 \
@@ -23,7 +22,7 @@ python -u run_classifier.py \
                   --epoch 10 \
                   --max_seq_len 256 \
                   --ernie_config_path config/ernie_config.json \
-                   --learning_rate 5e-5 \
+                   --learning_rate 1e-5 \
                   --skip_steps 10 \
                   --num_iteration_per_drop_scope 1 \
                   --num_labels 2 \

--- a/script/zh_task/ernie_large/run_bq.sh
+++ b/script/zh_task/ernie_large/run_bq.sh
--- a/script/zh_task/ernie_large/run_cmrc2018.sh
+++ b/script/zh_task/ernie_large/run_cmrc2018.sh
--- a/ERNIE/script/run_dbqa.sh
+++ b/ERNIE/script/run_dbqa.sh
--- a/script/zh_task/ernie_large/run_drcd.sh
+++ b/script/zh_task/ernie_large/run_drcd.sh
--- a/ERNIE/script/run_lcqmc.sh
+++ b/ERNIE/script/run_lcqmc.sh
--- a/ERNIE/script/run_msra_ner.sh
+++ b/ERNIE/script/run_msra_ner.sh
--- a/script/zh_task/ernie_large/run_thuc.sh
+++ b/script/zh_task/ernie_large/run_thuc.sh
--- a/ERNIE/script/run_xnli.sh
+++ b/ERNIE/script/run_xnli.sh
--- a/ERNIE/script/pretrain.sh
+++ b/ERNIE/script/pretrain.sh
--- a/ERNIE/tokenization.py
+++ b/ERNIE/tokenization.py
--- a/ERNIE/train.py
+++ b/ERNIE/train.py
--- a/ERNIE/utils/__init__.py
+++ b/ERNIE/utils/__init__.py
--- a/ERNIE/utils/args.py
+++ b/ERNIE/utils/args.py
--- a/ERNIE/utils/cards.py
+++ b/ERNIE/utils/cards.py
--- a/utils/cmrc2018_eval.py
+++ b/utils/cmrc2018_eval.py
--- a/ERNIE/utils/fp16.py
+++ b/ERNIE/utils/fp16.py
--- a/ERNIE/utils/init.py
+++ b/ERNIE/utils/init.py