diff --git a/README.md b/README.md
index e1bd0e0ca79ed0a8be6fec793961b63593d3dfa4..8f54d2fb5864bae9db5e4fe6696db76505eb69d4 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,9 @@ You can easily re-produce following competitive results with minor codes, which
 Dataset
-chnsenticorp
-Quora Question Pairs matching
-MSRA-NER (SIGHAN2006)
+chnsenticorp
+Quora Question Pairs matching
+MSRA-NER (SIGHAN2006)
 CMRC2018
@@ -42,31 +42,19 @@ You can easily re-produce following competitive results with minor codes, which
-precision
-recall
+accuracy
 f1-score
-precision
-recall
+accuracy
 f1-score
-precision
-recall
 f1-score
@@ -79,13 +67,13 @@ You can easily re-produce following competitive results with minor codes, which
-test
+test
-test
+test
-test
+test
@@ -94,15 +82,11 @@ You can easily re-produce following competitive results with minor codes, which
 ERNIE Base
-95.7
-95.0
-95.7
-85.8
-82.4
-81.5
-94.9
-94.5
-94.7
+95.8
+95.8
+86.2
+82.2
+99.2
 64.3
 85.2
diff --git a/README_zh.md b/README_zh.md
index bbffd7d7ad66af1e1b0fd4ff8da479786945a58f..678a41d44ba858bdccbbabab6a1732c2211481a4 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -30,9 +30,9 @@ PaddlePALM (PArallel Learning from Multi-tasks) is a flexible, general and easy-to-use
 Dataset
-chnsenticorp
-Quora Question Pairs matching
-MSRA-NER (SIGHAN2006)
+chnsenticorp
+Quora Question Pairs matching
+MSRA-NER (SIGHAN2006)
 CMRC2018
@@ -42,31 +42,19 @@ PaddlePALM (PArallel Learning from Multi-tasks) is a flexible, general and easy-to-use
-precision
-recall
+accuracy
 f1-score
-precision
-recall
+accuracy
 f1-score
-precision
-recall
 f1-score
@@ -79,13 +67,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) is a flexible, general and easy-to-use
-test
+test
-test
+test
-test
+test
@@ -94,17 +82,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) is a flexible, general and easy-to-use
 ERNIE Base
-95.7
-95.0
-95.7
-85.8
-82.4
-81.5
-94.9
-94.5
-94.7
-96.3
-84.0
+95.8
+95.8
+86.2
+82.2
+99.2
+64.3
+85.2
@@ -121,6 +105,16 @@ PaddlePALM (PArallel Learning from Multi-tasks) is a flexible, general and easy-to-use

+
+PaddlePALM is a well-designed, high-level NLP framework. With minimal code on top of PaddlePALM, you can efficiently implement **supervised learning, unsupervised/self-supervised learning, multi-task learning, and transfer learning**. The PaddlePALM architecture has three layers, from bottom to top: the component layer, the trainer layer, and the high-level trainer layer.
+
+At the component layer, PaddlePALM provides six **decoupled** components for building NLP tasks. Each component contains a rich set of `pre-defined` classes and one `Base` class. The `pre-defined` classes target typical NLP tasks, while the `Base` class helps users develop a new class (based on a `pre-defined` class or derived from the `Base` class).
+
+The trainer layer builds the computation graph from the selected components and runs training and prediction. This layer describes the training strategy, model saving and loading, evaluation, and prediction. One trainer handles only one task.
+
+The high-level trainer layer is for complex learning and inference strategies such as multi-task learning. You can add auxiliary tasks to train robust NLP models (improving performance on the test set and out-of-domain data), or jointly train multiple related tasks to obtain higher performance on each of them.
+
+
 | Module | Description |
 | - | - |
 | **paddlepalm** | An open-source NLP pre-training and multi-task learning framework built on the paddlepaddle framework. |
@@ -187,6 +181,8 @@ Available pretrain items:
 ## Usage
 
+#### Quick start
+
 Eight steps to start a typical NLP training task.
 
 1. Use `paddlepalm.reader` to create a `reader` for dataset loading and input feature generation, then call `reader.load_data` to load the training data.
@@ -205,14 +201,8 @@ Available pretrain items:
 - [Tagging](https://github.com/PaddlePaddle/PALM/tree/master/examples/tagging)
 - [SQuAD machine Reading Comprehension](https://github.com/PaddlePaddle/PALM/tree/master/examples/mrc).
 
-### Set up the saver
-
-To save models/checkpoints and logs during training, call the `trainer.set_saver` method. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
-
-### Prediction
-To run prediction and evaluation after training, just create extra reader, backbone and head instances (repeat steps 1-4 above), setting `phase='predict'` when creating them. Then call the trainer's `predict` method (no extra trainer is needed). See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
 
-### Multi-task learning
+#### Multi-task learning
 
 To run in multi-task learning mode:
 
@@ -226,11 +216,31 @@ Available pretrain items:
 
 The save/load and predict operations of a multi_head_trainer are the same as those of a trainer.
 
-For more details on implementing `multi_head_trainer`, see
+For more details on implementing `multi_head_trainer`, see
 
 - [ATIS: joint training of dialogue intent recognition and slot filling](https://github.com/PaddlePaddle/PALM/tree/master/examples/multi-task)
-- [MRQA: learning reading comprehension auxilarized with mask language model]() (not to be added in the first release)
 
+#### Set up the saver
+
+To save models/checkpoints and logs during training, call the `trainer.set_saver` method. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
+
+#### Evaluation/prediction
+To run prediction and evaluation after training, just create extra reader, backbone and head instances (repeat steps 1-4 above), setting `phase='predict'` when creating them. Then call the trainer's `predict` method (no extra trainer is needed). See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
+
+#### Using multiple GPUs
+If there is more than one GPU in your environment, you can control the number and indices of the visible GPUs through the environment variable [CUDA_VISIBLE_DEVICES](https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/). For example, if your environment has 4 GPUs with indices 0, 1, 2 and 3, you can run the following command to use only GPU 2:
+```shell
+CUDA_VISIBLE_DEVICES=2 python run.py
+```
+
+To use multiple GPUs, separate the indices with `,`. For example, to use GPU 2 and GPU 3, run the following command:
+
+```shell
+CUDA_VISIBLE_DEVICES=2,3 python run.py
+```
+
+In multi-GPU mode, PaddlePALM automatically splits each batch of data across the available cards. For example, if `batch_size` is set to 64 and 4 GPUs are available to PaddlePALM, the effective batch size on each GPU is 64/4=16. Therefore, **when using multiple GPUs, make sure that batch_size is divisible by the number of cards**.
 
 ## License
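To make the quick-start steps above concrete, here is a rough single-task sketch in the spirit of the classification example. Only `paddlepalm.reader`, `reader.load_data`, `trainer.set_saver`, `trainer.predict` and `phase='predict'` are named explicitly in this README; every other class, argument and path below (`ClassifyReader`, `ERNIE`, `Classify`, `Trainer`, `Adam`, the pretrain paths) is an assumption to be checked against the scripts under `examples/`.

```python
# Hypothetical quick-start sketch; names other than reader.load_data / set_saver /
# train / predict are assumptions modeled on the examples/ directory, not a verbatim API.
import paddlepalm as palm

# 1. reader: dataset loading and input feature generation
reader = palm.reader.ClassifyReader('pretrain/ERNIE-v1-zh-base/vocab.txt', max_seqlen=128)
reader.load_data('data/train.tsv', batch_size=32, num_epochs=3)

# 2-4. backbone and task head, with the reader registered against the backbone
ernie = palm.backbone.ERNIE.from_config('pretrain/ERNIE-v1-zh-base/ernie_config.json')
reader.register_with(ernie)
head = palm.head.Classify(num_classes=2, input_dim=768, dropout_prob=0.1)

# 5-7. trainer: build the forward graph, the optimizer/backward graph, then fit the reader
trainer = palm.Trainer('senti_cls')
loss_var = trainer.build_forward(ernie, head)
trainer.build_backward(optimizer=palm.optimizer.Adam(loss_var, 5e-5))
trainer.fit_reader(reader)

# 8. load pretrained parameters, set the saver, and train
trainer.load_pretrain('pretrain/ERNIE-v1-zh-base/params')
trainer.set_saver(save_path='outputs/ckpt', save_steps=1000)
trainer.train(print_steps=10)
```

For prediction, the evaluation section above applies: re-create the reader, backbone and head with `phase='predict'` and call `trainer.predict` on the same trainer.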
diff --git a/examples/classification/README.md b/examples/classification/README.md
index b1ed1b1074e30d2d1a205d014794354feb88476f..4ac05170078c858a2399e9659cd1145e76920b93 100644
--- a/examples/classification/README.md
+++ b/examples/classification/README.md
@@ -75,5 +75,5 @@ The evaluation results are as follows:
 
 ```
 data num: 1200
-precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
+accuracy: 0.9575, precision: 0.9634, recall: 0.9523, f1: 0.9578
 ```
diff --git a/examples/classification/evaluate.py b/examples/classification/evaluate.py
index f7949a93cf57a20e0477025099dbba6ae6a56050..4b1b0d39e7a8b92fdd993d51d9564deb58b31fa3 100644
--- a/examples/classification/evaluate.py
+++ b/examples/classification/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
     labels = np.array(labels)
     return (preds == labels).mean()
 
-def f1(preds, labels):
-    preds = np.array(preds)
-    labels = np.array(labels)
-    tp = np.sum((labels == '1') & (preds == '1'))
-    tn = np.sum((labels == '0') & (preds == '0'))
-    fp = np.sum((labels == '0') & (preds == '1'))
-    fn = np.sum((labels == '1') & (preds == '0'))
-    p = tp * 1.0 / (tp + fp)
-    r = tp * 1.0 / (tp + fn) * 1.0
-    f1 = (2 * p * r) / (p + r + 1e-8)
-    return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
     preds = np.array(preds)
     labels = np.array(labels)
     # recall=TP/(TP+FN)
     tp = np.sum((labels == '1') & (preds == '1'))
+    fp = np.sum((labels == '0') & (preds == '1'))
     fn = np.sum((labels == '1') & (preds == '0'))
-    re = tp * 1.0 / (tp + fn)
-    return re
+    r = tp * 1.0 / (tp + fn)
+    # Precision=TP/(TP+FP)
+    p = tp * 1.0 / (tp + fp)
+    epsilon = 1e-31
+    f1 = 2 * p * r / (p + r + epsilon)
+    return p, r, f1
 
 def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
     file.close()
     assert len(labels) == len(preds), "prediction result doesn't match to labels"
     print('data num: {}'.format(len(labels)))
-    print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+    p, r, f1 = pre_recall_f1(preds, labels)
+    print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
 
 res_evaluate()
diff --git a/examples/matching/README.md b/examples/matching/README.md
index 3cb10327c92658b49c3dd8cd4f160571c574b16b..aecb97f405353db0efdee234c023dc33a11de9c7 100644
--- a/examples/matching/README.md
+++ b/examples/matching/README.md
@@ -81,6 +81,6 @@ python evaluate.py
 The evaluation results are as follows:
 
 ```
-data_num: 4300
-precision: 0.857906976744, recall: 0.824249846908, f1: 0.81501664653
+data num: 4300
+accuracy: 0.8619, precision: 0.8061, recall: 0.8377, f1: 0.8216
 ```
diff --git a/examples/matching/evaluate.py b/examples/matching/evaluate.py
index 5ea6da3cee00f46508475597499584ad68f07858..385a52e74cd90f9094e5781df48a21e78bd8d273 100644
--- a/examples/matching/evaluate.py
+++ b/examples/matching/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
     labels = np.array(labels)
     return (preds == labels).mean()
 
-def f1(preds, labels):
-    preds = np.array(preds)
-    labels = np.array(labels)
-    tp = np.sum((labels == '1') & (preds == '1'))
-    tn = np.sum((labels == '0') & (preds == '0'))
-    fp = np.sum((labels == '0') & (preds == '1'))
-    fn = np.sum((labels == '1') & (preds == '0'))
-    p = tp * 1.0 / (tp + fp)
-    r = tp * 1.0 / (tp + fn) * 1.0
-    f1 = (2 * p * r) / (p + r + 1e-8)
-    return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
     preds = np.array(preds)
     labels = np.array(labels)
     # recall=TP/(TP+FN)
     tp = np.sum((labels == '1') & (preds == '1'))
+    fp = np.sum((labels == '0') & (preds == '1'))
     fn = np.sum((labels == '1') & (preds == '0'))
-    re = tp * 1.0 / (tp + fn)
-    return re
+    r = tp * 1.0 / (tp + fn)
+    # Precision=TP/(TP+FP)
+    p = tp * 1.0 / (tp + fp)
+    epsilon = 1e-31
+    f1 = 2 * p * r / (p + r + epsilon)
+    return p, r, f1
 
 def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
     file.close()
     assert len(labels) == len(preds), "prediction result({}) doesn't match to labels({})".format(len(preds), len(labels))
     print('data num: {}'.format(len(labels)))
-    print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+    p, r, f1 = pre_recall_f1(preds, labels)
+    print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
 
 res_evaluate()
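The `pre_recall_f1` helper introduced in the two evaluate.py scripts above treats the string label '1' as the positive class and returns precision, recall and F1 in a single pass over the predictions. A small self-contained sanity check on toy labels (the function body is copied from the diff above):

```python
# Sanity check for the pre_recall_f1 helper added above (body copied from the diff);
# preds/labels are the string classes '0'/'1' produced by the example readers.
import numpy as np

def pre_recall_f1(preds, labels):
    preds = np.array(preds)
    labels = np.array(labels)
    # recall = TP / (TP + FN)
    tp = np.sum((labels == '1') & (preds == '1'))
    fp = np.sum((labels == '0') & (preds == '1'))
    fn = np.sum((labels == '1') & (preds == '0'))
    r = tp * 1.0 / (tp + fn)
    # precision = TP / (TP + FP)
    p = tp * 1.0 / (tp + fp)
    epsilon = 1e-31
    f1 = 2 * p * r / (p + r + epsilon)
    return p, r, f1

preds = ['1', '1', '0', '0', '1']   # toy predictions
labels = ['1', '0', '0', '1', '1']  # toy gold labels
p, r, f1 = pre_recall_f1(preds, labels)
print("precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(p, r, f1))
# precision: 0.6667, recall: 0.6667, f1: 0.6667
```

Note that the function is binary: only the class labelled '1' contributes to precision and recall, while `accuracy` stays meaningful for multi-class outputs.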
diff --git a/examples/mrc/README.md b/examples/mrc/README.md
index 9d40fd0effbd278a7824aa25f716402eafc5c95c..6d01a3563d9398dfd32fe18280da6d83edbc9a4f 100644
--- a/examples/mrc/README.md
+++ b/examples/mrc/README.md
@@ -94,5 +94,5 @@ The evaluation results are as follows:
 
 ```
 data_num: 3219
-em_sroce: 64.3367505436, f1: 85.1781896843
+em_score: 0.6434, f1: 0.8518
 ```
diff --git a/examples/mrc/evaluate.py b/examples/mrc/evaluate.py
index a1bc874a6aa5b80ed4fcb705507abb4c389fb793..bd6ba5fa8c78ea98be358c48cbdc3941d1f3fd59 100644
--- a/examples/mrc/evaluate.py
+++ b/examples/mrc/evaluate.py
@@ -121,8 +121,8 @@ def evaluate(ground_truth_file, prediction_file):
         f1 += calc_f1_score(answers, prediction)
         em += calc_em_score(answers, prediction)
 
-    f1_score = 100.0 * f1 / total_count
-    em_score = 100.0 * em / total_count
+    f1_score = f1 / total_count
+    em_score = em / total_count
     return f1_score, em_score, total_count, skip_count
 
 
@@ -164,4 +164,4 @@ def eval_file(dataset_file, prediction_file):
 if __name__ == '__main__':
     EM, F1, AVG, TOTAL = eval_file("data/dev.json", "outputs/predict/predictions.json")
     print('data_num: {}'.format(TOTAL))
-    print('em_sroce: {}, f1: {}'.format(EM,F1))
+    print('em_score: {:.4f}, f1: {:.4f}'.format(EM, F1))
diff --git a/examples/multi-task/README.md b/examples/multi-task/README.md
index 45c9ea40b32fb24edfa48fd2491419a8958d9bba..63038ab0fd5368a4c183b6e0a461d021e9a72e7a 100644
--- a/examples/multi-task/README.md
+++ b/examples/multi-task/README.md
@@ -118,11 +118,12 @@ The evaluation results are as follows:
 
 `atis_slot`:
 
 ```
-precision: 0.894397728514, recall: 0.894104803493, f1: 0.894251242016
+data num: 891
+f1: 0.8934
 ```
 
 `atis_intent`:
 
 ```
 data num: 893
-precision: 0.708846584546, recall: 1.0, f1: 0.999999995
+accuracy: 0.7088, precision: 1.0000, recall: 1.0000, f1: 1.0000
 ```
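The ATIS example above jointly trains dialogue intent recognition and slot filling. A very rough sketch of that mode follows, wrapping one regular trainer per task in a multi-head trainer; the `MultiHeadTrainer` name and every signature here are assumptions inferred from the README prose, so treat `examples/multi-task/run.py` as the authoritative reference.

```python
# Hypothetical multi-task sketch; all class/method names are assumptions,
# see examples/multi-task/run.py for the real code.
import paddlepalm as palm

# ernie, intent_head, slot_head, intent_reader and slot_reader are assumed to be
# built as in the single-task sketch earlier (backbone, heads, readers per task).

trainer_intent = palm.Trainer('intent', mix_ratio=1.0)  # intent classification task
trainer_slot = palm.Trainer('slot', mix_ratio=1.0)      # slot filling task

multi_trainer = palm.MultiHeadTrainer([trainer_intent, trainer_slot])
loss_var = multi_trainer.build_forward(ernie, [intent_head, slot_head])
multi_trainer.build_backward(optimizer=palm.optimizer.Adam(loss_var, 5e-5))

# one reader per task; batches are sampled from the tasks according to mix_ratio
multi_trainer.fit_readers_with_mixratio([intent_reader, slot_reader], 'intent', num_epochs=2)
multi_trainer.load_pretrain('pretrain/ERNIE-v1-en-base/params')
multi_trainer.train(print_steps=10)
```

As the README notes, save/load and predict on a multi_head_trainer work the same way as on a plain trainer.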
diff --git a/examples/multi-task/evaluate-intent.py b/examples/multi-task/evaluate-intent.py
index 5d2feb5e9f6c8344f29a14c3bfbd7b12240cbad8..49e635e7b674b16a874fe29f8e8adb6b8a58fb4b 100644
--- a/examples/multi-task/evaluate-intent.py
+++ b/examples/multi-task/evaluate-intent.py
@@ -7,27 +7,20 @@ def accuracy(preds, labels):
     preds = np.array(preds)
     labels = np.array(labels)
     return (preds == labels).mean()
-
-def f1(preds, labels):
-    preds = np.array(preds)
-    labels = np.array(labels)
-    tp = np.sum((labels == '1') & (preds == '1'))
-    tn = np.sum((labels == '0') & (preds == '0'))
-    fp = np.sum((labels == '0') & (preds == '1'))
-    fn = np.sum((labels == '1') & (preds == '0'))
-    p = tp * 1.0 / (tp + fp) * 1.0
-    r = tp * 1.0 / (tp + fn) * 1.0
-    f1 = (2 * p * r) / (p + r + 1e-8)
-    return f1
 
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
     preds = np.array(preds)
     labels = np.array(labels)
     # recall=TP/(TP+FN)
     tp = np.sum((labels == '1') & (preds == '1'))
+    fp = np.sum((labels == '0') & (preds == '1'))
     fn = np.sum((labels == '1') & (preds == '0'))
-    re = tp * 1.0 / (tp + fn)
-    return re
+    r = tp * 1.0 / (tp + fn)
+    # Precision=TP/(TP+FP)
+    p = tp * 1.0 / (tp + fp)
+    epsilon = 1e-31
+    f1 = 2 * p * r / (p + r + epsilon)
+    return p, r, f1
 
 def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase='test'):
@@ -59,6 +52,7 @@ def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase
     file.close()
     assert len(labels) == len(preds), "prediction result doesn't match to labels"
     print('data num: {}'.format(len(labels)))
-    print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+    p, r, f1 = pre_recall_f1(preds, labels)
+    print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
 
 res_evaluate()
diff --git a/examples/multi-task/evaluate-slot.py b/examples/multi-task/evaluate-slot.py
index d598e96b5f35abd03696ccb1870b94e6eb89a57c..20c83be49668b8edeaed9da9f0c3876d04b4b436 100644
--- a/examples/multi-task/evaluate-slot.py
+++ b/examples/multi-task/evaluate-slot.py
@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/atis/atis_slot/label_map.json"):
     return json.load(open(map_dir, "r"))
 
 
-def cal_chunk(total_res, total_label):
-    assert len(total_label) == len(total_res), "prediction result doesn't match to labels, {}, {}".format(len(total_res), len(total_label))
-    num_labels = 0
-    num_corr = 0
-    num_infers = 0
-    for res, label in zip(total_res, total_label):
-        assert len(res) == len(label), "prediction result doesn't match to labels, {}, {}".format(len(res), len(label))
-        num_labels += sum([0 if i == 6 else 1 for i in label])
-        num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
-        num_infers += sum([0 if i == 6 else 1 for i in res])
+def cal_chunk(pred_label, refer_label):
+    tp = dict()
+    fn = dict()
+    fp = dict()
+    for i in range(len(refer_label)):
+        if refer_label[i] == pred_label[i]:
+            if refer_label[i] not in tp:
+                tp[refer_label[i]] = 0
+            tp[refer_label[i]] += 1
+        else:
+            if pred_label[i] not in fp:
+                fp[pred_label[i]] = 0
+            fp[pred_label[i]] += 1
+            if refer_label[i] not in fn:
+                fn[refer_label[i]] = 0
+            fn[refer_label[i]] += 1
 
-    precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
-    recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
-    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
+    tp_total = sum(tp.values())
+    fn_total = sum(fn.values())
+    fp_total = sum(fp.values())
+    p_total = float(tp_total) / (tp_total + fp_total)
+    r_total = float(tp_total) / (tp_total + fn_total)
+    f_micro = 2 * p_total * r_total / (p_total + r_total)
 
-    return precision, recall, f1
+    return f_micro
 
 
 def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./data/atis/atis_slot/test.tsv"):
@@ -72,7 +81,18 @@ def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./data/atis/atis_slot/test.tsv"):
 
         cnt += 1
 
-    precision, recall, f1 = cal_chunk(total_res, total_label)
-    print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
+    total_res_equal = []
+    total_label_equal = []
+    assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
+    for i in range(len(total_label)):
+        num = len(total_label[i])
+        total_label_equal.extend(total_label[i])
+        total_res[i] = total_res[i][:num]
+        total_res_equal.extend(total_res[i])
+
+    f1 = cal_chunk(total_res_equal, total_label_equal)
+    print('data num: {}'.format(len(total_label)))
+    print("f1: {:.4f}".format(f1))
+
 
 res_evaluate()
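The rewritten `cal_chunk` above computes a token-level micro-averaged F1 over flat, equal-length lists of tag ids, which is why `res_evaluate` now truncates each predicted sequence to its reference length and flattens before calling it; unlike the old implementation, it no longer excludes tag id 6, which helps explain the shift in the reported slot and tagging numbers. A condensed but behavior-equivalent standalone check on toy tag ids:

```python
# Condensed, behavior-equivalent version of the new cal_chunk for a toy check;
# inputs are flat lists of tag ids (toy values below), already aligned in length.
def cal_chunk(pred_label, refer_label):
    tp, fp, fn = {}, {}, {}
    for p, r in zip(pred_label, refer_label):
        if p == r:
            tp[r] = tp.get(r, 0) + 1
        else:
            fp[p] = fp.get(p, 0) + 1
            fn[r] = fn.get(r, 0) + 1
    tp_total, fp_total, fn_total = sum(tp.values()), sum(fp.values()), sum(fn.values())
    p_total = float(tp_total) / (tp_total + fp_total)
    r_total = float(tp_total) / (tp_total + fn_total)
    return 2 * p_total * r_total / (p_total + r_total)  # micro-averaged F1

refer = [1, 2, 2, 6, 6, 3]  # reference tag ids
pred  = [1, 2, 4, 6, 6, 3]  # predicted tag ids, one token wrong
print("f1: {:.4f}".format(cal_chunk(pred, refer)))  # f1: 0.8333
```

Because every token is either a true positive or counted once as a false positive and once as a false negative, this micro F1 reduces to token accuracy.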
diff --git a/examples/predict/README.md b/examples/predict/README.md
index 49d05129c0e2f9583d22c0c3348dc54b6f18b888..19743f09642f68f8dd0bb118d91d5e1812d6cc95 100644
--- a/examples/predict/README.md
+++ b/examples/predict/README.md
@@ -44,5 +44,5 @@ The evaluation results are as follows:
 
 ```
 data num: 1200
-precision: 0.494166666667, recall: 0.0444078947368, f1: 0.0816944009455
+accuracy: 0.4758, precision: 0.4730, recall: 0.3026, f1: 0.3691
 ```
diff --git a/examples/predict/evaluate.py b/examples/predict/evaluate.py
index f7949a93cf57a20e0477025099dbba6ae6a56050..4b1b0d39e7a8b92fdd993d51d9564deb58b31fa3 100644
--- a/examples/predict/evaluate.py
+++ b/examples/predict/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
     labels = np.array(labels)
     return (preds == labels).mean()
 
-def f1(preds, labels):
-    preds = np.array(preds)
-    labels = np.array(labels)
-    tp = np.sum((labels == '1') & (preds == '1'))
-    tn = np.sum((labels == '0') & (preds == '0'))
-    fp = np.sum((labels == '0') & (preds == '1'))
-    fn = np.sum((labels == '1') & (preds == '0'))
-    p = tp * 1.0 / (tp + fp)
-    r = tp * 1.0 / (tp + fn) * 1.0
-    f1 = (2 * p * r) / (p + r + 1e-8)
-    return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
     preds = np.array(preds)
     labels = np.array(labels)
     # recall=TP/(TP+FN)
     tp = np.sum((labels == '1') & (preds == '1'))
+    fp = np.sum((labels == '0') & (preds == '1'))
     fn = np.sum((labels == '1') & (preds == '0'))
-    re = tp * 1.0 / (tp + fn)
-    return re
+    r = tp * 1.0 / (tp + fn)
+    # Precision=TP/(TP+FP)
+    p = tp * 1.0 / (tp + fp)
+    epsilon = 1e-31
+    f1 = 2 * p * r / (p + r + epsilon)
+    return p, r, f1
 
 def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
     file.close()
     assert len(labels) == len(preds), "prediction result doesn't match to labels"
     print('data num: {}'.format(len(labels)))
-    print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+    p, r, f1 = pre_recall_f1(preds, labels)
+    print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
 
 res_evaluate()
diff --git a/examples/tagging/README.md b/examples/tagging/README.md
index f58fd3b42f0c6fb73b6eced1529631419ab766cb..465e611ea1396f125b4547391a7a68e073a6930e 100644
--- a/examples/tagging/README.md
+++ b/examples/tagging/README.md
@@ -74,5 +74,6 @@ python evaluate.py
 The evaluation results are as follows:
 
 ```
-precision: 0.948718989809, recall: 0.944806113784, f1: 0.946758508914
+data num: 4636
+f1: 0.9918
 ```
diff --git a/examples/tagging/evaluate.py b/examples/tagging/evaluate.py
index db20b919a0e3c819b6cc1768249dccf5c014cbe6..d7812eaa97511b572874ee5aaa582a5e5cb10866 100644
--- a/examples/tagging/evaluate.py
+++ b/examples/tagging/evaluate.py
@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/label_map.json"):
     return json.load(open(map_dir, "r"))
 
 
-def cal_chunk(total_res, total_label):
-    assert len(total_label) == len(total_res), 'prediction result doesn\'t match to labels'
-    num_labels = 0
-    num_corr = 0
-    num_infers = 0
-    for res, label in zip(total_res, total_label):
-        assert len(res) == len(label), "prediction result doesn\'t match to labels"
-        num_labels += sum([0 if i == 6 else 1 for i in label])
-        num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
-        num_infers += sum([0 if i == 6 else 1 for i in res])
+def cal_chunk(pred_label, refer_label):
+    tp = dict()
+    fn = dict()
+    fp = dict()
+    for i in range(len(refer_label)):
+        if refer_label[i] == pred_label[i]:
+            if refer_label[i] not in tp:
+                tp[refer_label[i]] = 0
+            tp[refer_label[i]] += 1
+        else:
+            if pred_label[i] not in fp:
+                fp[pred_label[i]] = 0
+            fp[pred_label[i]] += 1
+            if refer_label[i] not in fn:
+                fn[refer_label[i]] = 0
+            fn[refer_label[i]] += 1
 
-    precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
-    recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
-    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
+    tp_total = sum(tp.values())
+    fn_total = sum(fn.values())
+    fp_total = sum(fp.values())
+    p_total = float(tp_total) / (tp_total + fp_total)
+    r_total = float(tp_total) / (tp_total + fn_total)
+    f_micro = 2 * p_total * r_total / (p_total + r_total)
 
-    return precision, recall, f1
+    return f_micro
 
 
 def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
@@ -48,7 +57,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
         labels = line[1].split("\x02")
         total_label.append(labels)
     total_label = [[label_map[j] for j in i] for i in total_label]
-    
+
     total_res = []
     with open(res_dir, "r") as file:
         cnt = 0
@@ -72,7 +81,17 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
 
         cnt += 1
 
-    precision, recall, f1 = cal_chunk(total_res, total_label)
-    print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
+    total_res_equal = []
+    total_label_equal = []
+    assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
+    for i in range(len(total_label)):
+        num = len(total_label[i])
+        total_label_equal.extend(total_label[i])
+        total_res[i] = total_res[i][:num]
+        total_res_equal.extend(total_res[i])
+
+    f1 = cal_chunk(total_res_equal, total_label_equal)
+    print('data num: {}'.format(len(total_label)))
+    print("f1: {:.4f}".format(f1))
 
 res_evaluate()