Unverified commit 9e0f06d9, authored by Xiaoyao Xi, committed by GitHub

Merge pull request #65 from wangxiao1021/api

update evaluation && README
......@@ -30,9 +30,9 @@ You can easily re-produce following competitive results with minor codes, which
<tr>
<th><strong>Dataset</strong>
<br></th>
<th colspan="3"><center><strong>chnsenticorp</strong></center></th>
<th colspan="3"><center><strong>Quora Question Pairs matching</strong><center></th>
<th colspan="3"><strong>MSRA-NER<br>(SIGHAN2006)</strong></th>
<th colspan="2"><center><strong>chnsenticorp</strong></center></th>
<th colspan="2"><center><strong>Quora Question Pairs matching</strong><center></th>
<th colspan="1"><strong>MSRA-NER<br>(SIGHAN2006)</strong></th>
<th colspan="2"><strong>CMRC2018</strong></th>
</tr>
<tr>
......@@ -42,31 +42,19 @@ You can easily re-produce following competitive results with minor codes, which
<br></p>
</td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<center><strong>accuracy</strong></center>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
<br></td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<center><strong>accuracy</strong></center>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
<br></td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
......@@ -79,13 +67,13 @@ You can easily re-produce following competitive results with minor codes, which
<br></td>
</tr>
<tr>
<td colspan="3" width="">
<td colspan="2" width="">
<strong>test</strong>
<br></td>
<td colspan="3" width="">
<td colspan="2" width="">
<strong>test</strong>
<br></td>
<td colspan="3" width="">
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="2" width="">
......@@ -94,15 +82,11 @@ You can easily re-produce following competitive results with minor codes, which
</tr>
<tr>
<td><strong>ERNIE Base</strong></td>
<td>95.7</td>
<td>95.0</td>
<td>95.7</td>
<td>85.8</td>
<td>82.4</td>
<td>81.5</td>
<td>94.9</td>
<td>94.5</td>
<td>94.7</td>
<td>95.8</td>
<td>95.8</td>
<td>86.2</td>
<td>82.2</td>
<td>99.2</td>
<td>64.3</td>
<td>85.2</td>
</tr>
......
......@@ -30,9 +30,9 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
<tr>
<th><strong>Dataset</strong>
<br></th>
<th colspan="3"><center><strong>chnsenticorp</strong></center></th>
<th colspan="3"><center><strong>Quora Question Pairs matching</strong><center></th>
<th colspan="3"><strong>MSRA-NER<br>(SIGHAN2006)</strong></th>
<th colspan="2"><center><strong>chnsenticorp</strong></center></th>
<th colspan="2"><center><strong>Quora Question Pairs matching</strong><center></th>
<th colspan="1"><strong>MSRA-NER<br>(SIGHAN2006)</strong></th>
<th colspan="2"><strong>CMRC2018</strong></th>
</tr>
<tr>
......@@ -42,31 +42,19 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
<br></p>
</td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<center><strong>accuracy</strong></center>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
<br></td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<center><strong>accuracy</strong></center>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
<br></td>
<td colspan="1">
<center><strong>precision</strong></center>
<br></td>
<td colspan="1">
<strong>recall</strong>
<br></td>
<td colspan="1">
<strong>f1-score</strong>
<strong></strong>
......@@ -79,13 +67,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
<br></td>
</tr>
<tr>
<td colspan="3" width="">
<td colspan="2" width="">
<strong>test</strong>
<br></td>
<td colspan="3" width="">
<td colspan="2" width="">
<strong>test</strong>
<br></td>
<td colspan="3" width="">
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="2" width="">
......@@ -94,17 +82,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
</tr>
<tr>
<td><strong>ERNIE Base</strong></td>
<td>95.7</td>
<td>95.0</td>
<td>95.7</td>
<td>85.8</td>
<td>82.4</td>
<td>81.5</td>
<td>94.9</td>
<td>94.5</td>
<td>94.7</td>
<td>96.3</td>
<td>84.0</td>
<td>95.8</td>
<td>95.8</td>
<td>86.2</td>
<td>82.2</td>
<td>99.2</td>
<td>64.3</td>
<td>85.2</td>
</tr>
</tbody>
......@@ -121,6 +105,16 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
</p>
</p>
PaddlePALM is a well-designed high-level NLP framework. With lightweight code built on PaddlePALM, you can efficiently implement **supervised learning, unsupervised/self-supervised learning, multi-task learning, and transfer learning**. The PaddlePALM architecture has three layers, which are, from bottom to top, the component layer, the trainer layer, and the high-level trainer layer.
In the component layer, PaddlePALM provides six **decoupled** components for building NLP tasks. Each component contains a rich set of predefined classes and one base class. The predefined classes target typical NLP tasks, while the base class helps users develop new classes (based on either a predefined class or the base class).
The trainer layer builds the computation graph from the selected components and runs training and prediction. This layer describes the training strategy, model saving and loading, and the evaluation and prediction process. One trainer handles only one task.
The high-level trainer layer is for complex learning and inference strategies such as multi-task learning. You can add auxiliary tasks to train robust NLP models (improving performance on the test set and on out-of-domain data), or jointly train multiple related tasks to achieve higher performance on each of them.
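The sketch below shows how the three layers compose in code. Only `paddlepalm.reader`, `reader.load_data`, `trainer.set_saver`, and `trainer.predict` are named in this README; the concrete class names, method names, and paths used here (e.g. `ClassifyReader`, `ERNIE.from_config`, `build_forward`) are illustrative assumptions modeled on the linked examples and may differ from the actual API.

```python
import paddlepalm as palm

# component layer (assumed class and argument names): reader, backbone, task head
reader = palm.reader.ClassifyReader('pretrain/ernie/vocab.txt', max_len=128)
reader.load_data('data/train.tsv', batch_size=32)           # load the training data
backbone = palm.backbone.ERNIE.from_config('pretrain/ernie/ernie_config.json')
head = palm.head.Classify(num_classes=2, input_dim=768)

# trainer layer: one trainer handles exactly one task
trainer = palm.Trainer('senti_cls')
trainer.build_forward(backbone, head)                        # wire the components into a graph
trainer.build_backward(optimizer=palm.optimizer.Adam(trainer.loss_var, lr=5e-5))
trainer.fit_reader(reader)
trainer.train(print_steps=10)

# high-level trainer layer: joint training over several trainers (see the multi-task section)
# mh_trainer = palm.MultiHeadTrainer([trainer, aux_trainer])
```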
| Module | Description |
| - | - |
| **paddlepalm** | An open-source NLP pretraining and multi-task learning framework, built on the PaddlePaddle framework. |
......@@ -187,6 +181,8 @@ Available pretrain items:
## Usage
#### Quick Start
Eight steps to start a typical NLP training task.
1. Use `paddlepalm.reader` to create a `reader` for dataset loading and input feature generation, then call `reader.load_data` to load the training data.
......@@ -205,14 +201,8 @@ Available pretrain items:
- [Tagging](https://github.com/PaddlePaddle/PALM/tree/master/examples/tagging)
- [SQuAD machine Reading Comprehension](https://github.com/PaddlePaddle/PALM/tree/master/examples/mrc).
### Set up a saver
To save models/checkpoints and logs during training, call the `trainer.set_saver` method. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
### Prediction
To predict and evaluate after training, simply create extra reader, backbone, and head instances (repeat steps 1–4 above), taking care to set `phase='predict'` when creating them. Then run prediction with the trainer's `predict` method (no extra trainer is needed). See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
### Multi-task Learning
#### Multi-task Learning
To run in multi-task learning mode:
......@@ -226,11 +216,31 @@ Available pretrain items:
The save/load and predict operations of a multi_head_trainer are the same as those of a trainer.
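A hedged sketch of the joint-training flow; apart from `multi_head_trainer` itself, the class, method, and argument names below are assumptions drawn from the linked ATIS example.

```python
# Hypothetical sketch: intent_trainer/slot_trainer and their readers are built as in Quick Start.
mh_trainer = palm.MultiHeadTrainer([intent_trainer, slot_trainer])
mh_trainer.fit_readers_with_mixratio([intent_reader, slot_reader], 'intent', num_epochs=2)
mh_trainer.train(print_steps=10)   # save/load/predict mirror the single-task trainer interface
```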
For more details on implementing `multi_head_trainer`, see
- [ATIS: joint training of dialogue intent recognition and slot filling](https://github.com/PaddlePaddle/PALM/tree/master/examples/multi-task)
- [MRQA: learning reading comprehension auxiliarized with mask language model]() (not included in the initial release)
#### Set up a saver
To save models/checkpoints and logs during training, call the `trainer.set_saver` method, as sketched below. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
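A one-line sketch; the argument names (`save_path`, `save_steps`) are assumptions, so check the linked examples for the exact signature.

```python
# Assumed argument names: save a checkpoint every 100 steps under ./outputs/ckpt
trainer.set_saver(save_path='./outputs/ckpt', save_steps=100)
```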
#### Evaluation/Prediction
To predict and evaluate after training, simply create extra reader, backbone, and head instances (repeat steps 1–4 above), taking care to set `phase='predict'` when creating them. Then run prediction with the trainer's `predict` method (no extra trainer is needed), as in the sketch below. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
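A hedged sketch of this flow; only `phase='predict'` and `trainer.predict` are stated in this README, and the remaining names follow the same assumptions as the Quick Start sketch above.

```python
# re-create reader / backbone / head with phase='predict' (assumed class names, per steps 1-4)
pred_reader = palm.reader.ClassifyReader('pretrain/ernie/vocab.txt', max_len=128, phase='predict')
pred_reader.load_data('data/test.tsv', batch_size=32)
pred_backbone = palm.backbone.ERNIE.from_config('pretrain/ernie/ernie_config.json', phase='predict')
pred_head = palm.head.Classify(num_classes=2, input_dim=768, phase='predict')

# reuse the trained trainer; no extra trainer is needed
trainer.build_predict_forward(pred_backbone, pred_head)     # assumed method name
trainer.fit_reader(pred_reader, phase='predict')
trainer.predict(print_steps=20)
```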
#### Using Multiple GPUs
If multiple GPUs are available in your environment, you can control how many of them are used, and which ones, through the environment variable [CUDA_VISIBLE_DEVICES](https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/). For example, if your environment has 4 GPUs with indices 0, 1, 2, 3, you can run the following command to use only GPU 2:
```shell
CUDA_VISIBLE_DEVICES=2 python run.py
```
To use multiple GPUs, separate the indices with `,`. For example, to use GPU 2 and GPU 3, run the following command:
```shell
CUDA_VISIBLE_DEVICES=2,3 python run.py
```
In multi-GPU mode, PaddlePALM automatically splits each batch of data across the available cards. For example, if `batch_size` is set to 64 and 4 GPUs are available to PaddlePALM, the effective batch_size on each GPU is 64/4 = 16. Therefore, **when using multiple GPUs, make sure that the batch_size you set is divisible by the number of cards**.
## License
......
......@@ -75,5 +75,5 @@ The evaluation results are as follows:
```
data num: 1200
precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
accuracy: 0.9575, precision: 0.9634, recall: 0.9523, f1: 0.9578
```
......@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
r = tp * 1.0 / (tp + fn)
# Precision=TP/(TP+FP)
p = tp * 1.0 / (tp + fp)
epsilon = 1e-31
f1 = 2 * p * r / (p+r+epsilon)
return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
......@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
p, r, f1 = pre_recall_f1(preds, labels)
print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
......@@ -81,6 +81,6 @@ python evaluate.py
The evaluation results are as follows:
```
data_num: 4300
precision: 0.857906976744, recall: 0.824249846908, f1: 0.81501664653
data num: 4300
accuracy: 0.8619, precision: 0.8061, recall: 0.8377, f1: 0.8216
```
......@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
r = tp * 1.0 / (tp + fn)
# Precision=TP/(TP+FP)
p = tp * 1.0 / (tp + fp)
epsilon = 1e-31
f1 = 2 * p * r / (p+r+epsilon)
return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
......@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result({}) doesn't match to labels({})".format(len(preds),len(labels))
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
p, r, f1 = pre_recall_f1(preds, labels)
print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
......@@ -94,5 +94,5 @@ The evaluation results are as follows:
```
data_num: 3219
em_score: 64.3367505436, f1: 85.1781896843
em_score: 0.6434, f1: 0.8518
```
......@@ -121,8 +121,8 @@ def evaluate(ground_truth_file, prediction_file):
f1 += calc_f1_score(answers, prediction)
em += calc_em_score(answers, prediction)
f1_score = 100.0 * f1 / total_count
em_score = 100.0 * em / total_count
f1_score = f1 / total_count
em_score = em / total_count
return f1_score, em_score, total_count, skip_count
......@@ -164,4 +164,4 @@ def eval_file(dataset_file, prediction_file):
if __name__ == '__main__':
EM, F1, AVG, TOTAL = eval_file("data/dev.json", "outputs/predict/predictions.json")
print('data_num: {}'.format(TOTAL))
print('em_score: {}, f1: {}'.format(EM, F1))
print('em_score: {:.4f}, f1: {:.4f}'.format(EM, F1))
......@@ -118,11 +118,12 @@ The evaluation results are as follows:
`atis_slot`:
```
precision: 0.894397728514, recall: 0.894104803493, f1: 0.894251242016
data num: 891
f1: 0.8934
```
`atis_intent`:
```
data num: 893
precision: 0.708846584546, recall: 1.0, f1: 0.999999995
accuracy: 0.7088, precision: 1.0000, recall: 1.0000, f1: 1.0000
```
......@@ -7,27 +7,20 @@ def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp) * 1.0
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
r = tp * 1.0 / (tp + fn)
# Precision=TP/(TP+FP)
p = tp * 1.0 / (tp + fp)
epsilon = 1e-31
f1 = 2 * p * r / (p+r+epsilon)
return p, r, f1
def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase='test'):
......@@ -59,6 +52,7 @@ def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
p, r, f1 = pre_recall_f1(preds, labels)
print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
......@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/atis/atis_slot/label_map.json"):
return json.load(open(map_dir, "r"))
def cal_chunk(total_res, total_label):
assert len(total_label) == len(total_res), "prediction result doesn't match to labels, {}, {}".format(len(total_res),len(total_label))
num_labels = 0
num_corr = 0
num_infers = 0
for res, label in zip(total_res, total_label):
assert len(res) == len(label), "prediction result doesn't match to labels, {}, {}".format(len(res),len(label))
num_labels += sum([0 if i == 6 else 1 for i in label])
num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
num_infers += sum([0 if i == 6 else 1 for i in res])
def cal_chunk(pred_label, refer_label):
tp = dict()
fn = dict()
fp = dict()
for i in range(len(refer_label)):
if refer_label[i] == pred_label[i]:
if refer_label[i] not in tp:
tp[refer_label[i]] = 0
tp[refer_label[i]] += 1
else:
if pred_label[i] not in fp:
fp[pred_label[i]] = 0
fp[pred_label[i]] += 1
if refer_label[i] not in fn:
fn[refer_label[i]] = 0
fn[refer_label[i]] += 1
precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
tp_total = sum(tp.values())
fn_total = sum(fn.values())
fp_total = sum(fp.values())
p_total = float(tp_total) / (tp_total + fp_total)
r_total = float(tp_total) / (tp_total + fn_total)
f_micro = 2 * p_total * r_total / (p_total + r_total)
return precision, recall, f1
return f_micro
def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./data/atis/atis_slot/test.tsv"):
......@@ -72,7 +81,18 @@ def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./
cnt += 1
precision, recall, f1 = cal_chunk(total_res, total_label)
print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
total_res_equal = []
total_label_equal = []
assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
for i in range(len(total_label)):
num = len(total_label[i])
total_label_equal.extend(total_label[i])
total_res[i] = total_res[i][:num]
total_res_equal.extend(total_res[i])
f1 = cal_chunk(total_res_equal, total_label_equal)
print('data num: {}'.format(len(total_label)))
print("f1: {:.4f}".format(f1))
res_evaluate()
......@@ -44,5 +44,5 @@ The evaluation results are as follows:
```
data num: 1200
precision: 0.494166666667, recall: 0.0444078947368, f1: 0.0816944009455
accuracy: 0.4758, precision: 0.4730, recall: 0.3026, f1: 0.3691
```
......@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
r = tp * 1.0 / (tp + fn)
# Precision=TP/(TP+FP)
p = tp * 1.0 / (tp + fp)
epsilon = 1e-31
f1 = 2 * p * r / (p+r+epsilon)
return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
......@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
p, r, f1 = pre_recall_f1(preds, labels)
print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
......@@ -74,5 +74,6 @@ python evaluate.py
The evaluation results are as follows:
```
precision: 0.948718989809, recall: 0.944806113784, f1: 0.946758508914
data num: 4636
f1: 0.9918
```
......@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/label_map.json"):
return json.load(open(map_dir, "r"))
def cal_chunk(total_res, total_label):
assert len(total_label) == len(total_res), 'prediction result doesn\'t match to labels'
num_labels = 0
num_corr = 0
num_infers = 0
for res, label in zip(total_res, total_label):
assert len(res) == len(label), "prediction result doesn\'t match to labels"
num_labels += sum([0 if i == 6 else 1 for i in label])
num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
num_infers += sum([0 if i == 6 else 1 for i in res])
def cal_chunk(pred_label, refer_label):
tp = dict()
fn = dict()
fp = dict()
for i in range(len(refer_label)):
if refer_label[i] == pred_label[i]:
if refer_label[i] not in tp:
tp[refer_label[i]] = 0
tp[refer_label[i]] += 1
else:
if pred_label[i] not in fp:
fp[pred_label[i]] = 0
fp[pred_label[i]] += 1
if refer_label[i] not in fn:
fn[refer_label[i]] = 0
fn[refer_label[i]] += 1
precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
tp_total = sum(tp.values())
fn_total = sum(fn.values())
fp_total = sum(fp.values())
p_total = float(tp_total) / (tp_total + fp_total)
r_total = float(tp_total) / (tp_total + fn_total)
f_micro = 2 * p_total * r_total / (p_total + r_total)
return precision, recall, f1
return f_micro
def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
......@@ -48,7 +57,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/
labels = line[1].split("\x02")
total_label.append(labels)
total_label = [[label_map[j] for j in i] for i in total_label]
total_res = []
with open(res_dir, "r") as file:
cnt = 0
......@@ -72,7 +81,17 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/
cnt += 1
precision, recall, f1 = cal_chunk(total_res, total_label)
print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
total_res_equal = []
total_label_equal = []
assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
for i in range(len(total_label)):
num = len(total_label[i])
total_label_equal.extend(total_label[i])
total_res[i] = total_res[i][:num]
total_res_equal.extend(total_res[i])
f1 = cal_chunk(total_res_equal, total_label_equal)
print('data num: {}'.format(len(total_label)))
print("f1: {:.4f}".format(f1))
res_evaluate()