diff --git a/README.md b/README.md
index e1bd0e0ca79ed0a8be6fec793961b63593d3dfa4..8f54d2fb5864bae9db5e4fe6696db76505eb69d4 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,9 @@ You can easily re-produce following competitive results with minor codes, which
Dataset
|
- chnsenticorp |
- Quora Question Pairs matching |
- MSRA-NER (SIGHAN2006) |
+ chnsenticorp |
+ Quora Question Pairs matching |
+ MSRA-NER (SIGHAN2006) |
CMRC2018 |
@@ -42,31 +42,19 @@ You can easily re-produce following competitive results with minor codes, which
- precision
-
|
-
- recall
+ accuracy
|
f1-score
|
- precision
-
|
-
- recall
+ accuracy
|
f1-score
|
-
- precision
-
|
-
- recall
-
|
f1-score
@@ -79,13 +67,13 @@ You can easily re-produce following competitive results with minor codes, which
|
- |
+ |
test
|
-
+ |
test
|
-
+ |
test
|
@@ -94,15 +82,11 @@ You can easily re-produce following competitive results with minor codes, which
|
| ERNIE Base |
- 95.7 |
- 95.0 |
- 95.7 |
- 85.8 |
- 82.4 |
- 81.5 |
- 94.9 |
- 94.5 |
- 94.7 |
+ 95.8 |
+ 95.8 |
+ 86.2 |
+ 82.2 |
+ 99.2 |
64.3 |
85.2 |
diff --git a/README_zh.md b/README_zh.md
index bbffd7d7ad66af1e1b0fd4ff8da479786945a58f..678a41d44ba858bdccbbabab6a1732c2211481a4 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -30,9 +30,9 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
Dataset
|
- chnsenticorp |
- Quora Question Pairs matching |
- MSRA-NER (SIGHAN2006) |
+ chnsenticorp |
+ Quora Question Pairs matching |
+ MSRA-NER (SIGHAN2006) |
CMRC2018 |
@@ -42,31 +42,19 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
- precision
-
|
-
- recall
+ accuracy
|
f1-score
|
- precision
-
|
-
- recall
+ accuracy
|
f1-score
|
-
- precision
-
|
-
- recall
-
|
f1-score
@@ -79,13 +67,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
|
- |
+ |
test
|
-
+ |
test
|
-
+ |
test
|
@@ -94,17 +82,13 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
|
| ERNIE Base |
- 95.7 |
- 95.0 |
- 95.7 |
- 85.8 |
- 82.4 |
- 81.5 |
- 94.9 |
- 94.5 |
- 94.7 |
- 96.3 |
- 84.0 |
+ 95.8 |
+ 95.8 |
+ 86.2 |
+ 82.2 |
+ 99.2 |
+ 64.3 |
+ 85.2 |
@@ -121,6 +105,16 @@ PaddlePALM (PArallel Learning from Multi-tasks) 是一个灵活,通用且易
+
+PaddlePALM is a well-designed high-level NLP framework. With only a few lines of code on top of PaddlePALM, you can efficiently implement **supervised learning, unsupervised/self-supervised learning, multi-task learning, and transfer learning**. The PaddlePALM architecture has three layers, from bottom to top: the component layer, the trainer layer, and the high-level trainer layer.
+
+At the component layer, PaddlePALM provides six **decoupled** components for building NLP tasks. Each component contains a rich set of `pre-defined` classes and one `Base` class. The `pre-defined` classes target typical NLP tasks, while the `Base` class helps users develop new classes (either by extending a `pre-defined` class or starting from the `Base` class).
+
+The trainer layer builds the computation graph from the selected components and performs training and prediction. This layer defines the training strategy, model saving and loading, and the evaluation and prediction procedures. A single trainer handles only one task.
+
+The high-level trainer layer is used for complex learning and inference strategies such as multi-task learning. You can add auxiliary tasks to train more robust NLP models (improving performance on the test set and on out-of-domain data), or jointly train multiple related tasks to obtain higher performance on each of them.
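+
+The snippet below is a minimal, illustrative walk-through of the three layers for a single classification task. It is only a sketch: the class names (`ClassifyReader`, `ERNIE`, `Classify`, `Trainer`, `MultiHeadTrainer`) follow the quick-start and examples referenced below, and the paths, arguments, and exact signatures may differ in your installed version.
+
+```python
+import json
+import paddlepalm as palm
+
+# --- Component layer: reader + backbone + head (decoupled, reusable pieces) ---
+config = json.load(open('pretrain/ERNIE-v1-zh-base/ernie_config.json'))   # placeholder path
+reader = palm.reader.ClassifyReader('pretrain/ERNIE-v1-zh-base/vocab.txt', max_seqlen=128)
+ernie = palm.backbone.ERNIE.from_config(config)
+head = palm.head.Classify(num_classes=2, input_dim=768)
+
+# --- Trainer layer: one trainer drives exactly one task ---
+trainer = palm.Trainer('senti_cls')
+loss_var = trainer.build_forward(ernie, head)
+
+# --- High-level trainer layer: combine several trainers for multi-task learning ---
+# e.g. palm.MultiHeadTrainer([trainer_a, trainer_b])
+```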
+
+
| Module | Description |
| - | - |
| **paddlepalm** | An open-source NLP pre-training and multi-task learning framework, built on the PaddlePaddle framework. |
@@ -187,6 +181,8 @@ Available pretrain items:
## Usage
+#### Quick Start
+
A typical NLP training task can be started in 8 steps.
1. Use `paddlepalm.reader` to create a `reader` for dataset loading and input feature generation, then call the `reader.load_data` method to load the training data (see the sketch below).
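+
+   For example, step 1 for a classification task might look like the following sketch; the paths and arguments are placeholders borrowed from the classification example and may need to be adapted:
+
+   ```python
+   import paddlepalm as palm
+
+   # Step 1 (illustrative): create a reader and load the training data.
+   cls_reader = palm.reader.ClassifyReader('pretrain/ERNIE-v1-zh-base/vocab.txt', max_seqlen=128)
+   cls_reader.load_data('data/train.tsv', batch_size=32, num_epochs=3)
+   print(cls_reader.num_examples)   # number of training examples that were loaded
+   ```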
@@ -205,14 +201,8 @@ Available pretrain items:
- [Tagging](https://github.com/PaddlePaddle/PALM/tree/master/examples/tagging)
- [SQuAD machine Reading Comprehension](https://github.com/PaddlePaddle/PALM/tree/master/examples/mrc).
-### Setting up the saver
-
-To save models/checkpoints and logs during training, call the `trainer.set_saver` method. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
-
-### Prediction
-To run prediction and evaluation after training, simply create additional reader, backbone, and head instances (repeating steps 1-4 above), making sure to set `phase='predict'` when creating them. Then call the trainer's `predict` method (no extra trainer is needed). See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
-### Multi-task Learning
+#### Multi-task Learning
Run in multi-task learning mode:
@@ -226,11 +216,31 @@ Available pretrain items:
Saving/loading and prediction with multi_head_trainer work in the same way as with trainer.
-For more details on implementing `multi_head_trainer`, please see
+For more details on implementing `multi_head_trainer`, please see
- [ATIS: joint training of dialogue intent recognition and slot filling](https://github.com/PaddlePaddle/PALM/tree/master/examples/multi-task)
- [MRQA: learning reading comprehension auxilarized with mask language model]() (omitted for the initial release)
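+
+As a rough, illustrative sketch (the `MultiHeadTrainer` name and the `mix_ratio` argument follow the ATIS multi-task example above and may differ in your version), joint training wraps one trainer per task in a high-level trainer:
+
+```python
+import paddlepalm as palm
+
+# Illustrative only: one palm.Trainer per task, combined by the high-level trainer.
+# `mix_ratio` controls how often each task is sampled during joint training.
+trainer_intent = palm.Trainer('intent', mix_ratio=1.0)
+trainer_slot = palm.Trainer('slot', mix_ratio=0.5)
+mh_trainer = palm.MultiHeadTrainer([trainer_intent, trainer_slot])
+# build_forward / fit readers / train are then called on mh_trainer instead of on
+# the individual trainers; see the ATIS example above for the complete script.
+```
+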
+#### Setting up the saver
+
+To save models/checkpoints and logs during training, call the `trainer.set_saver` method. See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples) for more implementation details.
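+
+For example (a sketch only; the argument names below follow the scripts in the examples directory and may vary across versions):
+
+```python
+# Illustrative: save a checkpoint every 1000 steps under ./outputs/ckpt.
+# `trainer` is the palm.Trainer created in the quick-start steps above.
+trainer.set_saver(save_path='./outputs/ckpt', save_steps=1000, save_type='ckpt')
+```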
+
+#### Evaluation / Prediction
+To run prediction and evaluation after training, simply create additional reader, backbone, and head instances (repeating steps 1-4 above), making sure to set `phase='predict'` when creating them. Then call the trainer's `predict` method (no extra trainer is needed). See [here](https://github.com/PaddlePaddle/PALM/tree/master/examples/predict) for more implementation details.
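+
+A rough sketch of this flow is shown below; the class names mirror the quick-start, `vocab_path`, `config`, and `trainer` come from the training setup, and the exact arguments are assumptions that may differ from the predict example:
+
+```python
+import paddlepalm as palm
+
+# Illustrative: re-create reader/backbone/head with phase='predict', then predict.
+predict_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen=128, phase='predict')
+predict_reader.load_data('data/test.tsv', batch_size=32)
+pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
+predict_reader.register_with(pred_ernie)
+pred_head = palm.head.Classify(num_classes=2, input_dim=768, phase='predict')
+
+trainer.build_predict_forward(pred_ernie, pred_head)
+trainer.load_ckpt('./outputs/ckpt')            # checkpoint saved by set_saver
+trainer.fit_reader(predict_reader, phase='predict')
+trainer.predict(print_steps=20)
+```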
+
+#### Using multiple GPUs
+If there are multiple GPUs in your environment, you can control how many of them are used, and which ones, via the environment variable [CUDA_VISIBLE_DEVICES](https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/). For example, if there are 4 GPUs in your environment, indexed 0, 1, 2, 3, you can run the following command to use only GPU 2:
+
+```shell
+CUDA_VISIBLE_DEVICES=2 python run.py
+```
+
+To use multiple GPUs, separate the indices with `,`. For example, to use GPU 2 and GPU 3, run the following command:
+
+```shell
+CUDA_VISIBLE_DEVICES=2,3 python run.py
+```
+
+In multi-GPU mode, PaddlePALM automatically splits each batch of data across the available cards. For example, if `batch_size` is set to 64 and 4 GPUs are available to PaddlePALM, the effective batch_size on each GPU is 64/4=16. Therefore, **when using multiple GPUs, make sure the batch_size you set is divisible by the number of cards**.
## License
diff --git a/examples/classification/README.md b/examples/classification/README.md
index b1ed1b1074e30d2d1a205d014794354feb88476f..4ac05170078c858a2399e9659cd1145e76920b93 100644
--- a/examples/classification/README.md
+++ b/examples/classification/README.md
@@ -75,5 +75,5 @@ The evaluation results are as follows:
```
data num: 1200
-precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
+accuracy: 0.9575, precision: 0.9634, recall: 0.9523, f1: 0.9578
```
diff --git a/examples/classification/evaluate.py b/examples/classification/evaluate.py
index f7949a93cf57a20e0477025099dbba6ae6a56050..4b1b0d39e7a8b92fdd993d51d9564deb58b31fa3 100644
--- a/examples/classification/evaluate.py
+++ b/examples/classification/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
-def f1(preds, labels):
- preds = np.array(preds)
- labels = np.array(labels)
- tp = np.sum((labels == '1') & (preds == '1'))
- tn = np.sum((labels == '0') & (preds == '0'))
- fp = np.sum((labels == '0') & (preds == '1'))
- fn = np.sum((labels == '1') & (preds == '0'))
- p = tp * 1.0 / (tp + fp)
- r = tp * 1.0 / (tp + fn) * 1.0
- f1 = (2 * p * r) / (p + r + 1e-8)
- return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
+ fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
- re = tp * 1.0 / (tp + fn)
- return re
+ r = tp * 1.0 / (tp + fn)
+ # Precision=TP/(TP+FP)
+ p = tp * 1.0 / (tp + fp)
+ epsilon = 1e-31
+ f1 = 2 * p * r / (p+r+epsilon)
+ return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
- print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+ p, r, f1 = pre_recall_f1(preds, labels)
+ print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
diff --git a/examples/matching/README.md b/examples/matching/README.md
index 3cb10327c92658b49c3dd8cd4f160571c574b16b..aecb97f405353db0efdee234c023dc33a11de9c7 100644
--- a/examples/matching/README.md
+++ b/examples/matching/README.md
@@ -81,6 +81,6 @@ python evaluate.py
The evaluation results are as follows:
```
-data_num: 4300
-precision: 0.857906976744, recall: 0.824249846908, f1: 0.81501664653
+data num: 4300
+accuracy: 0.8619, precision: 0.8061, recall: 0.8377, f1: 0.8216
```
diff --git a/examples/matching/evaluate.py b/examples/matching/evaluate.py
index 5ea6da3cee00f46508475597499584ad68f07858..385a52e74cd90f9094e5781df48a21e78bd8d273 100644
--- a/examples/matching/evaluate.py
+++ b/examples/matching/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
-def f1(preds, labels):
- preds = np.array(preds)
- labels = np.array(labels)
- tp = np.sum((labels == '1') & (preds == '1'))
- tn = np.sum((labels == '0') & (preds == '0'))
- fp = np.sum((labels == '0') & (preds == '1'))
- fn = np.sum((labels == '1') & (preds == '0'))
- p = tp * 1.0 / (tp + fp)
- r = tp * 1.0 / (tp + fn) * 1.0
- f1 = (2 * p * r) / (p + r + 1e-8)
- return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
+ fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
- re = tp * 1.0 / (tp + fn)
- return re
+ r = tp * 1.0 / (tp + fn)
+ # Precision=TP/(TP+FP)
+ p = tp * 1.0 / (tp + fp)
+ epsilon = 1e-31
+ f1 = 2 * p * r / (p+r+epsilon)
+ return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result({}) doesn't match to labels({})".format(len(preds),len(labels))
print('data num: {}'.format(len(labels)))
- print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+ p, r, f1 = pre_recall_f1(preds, labels)
+ print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
diff --git a/examples/mrc/README.md b/examples/mrc/README.md
index 9d40fd0effbd278a7824aa25f716402eafc5c95c..6d01a3563d9398dfd32fe18280da6d83edbc9a4f 100644
--- a/examples/mrc/README.md
+++ b/examples/mrc/README.md
@@ -94,5 +94,5 @@ The evaluation results are as follows:
```
data_num: 3219
-em_sroce: 64.3367505436, f1: 85.1781896843
+em_score: 0.6434, f1: 0.8518
```
diff --git a/examples/mrc/evaluate.py b/examples/mrc/evaluate.py
index a1bc874a6aa5b80ed4fcb705507abb4c389fb793..bd6ba5fa8c78ea98be358c48cbdc3941d1f3fd59 100644
--- a/examples/mrc/evaluate.py
+++ b/examples/mrc/evaluate.py
@@ -121,8 +121,8 @@ def evaluate(ground_truth_file, prediction_file):
f1 += calc_f1_score(answers, prediction)
em += calc_em_score(answers, prediction)
- f1_score = 100.0 * f1 / total_count
- em_score = 100.0 * em / total_count
+ f1_score = f1 / total_count
+ em_score = em / total_count
return f1_score, em_score, total_count, skip_count
@@ -164,4 +164,4 @@ def eval_file(dataset_file, prediction_file):
if __name__ == '__main__':
EM, F1, AVG, TOTAL = eval_file("data/dev.json", "outputs/predict/predictions.json")
print('data_num: {}'.format(TOTAL))
- print('em_sroce: {}, f1: {}'.format(EM,F1))
+ print('em_score: {:.4f}, f1: {:.4f}'.format(EM, F1))
diff --git a/examples/multi-task/README.md b/examples/multi-task/README.md
index 45c9ea40b32fb24edfa48fd2491419a8958d9bba..63038ab0fd5368a4c183b6e0a461d021e9a72e7a 100644
--- a/examples/multi-task/README.md
+++ b/examples/multi-task/README.md
@@ -118,11 +118,12 @@ The evaluation results are as follows:
`atis_slot`:
```
-precision: 0.894397728514, recall: 0.894104803493, f1: 0.894251242016
+data num: 891
+f1: 0.8934
```
`atis_intent`:
```
data num: 893
-precision: 0.708846584546, recall: 1.0, f1: 0.999999995
+accuracy: 0.7088, precision: 1.0000, recall: 1.0000, f1: 1.0000
```
diff --git a/examples/multi-task/evaluate-intent.py b/examples/multi-task/evaluate-intent.py
index 5d2feb5e9f6c8344f29a14c3bfbd7b12240cbad8..49e635e7b674b16a874fe29f8e8adb6b8a58fb4b 100644
--- a/examples/multi-task/evaluate-intent.py
+++ b/examples/multi-task/evaluate-intent.py
@@ -7,27 +7,20 @@ def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
-
-def f1(preds, labels):
- preds = np.array(preds)
- labels = np.array(labels)
- tp = np.sum((labels == '1') & (preds == '1'))
- tn = np.sum((labels == '0') & (preds == '0'))
- fp = np.sum((labels == '0') & (preds == '1'))
- fn = np.sum((labels == '1') & (preds == '0'))
- p = tp * 1.0 / (tp + fp) * 1.0
- r = tp * 1.0 / (tp + fn) * 1.0
- f1 = (2 * p * r) / (p + r + 1e-8)
- return f1
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
+ fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
- re = tp * 1.0 / (tp + fn)
- return re
+ r = tp * 1.0 / (tp + fn)
+ # Precision=TP/(TP+FP)
+ p = tp * 1.0 / (tp + fp)
+ epsilon = 1e-31
+ f1 = 2 * p * r / (p+r+epsilon)
+ return p, r, f1
def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase='test'):
@@ -59,6 +52,7 @@ def res_evaluate(res_dir="./outputs/predict-intent/predictions.json", eval_phase
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
- print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+ p, r, f1 = pre_recall_f1(preds, labels)
+ print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
diff --git a/examples/multi-task/evaluate-slot.py b/examples/multi-task/evaluate-slot.py
index d598e96b5f35abd03696ccb1870b94e6eb89a57c..20c83be49668b8edeaed9da9f0c3876d04b4b436 100644
--- a/examples/multi-task/evaluate-slot.py
+++ b/examples/multi-task/evaluate-slot.py
@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/atis/atis_slot/label_map.json"):
return json.load(open(map_dir, "r"))
-def cal_chunk(total_res, total_label):
- assert len(total_label) == len(total_res), "prediction result doesn't match to labels, {}, {}".format(len(total_res),len(total_label))
- num_labels = 0
- num_corr = 0
- num_infers = 0
- for res, label in zip(total_res, total_label):
- assert len(res) == len(label), "prediction result doesn't match to labels, {}, {}".format(len(res),len(label))
- num_labels += sum([0 if i == 6 else 1 for i in label])
- num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
- num_infers += sum([0 if i == 6 else 1 for i in res])
+def cal_chunk(pred_label, refer_label):
+    # Token-level micro-averaged F1: count per-label tp/fp/fn by comparing the
+    # predicted and reference label at each position (no chunk decoding is done).
+ tp = dict()
+ fn = dict()
+ fp = dict()
+ for i in range(len(refer_label)):
+ if refer_label[i] == pred_label[i]:
+ if refer_label[i] not in tp:
+ tp[refer_label[i]] = 0
+ tp[refer_label[i]] += 1
+ else:
+ if pred_label[i] not in fp:
+ fp[pred_label[i]] = 0
+ fp[pred_label[i]] += 1
+ if refer_label[i] not in fn:
+ fn[refer_label[i]] = 0
+ fn[refer_label[i]] += 1
- precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
- recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
- f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
+ tp_total = sum(tp.values())
+ fn_total = sum(fn.values())
+ fp_total = sum(fp.values())
+ p_total = float(tp_total) / (tp_total + fp_total)
+ r_total = float(tp_total) / (tp_total + fn_total)
+ f_micro = 2 * p_total * r_total / (p_total + r_total)
- return precision, recall, f1
+ return f_micro
def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./data/atis/atis_slot/test.tsv"):
@@ -72,7 +81,18 @@ def res_evaluate(res_dir="./outputs/predict-slot/predictions.json", data_dir="./
cnt += 1
- precision, recall, f1 = cal_chunk(total_res, total_label)
- print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
+ total_res_equal = []
+ total_label_equal = []
+ assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
+ for i in range(len(total_label)):
+ num = len(total_label[i])
+ total_label_equal.extend(total_label[i])
+ total_res[i] = total_res[i][:num]
+ total_res_equal.extend(total_res[i])
+
+ f1 = cal_chunk(total_res_equal, total_label_equal)
+ print('data num: {}'.format(len(total_label)))
+ print("f1: {:.4f}".format(f1))
+
res_evaluate()
diff --git a/examples/predict/README.md b/examples/predict/README.md
index 49d05129c0e2f9583d22c0c3348dc54b6f18b888..19743f09642f68f8dd0bb118d91d5e1812d6cc95 100644
--- a/examples/predict/README.md
+++ b/examples/predict/README.md
@@ -44,5 +44,5 @@ The evaluation results are as follows:
```
data num: 1200
-precision: 0.494166666667, recall: 0.0444078947368, f1: 0.0816944009455
+accuracy: 0.4758, precision: 0.4730, recall: 0.3026, f1: 0.3691
```
diff --git a/examples/predict/evaluate.py b/examples/predict/evaluate.py
index f7949a93cf57a20e0477025099dbba6ae6a56050..4b1b0d39e7a8b92fdd993d51d9564deb58b31fa3 100644
--- a/examples/predict/evaluate.py
+++ b/examples/predict/evaluate.py
@@ -8,26 +8,19 @@ def accuracy(preds, labels):
labels = np.array(labels)
return (preds == labels).mean()
-def f1(preds, labels):
- preds = np.array(preds)
- labels = np.array(labels)
- tp = np.sum((labels == '1') & (preds == '1'))
- tn = np.sum((labels == '0') & (preds == '0'))
- fp = np.sum((labels == '0') & (preds == '1'))
- fn = np.sum((labels == '1') & (preds == '0'))
- p = tp * 1.0 / (tp + fp)
- r = tp * 1.0 / (tp + fn) * 1.0
- f1 = (2 * p * r) / (p + r + 1e-8)
- return f1
-
-def recall(preds, labels):
+def pre_recall_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
+ fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
- re = tp * 1.0 / (tp + fn)
- return re
+ r = tp * 1.0 / (tp + fn)
+ # Precision=TP/(TP+FP)
+ p = tp * 1.0 / (tp + fp)
+ epsilon = 1e-31
+ f1 = 2 * p * r / (p+r+epsilon)
+ return p, r, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
@@ -58,6 +51,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
- print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
+ p, r, f1 = pre_recall_f1(preds, labels)
+ print("accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, f1: {:.4f}".format(accuracy(preds, labels), p, r, f1))
res_evaluate()
diff --git a/examples/tagging/README.md b/examples/tagging/README.md
index f58fd3b42f0c6fb73b6eced1529631419ab766cb..465e611ea1396f125b4547391a7a68e073a6930e 100644
--- a/examples/tagging/README.md
+++ b/examples/tagging/README.md
@@ -74,5 +74,6 @@ python evaluate.py
The evaluation results are as follows:
```
-precision: 0.948718989809, recall: 0.944806113784, f1: 0.946758508914
+data num: 4636
+f1: 0.9918
```
diff --git a/examples/tagging/evaluate.py b/examples/tagging/evaluate.py
index db20b919a0e3c819b6cc1768249dccf5c014cbe6..d7812eaa97511b572874ee5aaa582a5e5cb10866 100644
--- a/examples/tagging/evaluate.py
+++ b/examples/tagging/evaluate.py
@@ -11,22 +11,31 @@ def load_label_map(map_dir="./data/label_map.json"):
return json.load(open(map_dir, "r"))
-def cal_chunk(total_res, total_label):
- assert len(total_label) == len(total_res), 'prediction result doesn\'t match to labels'
- num_labels = 0
- num_corr = 0
- num_infers = 0
- for res, label in zip(total_res, total_label):
- assert len(res) == len(label), "prediction result doesn\'t match to labels"
- num_labels += sum([0 if i == 6 else 1 for i in label])
- num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
- num_infers += sum([0 if i == 6 else 1 for i in res])
+def cal_chunk(pred_label, refer_label):
+    # Token-level micro-averaged F1: count per-label tp/fp/fn by comparing the
+    # predicted and reference label at each position (no chunk decoding is done).
+ tp = dict()
+ fn = dict()
+ fp = dict()
+ for i in range(len(refer_label)):
+ if refer_label[i] == pred_label[i]:
+ if refer_label[i] not in tp:
+ tp[refer_label[i]] = 0
+ tp[refer_label[i]] += 1
+ else:
+ if pred_label[i] not in fp:
+ fp[pred_label[i]] = 0
+ fp[pred_label[i]] += 1
+ if refer_label[i] not in fn:
+ fn[refer_label[i]] = 0
+ fn[refer_label[i]] += 1
- precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
- recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
- f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
+ tp_total = sum(tp.values())
+ fn_total = sum(fn.values())
+ fp_total = sum(fp.values())
+ p_total = float(tp_total) / (tp_total + fp_total)
+ r_total = float(tp_total) / (tp_total + fn_total)
+ f_micro = 2 * p_total * r_total / (p_total + r_total)
- return precision, recall, f1
+ return f_micro
def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
@@ -48,7 +57,7 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/
labels = line[1].split("\x02")
total_label.append(labels)
total_label = [[label_map[j] for j in i] for i in total_label]
-
+
total_res = []
with open(res_dir, "r") as file:
cnt = 0
@@ -72,7 +81,17 @@ def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/
cnt += 1
- precision, recall, f1 = cal_chunk(total_res, total_label)
- print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
+ total_res_equal = []
+ total_label_equal = []
+ assert len(total_label) == len(total_res), "prediction result doesn't match to labels"
+ for i in range(len(total_label)):
+ num = len(total_label[i])
+ total_label_equal.extend(total_label[i])
+ total_res[i] = total_res[i][:num]
+ total_res_equal.extend(total_res[i])
+
+ f1 = cal_chunk(total_res_equal, total_label_equal)
+ print('data num: {}'.format(len(total_label)))
+ print("f1: {:.4f}".format(f1))
res_evaluate()