Commit e19f4bc7
Authored Oct 08, 2019 by Dilyar; committed Oct 08, 2019 by Yibing Liu
Fix some problems of simnet (#3433)
* update
* update
* Update README.md
* Update run.sh
Parent: 107d4e79
Showing 11 changed files with 426 additions and 333 deletions (+426 / -333).
PaddleNLP/models/matching/bow.py (+2, -2)
PaddleNLP/models/matching/cnn.py (+9, -9)
PaddleNLP/models/matching/gru.py (+7, -7)
PaddleNLP/models/matching/lstm.py (+7, -7)
PaddleNLP/similarity_net/README.md (+16, -18)
PaddleNLP/similarity_net/config.py (+2, -4)
PaddleNLP/similarity_net/download_data.sh (+5, -0)
PaddleNLP/similarity_net/download_pretrained_model.sh (+2, -8)
PaddleNLP/similarity_net/run.sh (+6, -5)
PaddleNLP/similarity_net/run_classifier.py (+312, -264)
PaddleNLP/similarity_net/utils.py (+58, -9)
PaddleNLP/models/matching/bow.py

@@ -49,7 +49,7 @@ class BOW(object):
         right_soft = softsign_layer.ops(right_pool)
         # matching layer
         if self.task_mode == "pairwise":
-            bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc")
+            bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
             left_bow = bow_layer.ops(left_soft)
             right_bow = bow_layer.ops(right_soft)
             cos_sim_layer = layers.CosSimLayer()
@@ -58,7 +58,7 @@ class BOW(object):
         else:
             concat_layer = layers.ConcatLayer(1)
             concat = concat_layer.ops([left_soft, right_soft])
-            bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc")
+            bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
             concat_fc = bow_layer.ops(concat)
             softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
             pred = softmax_layer.ops(concat_fc)
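The change above (repeated in cnn.py, gru.py, and lstm.py below) drops the "relu" activation from the FC layer that feeds CosSimLayer. The commit message does not say why; a plausible reading is that cosine similarity over ReLU outputs is trapped in [0, 1], since both feature vectors are non-negative, while a linear projection lets the score use the full [-1, 1] range. A quick numpy check of that property:

```python
# Assumption: illustrating a plausible motivation for FCLayer(..., None, ...);
# the commit itself does not state one. Pure numpy, independent of Paddle.
import numpy as np

def cos(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

rng = np.random.default_rng(0)
a, b = rng.normal(size=64), rng.normal(size=64)
print(cos(a, b))                                 # linear features: anywhere in [-1, 1]
print(cos(np.maximum(a, 0), np.maximum(b, 0)))   # ReLU'd features: always in [0, 1]
```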
PaddleNLP/models/matching/cnn.py

@@ -43,23 +43,23 @@ class CNN(object):
         left_emb = emb_layer.ops(left)
         right_emb = emb_layer.ops(right)
         # Presentation context
-        cnn_layer = layers.SequenceConvPoolLayer(self.filter_size,
-                                                 self.num_filters, "conv")
+        cnn_layer = layers.SequenceConvPoolLayer(
+            self.filter_size, self.num_filters, "conv")
         left_cnn = cnn_layer.ops(left_emb)
         right_cnn = cnn_layer.ops(right_emb)
         # matching layer
         if self.task_mode == "pairwise":
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            left_relu = relu_layer.ops(left_cnn)
-            right_relu = relu_layer.ops(right_cnn)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            left_fc = fc_layer.ops(left_cnn)
+            right_fc = fc_layer.ops(right_cnn)
             cos_sim_layer = layers.CosSimLayer()
-            pred = cos_sim_layer.ops(left_relu, right_relu)
-            return left_relu, pred
+            pred = cos_sim_layer.ops(left_fc, right_fc)
+            return left_fc, pred
         else:
             concat_layer = layers.ConcatLayer(1)
             concat = concat_layer.ops([left_cnn, right_cnn])
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            concat_fc = relu_layer.ops(concat)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            concat_fc = fc_layer.ops(concat)
             softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
             pred = softmax_layer.ops(concat_fc)
             return left_cnn, pred
PaddleNLP/models/matching/gru.py

@@ -50,17 +50,17 @@ class GRU(object):
         right_last = last_layer.ops(right_gru)
         # matching layer
         if self.task_mode == "pairwise":
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            left_relu = relu_layer.ops(left_last)
-            right_relu = relu_layer.ops(right_last)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            left_fc = fc_layer.ops(left_last)
+            right_fc = fc_layer.ops(right_last)
             cos_sim_layer = layers.CosSimLayer()
-            pred = cos_sim_layer.ops(left_relu, right_relu)
-            return left_relu, pred
+            pred = cos_sim_layer.ops(left_fc, right_fc)
+            return left_fc, pred
         else:
             concat_layer = layers.ConcatLayer(1)
             concat = concat_layer.ops([left_last, right_last])
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            concat_fc = relu_layer.ops(concat)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            concat_fc = fc_layer.ops(concat)
             softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
             pred = softmax_layer.ops(concat_fc)
             return left_last, pred
PaddleNLP/models/matching/lstm.py

@@ -49,17 +49,17 @@ class LSTM(object):
         right_last = last_layer.ops(right_lstm)
         # matching layer
         if self.task_mode == "pairwise":
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            left_relu = relu_layer.ops(left_last)
-            right_relu = relu_layer.ops(right_last)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            left_fc = fc_layer.ops(left_last)
+            right_fc = fc_layer.ops(right_last)
             cos_sim_layer = layers.CosSimLayer()
-            pred = cos_sim_layer.ops(left_relu, right_relu)
-            return left_relu, pred
+            pred = cos_sim_layer.ops(left_fc, right_fc)
+            return left_fc, pred
         else:
             concat_layer = layers.ConcatLayer(1)
             concat = concat_layer.ops([left_last, right_last])
-            relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
-            concat_fc = relu_layer.ops(concat)
+            fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
+            concat_fc = fc_layer.ops(concat)
             softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
             pred = softmax_layer.ops(concat_fc)
             return left_last, pred
PaddleNLP/similarity_net/README.md

@@ -6,10 +6,17 @@
 Trained on Baidu's large-scale search data, the SimNet-BOW-Pairwise semantic matching model improves AUC by more than 5% over literal-similarity baselines in real FAQ scenarios. We evaluated it on Baidu's in-house test sets (chat, customer service, etc.) and on the LCQMC semantic matching dataset; results are shown below. LCQMC is scored by Accuracy, and since the pairwise model outputs a similarity score, we use 0.958 as the classification threshold: our model reaches 0.7532 accuracy versus 0.737 for a CBOW baseline of comparable network complexity.
-| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM | LCQMC |
-|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|
-| | AUC | AUC | AUC | PNR | Accuracy |
-|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|0.7532|
+| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM |
+|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
+| | AUC | AUC | AUC | PNR |
+|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|
+#### Test set description
+| Dataset | Source | Domain |
+|:-----------:|:-------------:|:-------------:|
+| Baidu Zhidao | Baidu Zhidao questions | everyday |
+| ECOM | commercial queries | finance |
+| QQSIM | chit-chat dialogue | everyday |
+| UNICOM | Unicom customer service | customer service |
 ## Quick start
 #### Version requirements
 This project depends on PaddlePaddle Fluid 1.3.1; see the [installation guide](http://www.paddlepaddle.org/#quick-start).

@@ -24,24 +31,14 @@ cd models/PaddleNLP/similarity_net
 #### Data preparation
 Download the preprocessed data. After the command finishes, the data directory holds sample training, dev, and test sets plus the word-index dictionary (term2id.dict).
 ```shell
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
+sh download_data.sh
 ```
 #### Model preparation
-We open-sourced a ```pairwise``` model (BOW-based) trained on large-scale data. Two download options are provided; the model is saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
-##### Option 1: via the PaddleHub command line (PaddleHub [installation](https://github.com/PaddlePaddle/PaddleHub))
-```shell
-mkdir model_files
-hub download simnet_bow_pairwise --output_path ./
-tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
-```
-##### Option 2: direct download
+We open-sourced a ```pairwise``` model (BOW-based) trained on large-scale data. Download it by running the command below; the model is saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
 ```shell
-mkdir model_files
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz
-tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
+sh download_pretrained_model.sh
 ```
 #### Evaluation
 We released our in-house test sets (Baidu Zhidao, ECOM, QQSIM, UNICOM). With the pretrained model above, enter the evaluate directory and run the commands below to get test-set results.
 ```shell

@@ -162,6 +159,7 @@ python run_classifier.py \
 --task_mode ${TASK_MODE} # training mode, pairwise or pointwise, matching the config file
 --compute_accuracy False \  # whether to compute accuracy
 --lamda 0.91 \  # threshold for computing accuracy in pairwise mode
+--init_checkpoint "" # path of a model to preload
 ```
 ### Building your own model
 Users can assemble a custom model to fit their needs, as follows:
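run.sh below sets --lamda 0.958, the same threshold the README quotes for LCQMC. The accuracy helper itself lives in utils and is outside this diff, but run_classifier.py shows the score pipeline: cosine output in [-1, 1] is mapped to [0, 1] via (score + 1) / 2 and then thresholded. A small numpy sketch of that decision rule:

```python
# Sketch of the pairwise decision rule implied by run_classifier.py's
# (item[0] + 1) / 2 mapping plus the --lamda threshold; the actual accuracy
# computation in utils is not part of this diff.
import numpy as np

cos_sim = np.array([0.99, 0.80, 0.93])   # raw CosSimLayer outputs in [-1, 1]
probs = (cos_sim + 1) / 2                # [0.995, 0.9, 0.965], as written to predictions.txt
labels = (probs > 0.958).astype(int)     # lamda=0.958 -> [1, 0, 1]
print(probs, labels)
```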
PaddleNLP/similarity_net/config.py

@@ -34,14 +34,12 @@ class SimNetConfig(object):
         with open(config_path) as json_file:
             config_dict = json.load(json_file)
     except Exception:
         raise IOError("Error in parsing simnet model config file '%s'" %
                       config_path)
     else:
         if config_dict["task_mode"] != self.task_mode:
             raise ValueError(
                 "the config '{}' does not match the task_mode '{}'".format(
                     self.config_path, self.task_mode))
     return config_dict

 def __getitem__(self, key):

(The remaining changed lines in this hunk appear to be whitespace or wrapping only in this extraction.)
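SimNetConfig only validates task_mode, but other code in this commit reads more keys: conf_dict["net"]["module_name"] and ["class_name"] in run_classifier.py, plus conf_dict["model_path"] and an injected dict_size. A hypothetical skeleton inferred from just those reads; the real JSON files in the repo carry further fields not visible in this diff:

```python
# Hypothetical config skeleton, inferred only from the keys read in this commit;
# the actual config files under similarity_net ship additional settings.
example_conf = {
    "task_mode": "pairwise",       # must equal --task_mode or SimNetConfig raises
    "model_path": "bow_pairwise",  # checkpoint subdirectory under --output_dir
    "net": {
        "module_name": "bow",      # module resolved under ../models/matching
        "class_name": "BOW",       # class instantiated via utils.import_class
    },
}
```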
PaddleNLP/similarity_net/download_data.sh (new file, mode 100644)

+#get data
+wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
+tar xzf simnet_dataset-1.0.0.tar.gz
+rm simnet_dataset-1.0.0.tar.gz
PaddleNLP/similarity_net/download.sh → PaddleNLP/similarity_net/download_pretrained_model.sh (renamed)

@@ -4,13 +4,7 @@ model_files_path="./model_files"
 #get pretrained_bow_pairwise_model
 wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz
 if [ ! -d $model_files_path ]; then
     mkdir $model_files_path
 fi
 tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path
 rm simnet_bow-pairwise-1.0.0.tar.gz
-#get data
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
\ No newline at end of file
PaddleNLP/similarity_net/run.sh

@@ -21,7 +21,7 @@ INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
 train() {
     python run_classifier.py \
         --task_name ${TASK_NAME} \
-        --use_cuda false \
+        --use_cuda False \
         --do_train True \
         --do_valid True \
         --do_test True \
@@ -34,12 +34,13 @@ train() {
         --output_dir ${CKPT_PATH} \
         --config_path ${CONFIG_PATH} \
         --vocab_path ${VOCAB_PATH} \
-        --epoch 10 \
-        --save_steps 1000 \
-        --validation_steps 100 \
+        --epoch 40 \
+        --save_steps 2000 \
+        --validation_steps 200 \
         --compute_accuracy False \
         --lamda 0.958 \
-        --task_mode ${TASK_MODE}
+        --task_mode ${TASK_MODE} \
+        --init_checkpoint ""
 }
 #run_evaluate
 evaluate() {
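One of the quieter fixes here is --use_cuda false becoming --use_cuda False. How ArgumentGroup parses booleans is not visible in this diff, so the exact failure mode is an assumption, but the classic argparse pitfall is worth noting: with a naive type=bool, every non-empty string, including "false", parses as True.

```python
# Illustration of the argparse boolean pitfall (assumption: this motivated the
# false -> False normalization; the repo's ArgumentGroup internals are not shown).
import argparse

def str2bool(v):
    # explicit parser: accepts true/false in any casing
    return v.lower() in ("true", "t", "1")

p = argparse.ArgumentParser()
p.add_argument("--naive_flag", type=bool, default=False)     # bug-prone
p.add_argument("--use_cuda", type=str2bool, default=False)   # safe

args = p.parse_args(["--naive_flag", "false", "--use_cuda", "False"])
print(args.naive_flag)  # True  -- bool("false") is truthy
print(args.use_cuda)    # False
```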
PaddleNLP/similarity_net/run_classifier.py

@@ -25,76 +25,67 @@ import argparse
-(removed: the module-scope argparse setup — parser = argparse.ArgumentParser(__doc__)
- plus the model/training/logging/data/run_type ArgumentGroup definitions and
- args = parser.parse_args(); this configuration moved into utils.ArgConfig)
 import multiprocessing
 import sys
+
+defaultencoding = 'utf-8'
+if sys.getdefaultencoding() != defaultencoding:
+    reload(sys)
+    sys.setdefaultencoding(defaultencoding)
+
 sys.path.append("..")

 import paddle
 import paddle.fluid as fluid
 import numpy as np
 import config
 import utils
 import reader
 import models.matching.paddle_layers as layers
 import codecs
+from utils import ArgConfig
 import logging
+
+
+def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
+    """
+    Create Model for simnet
+    """
+    if is_inference:
+        inf_pyreader = fluid.layers.py_reader(
+            capacity=16,
+            shapes=([-1, 1], [-1, 1]),
+            dtypes=('int64', 'int64'),
+            lod_levels=(1, 1),
+            name=pyreader_name,
+            use_double_buffer=False)
+        left, pos_right = fluid.layers.read_file(inf_pyreader)
+        return inf_pyreader, left, pos_right
+    else:
+        if is_pointwise:
+            pointwise_pyreader = fluid.layers.py_reader(
+                capacity=16,
+                shapes=([-1, 1], [-1, 1], [-1, 1]),
+                dtypes=('int64', 'int64', 'int64'),
+                lod_levels=(1, 1, 0),
+                name=pyreader_name,
+                use_double_buffer=False)
+            left, right, label = fluid.layers.read_file(pointwise_pyreader)
+            return pointwise_pyreader, left, right, label
+        else:
+            pairwise_pyreader = fluid.layers.py_reader(
+                capacity=16,
+                shapes=([-1, 1], [-1, 1], [-1, 1]),
+                dtypes=('int64', 'int64', 'int64'),
+                lod_levels=(1, 1, 1),
+                name=pyreader_name,
+                use_double_buffer=False)
+            left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
+            return pairwise_pyreader, left, pos_right, neg_right

 def train(conf_dict, args):
     """
     train processic

@@ -129,85 +120,79 @@ def train(conf_dict, args):
         place = fluid.CUDAPlace(0)
     else:
         place = fluid.CPUPlace()
-(removed: fluid "data.ops" input definitions for left/right/neg_right/label,
- DataFeeder-based train/valid feeders, the infer_program clone of
- fluid.default_main_program(), executor.run(fluid.default_startup_program()),
- the init_checkpoint-if-not-None branch, and the ParallelExecutor setup with
- its device_count logging)
+    exe = fluid.Executor(place)
+
+    startup_prog = fluid.Program()
+    train_program = fluid.Program()

     simnet_process = reader.SimNetProcessor(args, vocab)
     if args.task_mode == "pairwise":
         # Build network
+        with fluid.program_guard(train_program, startup_prog):
+            with fluid.unique_name.guard():
+                train_pyreader, left, pos_right, neg_right = create_model(
+                    args, pyreader_name='train_reader')
+                left_feat, pos_score = net.predict(left, pos_right)
+                pred = pos_score
+                _, neg_score = net.predict(left, neg_right)
+                avg_cost = loss.compute(pos_score, neg_score)
+                avg_cost.persistable = True
+                optimizer.ops(avg_cost)
+        # Get Reader
+        get_train_examples = simnet_process.get_reader("train")
         if args.do_valid:
+            test_prog = fluid.Program()
+            with fluid.program_guard(test_prog, startup_prog):
+                with fluid.unique_name.guard():
+                    test_pyreader, left, pos_right = create_model(
+                        args, pyreader_name='test_reader', is_inference=True)
+                    left_feat, pos_score = net.predict(left, pos_right)
+                    pred = pos_score
+            test_prog = test_prog.clone(for_test=True)
     else:
         # Build network
+        with fluid.program_guard(train_program, startup_prog):
+            with fluid.unique_name.guard():
+                train_pyreader, left, right, label = create_model(
+                    args, pyreader_name='train_reader', is_pointwise=True)
+                left_feat, pred = net.predict(left, right)
+                avg_cost = loss.compute(pred, label)
+                avg_cost.persistable = True
+                optimizer.ops(avg_cost)
+        # Get Reader
+        get_train_examples = simnet_process.get_reader("train")
         if args.do_valid:
+            test_prog = fluid.Program()
+            with fluid.program_guard(test_prog, startup_prog):
+                with fluid.unique_name.guard():
+                    test_pyreader, left, right = create_model(
+                        args, pyreader_name='test_reader', is_inference=True)
+                    left_feat, pred = net.predict(left, right)
+            test_prog = test_prog.clone(for_test=True)
+
+    if args.init_checkpoint is not "":
+        utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)

-    def valid_and_test(program, feeder, reader, process, mode="test"):
+    def valid_and_test(test_program, test_pyreader, get_valid_examples,
+                       process, mode, exe, fetch_list):
         """
         return auc and acc
         """
         # Get Batch Data
-        batch_data = paddle.batch(reader, args.batch_size, drop_last=False)
+        batch_data = paddle.batch(get_valid_examples, args.batch_size, drop_last=False)
+        test_pyreader.decorate_paddle_reader(batch_data)
+        test_pyreader.start()
         pred_list = []
-        for data in batch_data():
-            _pred = executor.run(program=program, feed=feeder.feed(data),
-                                 fetch_list=[pred.name])
-            pred_list += list(_pred)
+        while True:
+            try:
+                _pred = exe.run(program=test_program, fetch_list=[pred.name])
+                pred_list += list(_pred)
+            except fluid.core.EOFException:
+                test_pyreader.reset()
+                break
         pred_list = np.vstack(pred_list)
         if mode == "test":
             label_list = process.get_test_label()

@@ -232,66 +217,85 @@ def train(conf_dict, args):
     # set global step
     global_step = 0
     ce_info = []
-(removed: the enumerate(train_batch_data()) loop that skipped batches smaller than
- device_count and ran parallel_executor with train_feeder.feed(data), saving
- checkpoints via executor and infer_program)
+    train_exe = exe
     for epoch_id in range(args.epoch):
+        # Get batch data iterator
+        train_batch_data = paddle.batch(
+            paddle.reader.shuffle(get_train_examples, buf_size=10000),
+            args.batch_size,
+            drop_last=False)
+        train_pyreader.decorate_paddle_reader(train_batch_data)
+        train_pyreader.start()
+        exe.run(startup_prog)
+        losses = []
+        start_time = time.time()
+        while True:
+            try:
+                global_step += 1
+                fetch_list = [avg_cost.name]
+                avg_loss = train_exe.run(program=train_program, fetch_list=fetch_list)
+                if args.do_valid and global_step % args.validation_steps == 0:
+                    get_valid_examples = simnet_process.get_reader("valid")
+                    valid_result = valid_and_test(
+                        test_prog, test_pyreader, get_valid_examples,
+                        simnet_process, "valid", exe, [pred.name])
+                    if args.compute_accuracy:
+                        valid_auc, valid_acc = valid_result
+                        logging.info(
+                            "global_steps: %d, valid_auc: %f, valid_acc: %f" %
+                            (global_step, valid_auc, valid_acc))
+                    else:
+                        valid_auc = valid_result
+                        logging.info("global_steps: %d, valid_auc: %f" %
                                     (global_step, valid_auc))
+                if global_step % args.save_steps == 0:
+                    model_save_dir = os.path.join(args.output_dir,
+                                                  conf_dict["model_path"])
+                    model_path = os.path.join(model_save_dir, str(global_step))
+                    if not os.path.exists(model_save_dir):
+                        os.makedirs(model_save_dir)
+                    if args.task_mode == "pairwise":
+                        feed_var_names = [left.name, pos_right.name]
+                        target_vars = [left_feat, pos_score]
+                    else:
+                        feed_var_names = [left.name, right.name]
+                        target_vars = [left_feat, pred]
+                    fluid.io.save_inference_model(model_path, feed_var_names,
+                                                  target_vars, exe, test_prog)
+                    logging.info("saving infer model in %s" % model_path)
+                losses.append(np.mean(avg_loss[0]))
+            except fluid.core.EOFException:
+                train_pyreader.reset()
+                break
         end_time = time.time()
         logging.info("epoch: %d, loss: %f, used time: %d sec" %
                      (epoch_id, np.mean(losses), end_time - start_time))
         ce_info.append([np.mean(losses), end_time - start_time])
+    #final save
+    logging.info("the final step is %s" % global_step)
+    model_save_dir = os.path.join(args.output_dir, conf_dict["model_path"])
+    model_path = os.path.join(model_save_dir, str(global_step))
+    if not os.path.exists(model_save_dir):
+        os.makedirs(model_save_dir)
+    if args.task_mode == "pairwise":
+        feed_var_names = [left.name, pos_right.name]
+        target_vars = [left_feat, pos_score]
+    else:
+        feed_var_names = [left.name, right.name]
+        target_vars = [left_feat, pred]
+    fluid.io.save_inference_model(model_path, feed_var_names, target_vars, exe,
+                                  test_prog)
+    logging.info("saving infer model in %s" % model_path)
     if args.enable_ce:
         card_num = get_cards()
         ce_loss = 0

@@ -309,20 +313,11 @@ def train(conf_dict, args):
     if args.do_test:
         if args.task_mode == "pairwise":
             # Get Feeder and Reader
-            test_feeder = fluid.DataFeeder(
-                place=place, feed_list=[left.name, pos_right.name])
-            test_reader = simnet_process.get_reader("test")
+            get_test_examples = simnet_process.get_reader("test")
         else:
             # Get Feeder and Reader
-            test_feeder = fluid.DataFeeder(
-                place=place, feed_list=[left.name, right.name])
-            test_reader = simnet_process.get_reader("test")
-        test_result = valid_and_test(program=infer_program, feeder=test_feeder,
-                                     reader=test_reader, process=simnet_process,
-                                     mode="test")
+            get_test_examples = simnet_process.get_reader("test")
+        test_result = valid_and_test(test_prog, test_pyreader, get_test_examples,
+                                     simnet_process, "test", exe, [pred.name])
         if args.compute_accuracy:
             test_auc, test_acc = test_result
             logging.info("AUC of test is %f, Accuracy of test is %f" %

@@ -334,51 +329,83 @@ def train(conf_dict, args):
 def test(conf_dict, args):
     """
-    run predict
+    Evaluation Function
     """
-(removed: fluid.io.load_inference_model-based loading of a saved inference model
- and the DataFeeder built from its feed_var_names)
+    if args.use_cuda:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    exe = fluid.Executor(place)

     vocab = utils.load_vocab(args.vocab_path)
     simnet_process = reader.SimNetProcessor(args, vocab)

+    startup_prog = fluid.Program()
+    get_test_examples = simnet_process.get_reader("test")
+    batch_data = paddle.batch(get_test_examples, args.batch_size, drop_last=False)
+    test_prog = fluid.Program()
+
+    conf_dict['dict_size'] = len(vocab)
+    net = utils.import_class("../models/matching",
+                             conf_dict["net"]["module_name"],
+                             conf_dict["net"]["class_name"])(conf_dict)
+
     metric = fluid.metrics.Auc(name="auc")
     with codecs.open("predictions.txt", "w", "utf-8") as predictions_file:
         if args.task_mode == "pairwise":
+            with fluid.program_guard(test_prog, startup_prog):
+                with fluid.unique_name.guard():
+                    test_pyreader, left, pos_right = create_model(
+                        args, pyreader_name='test_reader', is_inference=True)
+                    left_feat, pos_score = net.predict(left, pos_right)
+                    pred = pos_score
+            test_prog = test_prog.clone(for_test=True)
         else:
+            with fluid.program_guard(test_prog, startup_prog):
+                with fluid.unique_name.guard():
+                    test_pyreader, left, right = create_model(
+                        args, pyreader_name='test_reader', is_inference=True)
+                    left_feat, pred = net.predict(left, right)
+            test_prog = test_prog.clone(for_test=True)
+
+        exe.run(startup_prog)
+        utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
+        test_exe = exe
+        test_pyreader.decorate_paddle_reader(batch_data)

         logging.info("start test process ...")
+        test_pyreader.start()
         pred_list = []
-        for iter, data in enumerate(batch_data()):
-            output = executor.run(program, feed=feeder.feed(data),
-                                  fetch_list=fetch_targets)
+        fetch_list = [pred.name]
+        output = []
+        while True:
+            try:
+                output = test_exe.run(program=test_prog, fetch_list=fetch_list)
                 if args.task_mode == "pairwise":
-                    pred_list += list(map(lambda item: float(item[0]), output[1]))
-                    predictions_file.write("\n".join(
-                        map(lambda item: str((item[0] + 1) / 2), output[1])) + "\n")
+                    pred_list += list(map(lambda item: float(item[0]), output[0]))
+                    predictions_file.write("\n".join(
+                        map(lambda item: str((item[0] + 1) / 2), output[0])) + "\n")
                 else:
-                    pred_list += map(lambda item: item, output[1])
-                    predictions_file.write("\n".join(
-                        map(lambda item: str(np.argmax(item)), output[1])) + "\n")
+                    pred_list += map(lambda item: item, output[0])
+                    predictions_file.write("\n".join(
+                        map(lambda item: str(np.argmax(item)), output[0])) + "\n")
+            except fluid.core.EOFException:
+                test_pyreader.reset()
+                break
         if args.task_mode == "pairwise":
             pred_list = np.array(pred_list).reshape((-1, 1))
             pred_list = (pred_list + 1) / 2

@@ -403,47 +430,72 @@ def test(conf_dict, args):
         os.path.join(os.getcwd(), args.test_result_path))

-def infer(args):
+def infer(conf_dict, args):
     """
     run predict
     """
-(removed: fluid.io.load_inference_model-based loading and the DataFeeder feed loop)
     if args.use_cuda:
         place = fluid.CUDAPlace(0)
     else:
         place = fluid.CPUPlace()
+    exe = fluid.Executor(place)

     vocab = utils.load_vocab(args.vocab_path)
     simnet_process = reader.SimNetProcessor(args, vocab)

+    startup_prog = fluid.Program()
+    get_infer_examples = simnet_process.get_infer_reader
+    batch_data = paddle.batch(get_infer_examples, args.batch_size, drop_last=False)
+    test_prog = fluid.Program()
+
+    conf_dict['dict_size'] = len(vocab)
+    net = utils.import_class("../models/matching",
+                             conf_dict["net"]["module_name"],
+                             conf_dict["net"]["class_name"])(conf_dict)
+
     if args.task_mode == "pairwise":
+        with fluid.program_guard(test_prog, startup_prog):
+            with fluid.unique_name.guard():
+                infer_pyreader, left, pos_right = create_model(
+                    args, pyreader_name='infer_reader', is_inference=True)
+                left_feat, pos_score = net.predict(left, pos_right)
+                pred = pos_score
+        test_prog = test_prog.clone(for_test=True)
     else:
+        with fluid.program_guard(test_prog, startup_prog):
+            with fluid.unique_name.guard():
+                infer_pyreader, left, right = create_model(
+                    args, pyreader_name='infer_reader', is_inference=True)
+                left_feat, pred = net.predict(left, right)
+        test_prog = test_prog.clone(for_test=True)
+
+    exe.run(startup_prog)
+    utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
+    test_exe = exe
+    infer_pyreader.decorate_sample_list_generator(batch_data)

     logging.info("start test process ...")
     preds_list = []
+    fetch_list = [pred.name]
+    output = []
+    infer_pyreader.start()
+    while True:
+        try:
+            output = test_exe.run(program=test_prog, fetch_list=fetch_list)
+            if args.task_mode == "pairwise":
+                preds_list += list(
+                    map(lambda item: str((item[0] + 1) / 2), output[0]))
+            else:
+                preds_list += map(lambda item: str(np.argmax(item)), output[0])
+        except fluid.core.EOFException:
+            infer_pyreader.reset()
+            break
     with codecs.open(args.infer_result_path, "w", "utf-8") as infer_file:
         for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
             infer_file.write(_data + "\t" + _pred + "\n")

@@ -458,23 +510,11 @@ def get_cards():
     num = len(cards.split(","))
     return num

-(removed: def main(conf_dict, args), which dispatched to train/test/infer and raised
- ValueError("one of do_train and do_test and do_infer must be True") otherwise)
 if __name__ == "__main__":
+    args = ArgConfig()
+    args = args.build_conf()
+
     utils.print_arguments(args)
     try:
         if fluid.is_compiled_with_cuda() != True and args.use_cuda == True:

@@ -487,4 +527,12 @@ if __name__ == "__main__":
         pass
     utils.init_log("./log/TextSimilarityNet")
     conf_dict = config.SimNetConfig(args)
-    main(conf_dict, args)
+    if args.do_train:
+        train(conf_dict, args)
+    elif args.do_test:
+        test(conf_dict, args)
+    elif args.do_infer:
+        infer(conf_dict, args)
+    else:
+        raise ValueError("one of do_train and do_test and do_infer must be True")
\ No newline at end of file
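The thrust of the run_classifier.py rewrite is replacing DataFeeder feed dicts with fluid.layers.py_reader: decorate it with a batched reader, start(), run the program with no feed argument, and reset() on EOFException. A minimal self-contained sketch of that lifecycle, under the assumption of the Fluid 1.x API this repo targets (py_reader was removed in Paddle 2.x):

```python
# Minimal py_reader lifecycle sketch (assumes paddlepaddle 1.x, e.g. 1.3-1.5;
# not runnable on 2.x). Mirrors the decorate/start/run/reset pattern above.
import numpy as np
import paddle
import paddle.fluid as fluid

pyreader = fluid.layers.py_reader(
    capacity=16,
    shapes=([-1, 1], [-1, 1]),
    dtypes=('int64', 'int64'),
    lod_levels=(0, 0),
    name='demo_reader',
    use_double_buffer=False)
left, right = fluid.layers.read_file(pyreader)
# Anything built on left/right is fed by the reader, not by a feed= dict.
total = fluid.layers.reduce_sum(fluid.layers.cast(left + right, 'float32'))

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

def sample_reader():
    for i in range(10):
        yield np.array([i], dtype='int64'), np.array([i + 1], dtype='int64')

pyreader.decorate_paddle_reader(paddle.batch(sample_reader, batch_size=4))

pyreader.start()
while True:
    try:
        out, = exe.run(fetch_list=[total.name])  # note: no feed= argument
    except fluid.core.EOFException:
        pyreader.reset()  # required before the reader can be started again
        break
```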
PaddleNLP/similarity_net/utils.py

@@ -15,7 +15,7 @@
 """
 SimNet utilities.
 """
+import argparse
 import time
 import sys
 import re

@@ -26,20 +26,17 @@ import numpy as np
 import logging
 import logging.handlers
 import paddle.fluid as fluid
-import io

 """
 ******functions for file processing******
 """

 def load_vocab(file_path):
     """
     load the given vocabulary
     """
     vocab = {}
-    if six.PY3:
-        f = io.open(file_path, "r", encoding="utf-8")
-    else:
-        f = open(file_path, "r")
+    f = open(file_path, "r", encoding="utf-8")
     for line in f:
         items = line.strip("\n").split("\t")
         if items[0] not in vocab:

@@ -61,8 +58,7 @@ def get_result_file(args):
     """
     with codecs.open(args.test_data_dir, "r", "utf-8") as test_file:
         with codecs.open("predictions.txt", "r", "utf-8") as predictions_file:
-            with codecs.open(args.test_result_path, "w",
-                             "utf-8") as test_result_file:
+            with codecs.open(args.test_result_path, "w", "utf-8") as test_result_file:
                 test_datas = [line.strip("\n") for line in test_file]
                 predictions = [line.strip("\n") for line in predictions_file]
                 for test_data, prediction in zip(test_datas, predictions):

@@ -170,6 +166,58 @@ class ArgumentGroup(object):
             help=help + ' Default: %(default)s.',
             **kwargs)

+class ArgConfig(object):
+    def __init__(self):
+        parser = argparse.ArgumentParser()
+
+        model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+        model_g.add_arg("config_path", str, None,
+                        "Path to the json file for EmoTect model config.")
+        model_g.add_arg("init_checkpoint", str, None,
+                        "Init checkpoint to resume training from.")
+        model_g.add_arg("output_dir", str, None,
+                        "Directory path to save checkpoints")
+        model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise")
+
+        train_g = ArgumentGroup(parser, "training", "training options.")
+        train_g.add_arg("epoch", int, 10, "Number of epoches for training.")
+        train_g.add_arg("save_steps", int, 200,
+                        "The steps interval to save checkpoints.")
+        train_g.add_arg("validation_steps", int, 100,
+                        "The steps interval to evaluate model performance.")
+
+        log_g = ArgumentGroup(parser, "logging", "logging related")
+        log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
+        log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.")
+        log_g.add_arg("test_result_path", str, "test_result",
+                      "Directory path to test result.")
+        log_g.add_arg("infer_result_path", str, "infer_result",
+                      "Directory path to infer result.")
+
+        data_g = ArgumentGroup(parser, "data",
+                               "Data paths, vocab paths and data processing options")
+        data_g.add_arg("train_data_dir", str, None, "Directory path to training data.")
+        data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.")
+        data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
+        data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
+        data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
+        data_g.add_arg("batch_size", int, 32,
+                       "Total examples' number in batch for training.")
+
+        run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+        run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
+        run_type_g.add_arg("task_name", str, None,
+                           "The name of task to perform sentiment classification.")
+        run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
+        run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
+        run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
+        run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
+        run_type_g.add_arg("compute_accuracy", bool, False,
+                           "Whether to compute accuracy.")
+        run_type_g.add_arg(
+            "lamda", float, 0.91,
+            "When task_mode is pairwise, lamda is the threshold for calculating the accuracy.")
+
+        custom_g = ArgumentGroup(parser, "customize", "customized options.")
+        self.custom_g = custom_g
+
+        parser.add_argument(
+            '--enable_ce',
+            action='store_true',
+            help='If set, run the task with continuous evaluation logs.')
+
+        self.parser = parser
+
+    def add_arg(self, name, dtype, default, descrip):
+        self.custom_g.add_arg(name, dtype, default, descrip)
+
+    def build_conf(self):
+        return self.parser.parse_args()

 def print_arguments(args):
     """

@@ -302,7 +350,7 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
     """
     assert os.path.exists(
         init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path

     def existed_persitables(var):
         if not fluid.io.is_persistable(var):
             return False

@@ -314,3 +362,4 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
         main_program=main_program,
         predicate=existed_persitables)
+    print("Load model from {}".format(init_checkpoint_path))
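Given the ArgConfig API added above (add_arg delegates to a "customize" ArgumentGroup; build_conf parses), callers can bolt on extra flags before parsing. A usage sketch; the custom flag here is illustrative, not part of the repo:

```python
# Usage sketch for the new ArgConfig (the --my_threshold flag is hypothetical).
from utils import ArgConfig

args = ArgConfig()
args.add_arg("my_threshold", float, 0.5, "illustrative custom option")
args = args.build_conf()          # returns the parsed argparse.Namespace
print(args.task_mode, args.my_threshold)
```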