提交 e19f4bc7 编写于 作者: D Dilyar 提交者: Yibing Liu

Fix some problems of simnet (#3433)

* update

* update

* Update README.md

* Update run.sh
上级 107d4e79
...@@ -49,7 +49,7 @@ class BOW(object): ...@@ -49,7 +49,7 @@ class BOW(object):
right_soft = softsign_layer.ops(right_pool) right_soft = softsign_layer.ops(right_pool)
# matching layer # matching layer
if self.task_mode == "pairwise": if self.task_mode == "pairwise":
bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc") bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
left_bow = bow_layer.ops(left_soft) left_bow = bow_layer.ops(left_soft)
right_bow = bow_layer.ops(right_soft) right_bow = bow_layer.ops(right_soft)
cos_sim_layer = layers.CosSimLayer() cos_sim_layer = layers.CosSimLayer()
...@@ -58,7 +58,7 @@ class BOW(object): ...@@ -58,7 +58,7 @@ class BOW(object):
else: else:
concat_layer = layers.ConcatLayer(1) concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_soft, right_soft]) concat = concat_layer.ops([left_soft, right_soft])
bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc") bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
concat_fc = bow_layer.ops(concat) concat_fc = bow_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim") softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc) pred = softmax_layer.ops(concat_fc)
......
...@@ -43,23 +43,23 @@ class CNN(object): ...@@ -43,23 +43,23 @@ class CNN(object):
left_emb = emb_layer.ops(left) left_emb = emb_layer.ops(left)
right_emb = emb_layer.ops(right) right_emb = emb_layer.ops(right)
# Presentation context # Presentation context
cnn_layer = layers.SequenceConvPoolLayer(self.filter_size, cnn_layer = layers.SequenceConvPoolLayer(
self.num_filters, "conv") self.filter_size, self.num_filters, "conv")
left_cnn = cnn_layer.ops(left_emb) left_cnn = cnn_layer.ops(left_emb)
right_cnn = cnn_layer.ops(right_emb) right_cnn = cnn_layer.ops(right_emb)
# matching layer # matching layer
if self.task_mode == "pairwise": if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_relu = relu_layer.ops(left_cnn) left_fc = fc_layer.ops(left_cnn)
right_relu = relu_layer.ops(right_cnn) right_fc = fc_layer.ops(right_cnn)
cos_sim_layer = layers.CosSimLayer() cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu) pred = cos_sim_layer.ops(left_fc, right_fc)
return left_relu, pred return left_fc, pred
else: else:
concat_layer = layers.ConcatLayer(1) concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_cnn, right_cnn]) concat = concat_layer.ops([left_cnn, right_cnn])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = relu_layer.ops(concat) concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim") softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc) pred = softmax_layer.ops(concat_fc)
return left_cnn, pred return left_cnn, pred
...@@ -50,17 +50,17 @@ class GRU(object): ...@@ -50,17 +50,17 @@ class GRU(object):
right_last = last_layer.ops(right_gru) right_last = last_layer.ops(right_gru)
# matching layer # matching layer
if self.task_mode == "pairwise": if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_relu = relu_layer.ops(left_last) left_fc = fc_layer.ops(left_last)
right_relu = relu_layer.ops(right_last) right_fc = fc_layer.ops(right_last)
cos_sim_layer = layers.CosSimLayer() cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu) pred = cos_sim_layer.ops(left_fc, right_fc)
return left_relu, pred return left_fc, pred
else: else:
concat_layer = layers.ConcatLayer(1) concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_last, right_last]) concat = concat_layer.ops([left_last, right_last])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = relu_layer.ops(concat) concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim") softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc) pred = softmax_layer.ops(concat_fc)
return left_last, pred return left_last, pred
...@@ -49,17 +49,17 @@ class LSTM(object): ...@@ -49,17 +49,17 @@ class LSTM(object):
right_last = last_layer.ops(right_lstm) right_last = last_layer.ops(right_lstm)
# matching layer # matching layer
if self.task_mode == "pairwise": if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_relu = relu_layer.ops(left_last) left_fc = fc_layer.ops(left_last)
right_relu = relu_layer.ops(right_last) right_fc = fc_layer.ops(right_last)
cos_sim_layer = layers.CosSimLayer() cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu) pred = cos_sim_layer.ops(left_fc, right_fc)
return left_relu, pred return left_fc, pred
else: else:
concat_layer = layers.ConcatLayer(1) concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_last, right_last]) concat = concat_layer.ops([left_last, right_last])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu") fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = relu_layer.ops(concat) concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim") softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc) pred = softmax_layer.ops(concat_fc)
return left_last, pred return left_last, pred
...@@ -6,10 +6,17 @@ ...@@ -6,10 +6,17 @@
基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)和语义匹配数据集(LCQMC)进行评测,效果如下表所示。LCQMC数据集以Accuracy为评测指标,而pairwise模型的输出为相似度,因此我们采用0.958作为分类阈值,相比于基线模型中网络结构同等复杂的CBOW模型(准确率为0.737),我们模型的准确率为0.7532。 基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)和语义匹配数据集(LCQMC)进行评测,效果如下表所示。LCQMC数据集以Accuracy为评测指标,而pairwise模型的输出为相似度,因此我们采用0.958作为分类阈值,相比于基线模型中网络结构同等复杂的CBOW模型(准确率为0.737),我们模型的准确率为0.7532。
| 模型 | 百度知道 | ECOM |QQSIM | UNICOM | LCQMC | | 模型 | 百度知道 | ECOM |QQSIM | UNICOM |
|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:| |:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
| | AUC | AUC | AUC|正逆序比|Accuracy| | | AUC | AUC | AUC|正逆序比|
|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|0.7532| |BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|
#### 测试集说明
| 数据集 | 来源 | 垂类 |
|:-----------:|:-------------:|:-------------:|
|百度知道 | 百度知道问题 | 日常 |
|ECOM|商业问句|金融|
|QQSIM|闲聊对话|日常|
|UNICOM|联通客服|客服|
## 快速开始 ## 快速开始
#### 版本依赖 #### 版本依赖
本项目依赖于 Paddlepaddle Fluid 1.3.1,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 本项目依赖于 Paddlepaddle Fluid 1.3.1,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。
...@@ -24,24 +31,14 @@ cd models/PaddleNLP/similarity_net ...@@ -24,24 +31,14 @@ cd models/PaddleNLP/similarity_net
#### 数据准备 #### 数据准备
下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、验证集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、验证集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。
```shell ```shell
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz sh download_data.sh
tar xzf simnet_dataset-1.0.0.tar.gz
``` ```
#### 模型准备 #### 模型准备
我们开源了基于大规模数据训练好的```pairwise```模型(基于bow模型训练),我们提供两种下载方式,模型保存在```./model_files/simnet_bow_pairwise_pretrained_model/```下。 我们开源了基于大规模数据训练好的```pairwise```模型(基于bow模型训练),用户可以通过运行命令下载预训练好的模型,该模型将保存在```./model_files/simnet_bow_pairwise_pretrained_model/```下。
##### 方式一:基于PaddleHub命令行工具(PaddleHub[安装方式](https://github.com/PaddlePaddle/PaddleHub))
```shell
mkdir model_files
hub download simnet_bow_pairwise --output_path ./
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
```
##### 方式二:直接下载
```shell ```shell
mkdir model_files sh download_pretrained_model.sh
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
``` ```
#### 评估 #### 评估
我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM四个数据集,基于上面的预训练模型,用户可以进入evaluate目录下依次执行下列命令获取测试集评估结果。 我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM四个数据集,基于上面的预训练模型,用户可以进入evaluate目录下依次执行下列命令获取测试集评估结果。
```shell ```shell
...@@ -162,6 +159,7 @@ python run_classifier.py \ ...@@ -162,6 +159,7 @@ python run_classifier.py \
--task_mode ${TASK_MODE} #训练模式,pairwise或pointwise,与相应的配置文件匹配。 --task_mode ${TASK_MODE} #训练模式,pairwise或pointwise,与相应的配置文件匹配。
--compute_accuracy False \ #是否计算accuracy --compute_accuracy False \ #是否计算accuracy
--lamda 0.91 \ #pairwise模式计算accuracy时的阈值 --lamda 0.91 \ #pairwise模式计算accuracy时的阈值
--init_checkpoint "" #预加载模型路径
``` ```
### 如何组建自己的模型 ### 如何组建自己的模型
用户可以根据自己的需求,组建自定义的模型,具体方法如下所示: 用户可以根据自己的需求,组建自定义的模型,具体方法如下所示:
......
...@@ -34,14 +34,12 @@ class SimNetConfig(object): ...@@ -34,14 +34,12 @@ class SimNetConfig(object):
with open(config_path) as json_file: with open(config_path) as json_file:
config_dict = json.load(json_file) config_dict = json.load(json_file)
except Exception: except Exception:
raise IOError("Error in parsing simnet model config file '%s'" % raise IOError("Error in parsing simnet model config file '%s'" % config_path)
config_path)
else: else:
if config_dict["task_mode"] != self.task_mode: if config_dict["task_mode"] != self.task_mode:
raise ValueError( raise ValueError(
"the config '{}' does not match the task_mode '{}'".format( "the config '{}' does not match the task_mode '{}'".format(self.config_path, self.task_mode))
self.config_path, self.task_mode))
return config_dict return config_dict
def __getitem__(self, key): def __getitem__(self, key):
......
# Get data: download the SimNet dataset archive, extract it in the current
# directory, then remove the archive.
#
# The steps are chained with && so the script stops at the first failing
# step and returns that step's non-zero status. Run unconditionally, the
# trailing rm decided the exit status, so a failed extraction could still
# report success and the archive was deleted anyway.
#
# NOTE(review): --no-check-certificate disables TLS certificate verification;
# kept as in the original - confirm whether it is still required.
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz &&
tar xzf simnet_dataset-1.0.0.tar.gz &&
rm simnet_dataset-1.0.0.tar.gz
...@@ -8,9 +8,3 @@ if [ ! -d $model_files_path ]; then ...@@ -8,9 +8,3 @@ if [ ! -d $model_files_path ]; then
fi fi
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path
rm simnet_bow-pairwise-1.0.0.tar.gz rm simnet_bow-pairwise-1.0.0.tar.gz
#get data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
...@@ -21,7 +21,7 @@ INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ ...@@ -21,7 +21,7 @@ INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
train() { train() {
python run_classifier.py \ python run_classifier.py \
--task_name ${TASK_NAME} \ --task_name ${TASK_NAME} \
--use_cuda false \ --use_cuda False \
--do_train True \ --do_train True \
--do_valid True \ --do_valid True \
--do_test True \ --do_test True \
...@@ -34,12 +34,13 @@ train() { ...@@ -34,12 +34,13 @@ train() {
--output_dir ${CKPT_PATH} \ --output_dir ${CKPT_PATH} \
--config_path ${CONFIG_PATH} \ --config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \ --vocab_path ${VOCAB_PATH} \
--epoch 10 \ --epoch 40 \
--save_steps 1000 \ --save_steps 2000 \
--validation_steps 100 \ --validation_steps 200 \
--compute_accuracy False \ --compute_accuracy False \
--lamda 0.958 \ --lamda 0.958 \
--task_mode ${TASK_MODE} --task_mode ${TASK_MODE}\
--init_checkpoint ""
} }
#run_evaluate #run_evaluate
evaluate() { evaluate() {
......
...@@ -25,75 +25,66 @@ import argparse ...@@ -25,75 +25,66 @@ import argparse
import multiprocessing import multiprocessing
import sys import sys
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
reload(sys)
sys.setdefaultencoding(defaultencoding)
sys.path.append("..") sys.path.append("..")
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import codecs
import config import config
import utils import utils
import reader import reader
import models.matching.paddle_layers as layers import models.matching.paddle_layers as layers
import codecs
from utils import ArgConfig
import logging import logging
parser = argparse.ArgumentParser(__doc__) def create_model(args, pyreader_name, is_inference = False, is_pointwise = False):
model_g = utils.ArgumentGroup(parser, "model", "model configuration and paths.") """
model_g.add_arg("config_path", str, None, Create Model for simnet
"Path to the json file for EmoTect model config.") """
model_g.add_arg("init_checkpoint", str, None, if is_inference:
"Init checkpoint to resume training from.") inf_pyreader = fluid.layers.py_reader(
model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints") capacity=16,
model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise") shapes=([-1,1], [-1,1]),
dtypes=('int64', 'int64'),
train_g = utils.ArgumentGroup(parser, "training", "training options.") lod_levels=(1, 1),
train_g.add_arg("epoch", int, 10, "Number of epoches for training.") name=pyreader_name,
train_g.add_arg("save_steps", int, 200, use_double_buffer=False)
"The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 100, left, pos_right = fluid.layers.read_file(inf_pyreader)
"The steps interval to evaluate model performance.") return inf_pyreader, left, pos_right
log_g = utils.ArgumentGroup(parser, "logging", "logging related") else:
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.") if is_pointwise:
log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.") pointwise_pyreader = fluid.layers.py_reader(
log_g.add_arg("test_result_path", str, "test_result", capacity=16,
"Directory path to test result.") shapes=([-1,1], [-1,1], [-1,1]),
log_g.add_arg("infer_result_path", str, "infer_result", dtypes=('int64', 'int64', 'int64'),
"Directory path to infer result.") lod_levels=(1, 1, 0),
name=pyreader_name,
data_g = utils.ArgumentGroup( use_double_buffer=False)
parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_data_dir", str, None, "Directory path to training data.") left, right, label = fluid.layers.read_file(pointwise_pyreader)
data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.") return pointwise_pyreader, left, right, label
data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("batch_size", int, 32,
"Total examples' number in batch for training.")
run_type_g = utils.ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
run_type_g.add_arg("task_name", str, None,
"The name of task to perform sentiment classification.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
run_type_g.add_arg("compute_accuracy", bool, False,
"Whether to compute accuracy.")
run_type_g.add_arg(
"lamda", float, 0.91,
"When task_mode is pairwise, lamda is the threshold for calculating the accuracy."
)
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args()
else:
pairwise_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1,1], [-1,1], [-1,1]),
dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 1),
name=pyreader_name,
use_double_buffer=False)
left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
return pairwise_pyreader, left, pos_right, neg_right
def train(conf_dict, args): def train(conf_dict, args):
""" """
...@@ -129,85 +120,79 @@ def train(conf_dict, args): ...@@ -129,85 +120,79 @@ def train(conf_dict, args):
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
else: else:
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place)
startup_prog = fluid.Program()
train_program = fluid.Program()
simnet_process = reader.SimNetProcessor(args, vocab) simnet_process = reader.SimNetProcessor(args, vocab)
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
# Build network # Build network
left = data.ops(name="left", shape=[1], dtype="int64", lod_level=1) with fluid.program_guard(train_program, startup_prog):
pos_right = data.ops(name="right", with fluid.unique_name.guard():
shape=[1], train_pyreader, left, pos_right, neg_right = create_model(
dtype="int64", args,
lod_level=1) pyreader_name='train_reader')
neg_right = data.ops(name="neg_right",
shape=[1],
dtype="int64",
lod_level=1)
left_feat, pos_score = net.predict(left, pos_right) left_feat, pos_score = net.predict(left, pos_right)
# Get Feeder and Reader
train_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, pos_right.name, neg_right.name])
train_reader = simnet_process.get_reader("train")
if args.do_valid:
valid_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, pos_right.name])
valid_reader = simnet_process.get_reader("valid")
pred = pos_score pred = pos_score
# Save Infer model
infer_program = fluid.default_main_program().clone(for_test=True)
_, neg_score = net.predict(left, neg_right) _, neg_score = net.predict(left, neg_right)
avg_cost = loss.compute(pos_score, neg_score) avg_cost = loss.compute(pos_score, neg_score)
avg_cost.persistable = True avg_cost.persistable = True
optimizer.ops(avg_cost)
# Get Reader
get_train_examples = simnet_process.get_reader("train")
if args.do_valid:
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right= create_model(args, pyreader_name = 'test_reader',is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else: else:
# Build network # Build network
left = data.ops(name="left", shape=[1], dtype="int64", lod_level=1) with fluid.program_guard(train_program, startup_prog):
right = data.ops(name="right", shape=[1], dtype="int64", lod_level=1) with fluid.unique_name.guard():
label = data.ops(name="label", shape=[1], dtype="int64", lod_level=0) train_pyreader, left, right, label = create_model(
args,
pyreader_name='train_reader',
is_pointwise=True)
left_feat, pred = net.predict(left, right) left_feat, pred = net.predict(left, right)
avg_cost = loss.compute(pred, label)
avg_cost.persistable = True
optimizer.ops(avg_cost)
# Get Feeder and Reader # Get Feeder and Reader
train_feeder = fluid.DataFeeder( get_train_examples = simnet_process.get_reader("train")
place=place, feed_list=[left.name, right.name, label.name])
train_reader = simnet_process.get_reader("train")
if args.do_valid: if args.do_valid:
valid_feeder = fluid.DataFeeder( test_prog = fluid.Program()
place=place, feed_list=[left.name, right.name]) with fluid.program_guard(test_prog, startup_prog):
valid_reader = simnet_process.get_reader("valid") with fluid.unique_name.guard():
# Save Infer model test_pyreader, left, right= create_model(args, pyreader_name = 'test_reader',is_inference=True)
infer_program = fluid.default_main_program().clone(for_test=True) left_feat, pred = net.predict(left, right)
avg_cost = loss.compute(pred, label) test_prog = test_prog.clone(for_test=True)
avg_cost.persistable = True
# operate Optimization if args.init_checkpoint is not "":
optimizer.ops(avg_cost) utils.init_checkpoint(exe, args.init_checkpoint,
executor = fluid.Executor(place) startup_prog)
executor.run(fluid.default_startup_program())
def valid_and_test(test_program, test_pyreader, get_valid_examples, process, mode, exe, fetch_list):
if args.init_checkpoint is not None:
utils.init_checkpoint(executor, args.init_checkpoint,
fluid.default_startup_program())
# Get and run executor
parallel_executor = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=avg_cost.name,
main_program=fluid.default_main_program())
# Get device number
device_count = parallel_executor.device_count
logging.info("device count: %d" % device_count)
def valid_and_test(program, feeder, reader, process, mode="test"):
""" """
return auc and acc return auc and acc
""" """
# Get Batch Data # Get Batch Data
batch_data = paddle.batch(reader, args.batch_size, drop_last=False) batch_data = paddle.batch(get_valid_examples, args.batch_size, drop_last=False)
test_pyreader.decorate_paddle_reader(batch_data)
test_pyreader.start()
pred_list = [] pred_list = []
for data in batch_data(): while True:
_pred = executor.run(program=program, try:
feed=feeder.feed(data), _pred = exe.run(program=test_program,fetch_list=[pred.name])
fetch_list=[pred.name])
pred_list += list(_pred) pred_list += list(_pred)
except fluid.core.EOFException:
test_pyreader.reset()
break
pred_list = np.vstack(pred_list) pred_list = np.vstack(pred_list)
if mode == "test": if mode == "test":
label_list = process.get_test_label() label_list = process.get_test_label()
...@@ -232,32 +217,26 @@ def train(conf_dict, args): ...@@ -232,32 +217,26 @@ def train(conf_dict, args):
# set global step # set global step
global_step = 0 global_step = 0
ce_info = [] ce_info = []
train_exe = exe
for epoch_id in range(args.epoch): for epoch_id in range(args.epoch):
losses = []
# Get batch data iterator
train_batch_data = paddle.batch( train_batch_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
train_reader, buf_size=10000), get_train_examples, buf_size=10000),
args.batch_size, args.batch_size,
drop_last=False) drop_last=False)
train_pyreader.decorate_paddle_reader(train_batch_data)
train_pyreader.start()
exe.run(startup_prog)
losses = []
start_time = time.time() start_time = time.time()
for iter, data in enumerate(train_batch_data()): while True:
if len(data) < device_count: try:
logging.info(
"the size of batch data is less than device_count(%d)" %
device_count)
continue
global_step += 1 global_step += 1
avg_loss = parallel_executor.run([avg_cost.name], fetch_list = [avg_cost.name]
feed=train_feeder.feed(data)) avg_loss = train_exe.run(program=train_program, fetch_list = fetch_list)
if args.do_valid and global_step % args.validation_steps == 0: if args.do_valid and global_step % args.validation_steps == 0:
get_valid_examples = simnet_process.get_reader("valid")
valid_result = valid_and_test( valid_result = valid_and_test(test_prog,test_pyreader,get_valid_examples,simnet_process,"valid",exe,[pred.name])
program=infer_program,
feeder=valid_feeder,
reader=valid_reader,
process=simnet_process,
mode="valid")
if args.compute_accuracy: if args.compute_accuracy:
valid_auc, valid_acc = valid_result valid_auc, valid_acc = valid_result
logging.info( logging.info(
...@@ -284,14 +263,39 @@ def train(conf_dict, args): ...@@ -284,14 +263,39 @@ def train(conf_dict, args):
] ]
target_vars = [left_feat, pred] target_vars = [left_feat, pred]
fluid.io.save_inference_model(model_path, feed_var_names, fluid.io.save_inference_model(model_path, feed_var_names,
target_vars, executor, target_vars, exe,
infer_program) test_prog)
logging.info("saving infer model in %s" % model_path) logging.info("saving infer model in %s" % model_path)
losses.append(np.mean(avg_loss[0])) losses.append(np.mean(avg_loss[0]))
except fluid.core.EOFException:
train_pyreader.reset()
break
end_time = time.time() end_time = time.time()
logging.info("epoch: %d, loss: %f, used time: %d sec" % logging.info("epoch: %d, loss: %f, used time: %d sec" %
(epoch_id, np.mean(losses), end_time - start_time)) (epoch_id, np.mean(losses), end_time - start_time))
ce_info.append([np.mean(losses), end_time - start_time]) ce_info.append([np.mean(losses), end_time - start_time])
#final save
logging.info("the final step is %s" % global_step)
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
if args.task_mode == "pairwise":
feed_var_names = [left.name, pos_right.name]
target_vars = [left_feat, pos_score]
else:
feed_var_names = [
left.name,
right.name,
]
target_vars = [left_feat, pred]
fluid.io.save_inference_model(model_path, feed_var_names,
target_vars, exe,
test_prog)
logging.info("saving infer model in %s" % model_path)
if args.enable_ce: if args.enable_ce:
card_num = get_cards() card_num = get_cards()
ce_loss = 0 ce_loss = 0
...@@ -309,20 +313,11 @@ def train(conf_dict, args): ...@@ -309,20 +313,11 @@ def train(conf_dict, args):
if args.do_test: if args.do_test:
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
# Get Feeder and Reader # Get Feeder and Reader
test_feeder = fluid.DataFeeder( get_test_examples = simnet_process.get_reader("test")
place=place, feed_list=[left.name, pos_right.name])
test_reader = simnet_process.get_reader("test")
else: else:
# Get Feeder and Reader # Get Feeder and Reader
test_feeder = fluid.DataFeeder( get_test_examples = simnet_process.get_reader("test")
place=place, feed_list=[left.name, right.name]) test_result = valid_and_test(test_prog,test_pyreader,get_test_examples,simnet_process,"test",exe,[pred.name])
test_reader = simnet_process.get_reader("test")
test_result = valid_and_test(
program=infer_program,
feeder=test_feeder,
reader=test_reader,
process=simnet_process,
mode="test")
if args.compute_accuracy: if args.compute_accuracy:
test_auc, test_acc = test_result test_auc, test_acc = test_result
logging.info("AUC of test is %f, Accuracy of test is %f" % logging.info("AUC of test is %f, Accuracy of test is %f" %
...@@ -334,51 +329,83 @@ def train(conf_dict, args): ...@@ -334,51 +329,83 @@ def train(conf_dict, args):
def test(conf_dict, args): def test(conf_dict, args):
""" """
run predict Evaluation Function
""" """
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
# load auc method
metric = fluid.metrics.Auc(name="auc")
with codecs.open("predictions.txt", "w", "utf-8") as predictions_file:
# Get model path
model_path = args.init_checkpoint
# Get device
if args.use_cuda: if args.use_cuda:
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
else: else:
place = fluid.CPUPlace() place = fluid.CPUPlace()
# Get executor exe = fluid.Executor(place)
executor = fluid.Executor(place=place)
# Load model vocab = utils.load_vocab(args.vocab_path)
program, feed_var_names, fetch_targets = fluid.io.load_inference_model( simnet_process = reader.SimNetProcessor(args, vocab)
model_path, executor)
startup_prog = fluid.Program()
get_test_examples = simnet_process.get_reader("test")
batch_data = paddle.batch(get_test_examples, args.batch_size, drop_last=False)
test_prog = fluid.Program()
conf_dict['dict_size'] = len(vocab)
net = utils.import_class("../models/matching",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
metric = fluid.metrics.Auc(name="auc")
with codecs.open("predictions.txt", "w", "utf-8") as predictions_file:
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
# Get Feeder and Reader with fluid.program_guard(test_prog, startup_prog):
feeder = fluid.DataFeeder( with fluid.unique_name.guard():
place=place, feed_list=feed_var_names, program=program) test_pyreader, left, pos_right = create_model(
test_reader = simnet_process.get_reader("test") args,
pyreader_name = 'test_reader',
is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else: else:
# Get Feeder and Reader with fluid.program_guard(test_prog, startup_prog):
feeder = fluid.DataFeeder( with fluid.unique_name.guard():
place=place, feed_list=feed_var_names, program=program) test_pyreader, left, right = create_model(
test_reader = simnet_process.get_reader("test") args,
# Get batch data iterator pyreader_name = 'test_reader',
batch_data = paddle.batch(test_reader, args.batch_size, drop_last=False) is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
exe.run(startup_prog)
utils.init_checkpoint(
exe,
args.init_checkpoint,
main_program=test_prog)
test_exe = exe
test_pyreader.decorate_paddle_reader(batch_data)
logging.info("start test process ...") logging.info("start test process ...")
test_pyreader.start()
pred_list = [] pred_list = []
for iter, data in enumerate(batch_data()): fetch_list = [pred.name]
output = executor.run(program, output = []
feed=feeder.feed(data), while True:
fetch_list=fetch_targets) try:
output = test_exe.run(program=test_prog,fetch_list=fetch_list)
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
pred_list += list(map(lambda item: float(item[0]), output[1])) pred_list += list(map(lambda item: float(item[0]), output[0]))
predictions_file.write("\n".join( predictions_file.write("\n".join(
map(lambda item: str((item[0] + 1) / 2), output[1])) + "\n") map(lambda item: str((item[0] + 1) / 2), output[0])) + "\n")
else: else:
pred_list += map(lambda item: item, output[1]) pred_list += map(lambda item: item, output[0])
predictions_file.write("\n".join( predictions_file.write("\n".join(
map(lambda item: str(np.argmax(item)), output[1])) + "\n") map(lambda item: str(np.argmax(item)), output[0])) + "\n")
except fluid.core.EOFException:
test_pyreader.reset()
break
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
pred_list = np.array(pred_list).reshape((-1, 1)) pred_list = np.array(pred_list).reshape((-1, 1))
pred_list = (pred_list + 1) / 2 pred_list = (pred_list + 1) / 2
...@@ -403,47 +430,72 @@ def test(conf_dict, args): ...@@ -403,47 +430,72 @@ def test(conf_dict, args):
os.path.join(os.getcwd(), args.test_result_path)) os.path.join(os.getcwd(), args.test_result_path))
def infer(args): def infer(conf_dict, args):
""" """
run predict run predict
""" """
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
# Get model path
model_path = args.init_checkpoint
# Get device
if args.use_cuda: if args.use_cuda:
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
else: else:
place = fluid.CPUPlace() place = fluid.CPUPlace()
# Get executor exe = fluid.Executor(place)
executor = fluid.Executor(place=place)
# Load model vocab = utils.load_vocab(args.vocab_path)
program, feed_var_names, fetch_targets = fluid.io.load_inference_model( simnet_process = reader.SimNetProcessor(args, vocab)
model_path, executor)
startup_prog = fluid.Program()
get_infer_examples = simnet_process.get_infer_reader
batch_data = paddle.batch(get_infer_examples, args.batch_size, drop_last=False)
test_prog = fluid.Program()
conf_dict['dict_size'] = len(vocab)
net = utils.import_class("../models/matching",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
# Get Feeder and Reader with fluid.program_guard(test_prog, startup_prog):
infer_feeder = fluid.DataFeeder( with fluid.unique_name.guard():
place=place, feed_list=feed_var_names, program=program) infer_pyreader, left, pos_right = create_model(args, pyreader_name = 'infer_reader', is_inference = True)
infer_reader = simnet_process.get_infer_reader left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else: else:
# Get Feeder and Reader with fluid.program_guard(test_prog, startup_prog):
infer_feeder = fluid.DataFeeder( with fluid.unique_name.guard():
place=place, feed_list=feed_var_names, program=program) infer_pyreader, left, right = create_model(args, pyreader_name = 'infer_reader', is_inference = True)
infer_reader = simnet_process.get_infer_reader left_feat, pred = net.predict(left, right)
# Get batch data iterator test_prog = test_prog.clone(for_test=True)
batch_data = paddle.batch(infer_reader, args.batch_size, drop_last=False)
exe.run(startup_prog)
utils.init_checkpoint(
exe,
args.init_checkpoint,
main_program=test_prog)
test_exe = exe
infer_pyreader.decorate_sample_list_generator(batch_data)
logging.info("start test process ...") logging.info("start test process ...")
preds_list = [] preds_list = []
for iter, data in enumerate(batch_data()): fetch_list = [pred.name]
output = executor.run(program, output = []
feed=infer_feeder.feed(data), infer_pyreader.start()
fetch_list=fetch_targets) while True:
try:
output = test_exe.run(program=test_prog,fetch_list=fetch_list)
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
preds_list += list( preds_list += list(
map(lambda item: str((item[0] + 1) / 2), output[1])) map(lambda item: str((item[0] + 1) / 2), output[0]))
else: else:
preds_list += map(lambda item: str(np.argmax(item)), output[1]) preds_list += map(lambda item: str(np.argmax(item)), output[0])
except fluid.core.EOFException:
infer_pyreader.reset()
break
with codecs.open(args.infer_result_path, "w", "utf-8") as infer_file: with codecs.open(args.infer_result_path, "w", "utf-8") as infer_file:
for _data, _pred in zip(simnet_process.get_infer_data(), preds_list): for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
infer_file.write(_data + "\t" + _pred + "\n") infer_file.write(_data + "\t" + _pred + "\n")
...@@ -458,23 +510,11 @@ def get_cards(): ...@@ -458,23 +510,11 @@ def get_cards():
num = len(cards.split(",")) num = len(cards.split(","))
return num return num
if __name__ == "__main__":
def main(conf_dict, args): args = ArgConfig()
""" args = args.build_conf()
main
"""
if args.do_train:
train(conf_dict, args)
elif args.do_test:
test(conf_dict, args)
elif args.do_infer:
infer(args)
else:
raise ValueError(
"one of do_train and do_test and do_infer must be True")
if __name__ == "__main__":
utils.print_arguments(args) utils.print_arguments(args)
try: try:
if fluid.is_compiled_with_cuda() != True and args.use_cuda == True: if fluid.is_compiled_with_cuda() != True and args.use_cuda == True:
...@@ -487,4 +527,12 @@ if __name__ == "__main__": ...@@ -487,4 +527,12 @@ if __name__ == "__main__":
pass pass
utils.init_log("./log/TextSimilarityNet") utils.init_log("./log/TextSimilarityNet")
conf_dict = config.SimNetConfig(args) conf_dict = config.SimNetConfig(args)
main(conf_dict, args) if args.do_train:
train(conf_dict, args)
elif args.do_test:
test(conf_dict, args)
elif args.do_infer:
infer(conf_dict, args)
else:
raise ValueError(
"one of do_train and do_test and do_infer must be True")
\ No newline at end of file
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
""" """
SimNet utilities. SimNet utilities.
""" """
import argparse
import time import time
import sys import sys
import re import re
...@@ -26,20 +26,17 @@ import numpy as np ...@@ -26,20 +26,17 @@ import numpy as np
import logging import logging
import logging.handlers import logging.handlers
import paddle.fluid as fluid import paddle.fluid as fluid
import io
""" """
******functions for file processing****** ******functions for file processing******
""" """
def load_vocab(file_path): def load_vocab(file_path):
""" """
load the given vocabulary load the given vocabulary
""" """
vocab = {} vocab = {}
if six.PY3: f = io.open(file_path, "r", encoding="utf-8")
f = open(file_path, "r", encoding="utf-8")
else:
f = open(file_path, "r")
for line in f: for line in f:
items = line.strip("\n").split("\t") items = line.strip("\n").split("\t")
if items[0] not in vocab: if items[0] not in vocab:
...@@ -61,8 +58,7 @@ def get_result_file(args): ...@@ -61,8 +58,7 @@ def get_result_file(args):
""" """
with codecs.open(args.test_data_dir, "r", "utf-8") as test_file: with codecs.open(args.test_data_dir, "r", "utf-8") as test_file:
with codecs.open("predictions.txt", "r", "utf-8") as predictions_file: with codecs.open("predictions.txt", "r", "utf-8") as predictions_file:
with codecs.open(args.test_result_path, "w", with codecs.open(args.test_result_path, "w", "utf-8") as test_result_file:
"utf-8") as test_result_file:
test_datas = [line.strip("\n") for line in test_file] test_datas = [line.strip("\n") for line in test_file]
predictions = [line.strip("\n") for line in predictions_file] predictions = [line.strip("\n") for line in predictions_file]
for test_data, prediction in zip(test_datas, predictions): for test_data, prediction in zip(test_datas, predictions):
...@@ -170,6 +166,58 @@ class ArgumentGroup(object): ...@@ -170,6 +166,58 @@ class ArgumentGroup(object):
help=help + ' Default: %(default)s.', help=help + ' Default: %(default)s.',
**kwargs) **kwargs)
class ArgConfig(object):
    """
    Command-line argument configuration for SimNet.

    Builds an ``argparse.ArgumentParser`` with the standard option groups
    (model, training, logging, data, run_type) plus an initially empty
    "customize" group that callers can extend through ``add_arg`` before
    parsing with ``build_conf``.
    """

    def __init__(self):
        """
        Create the parser and register every built-in option.
        """
        parser = argparse.ArgumentParser()

        # Model configuration and checkpoint locations.
        model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
        model_g.add_arg("config_path", str, None, "Path to the json file for SimNet model config.")
        model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
        model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints")
        model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise")

        # Training schedule.
        train_g = ArgumentGroup(parser, "training", "training options.")
        train_g.add_arg("epoch", int, 10, "Number of epoches for training.")
        train_g.add_arg("save_steps", int, 200, "The steps interval to save checkpoints.")
        train_g.add_arg("validation_steps", int, 100, "The steps interval to evaluate model performance.")

        # Logging and result output.
        log_g = ArgumentGroup(parser, "logging", "logging related")
        log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
        log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.")
        log_g.add_arg("test_result_path", str, "test_result", "Directory path to test result.")
        log_g.add_arg("infer_result_path", str, "infer_result", "Directory path to infer result.")

        # Input data locations and batching.
        data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
        data_g.add_arg("train_data_dir", str, None, "Directory path to training data.")
        data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.")
        data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
        data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
        data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
        data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.")

        # Which phase(s) to run and on which device.
        run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
        run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
        run_type_g.add_arg("task_name", str, None, "The name of task to perform text similarity matching.")
        run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
        run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
        run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
        run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
        run_type_g.add_arg("compute_accuracy", bool, False, "Whether to compute accuracy.")
        run_type_g.add_arg("lamda", float, 0.91, "When task_mode is pairwise, lamda is the threshold for calculating the accuracy.")

        # Empty group that callers populate later through add_arg().
        custom_g = ArgumentGroup(parser, "customize", "customized options.")
        self.custom_g = custom_g

        # Plain flag registered directly on the parser, not through a group.
        parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')

        self.parser = parser

    def add_arg(self, name, dtype, default, descrip):
        """
        Register an extra option in the "customize" group.

        Args:
            name: option name, forwarded to ``ArgumentGroup.add_arg``.
            dtype: type of the option value.
            default: value used when the option is not given.
            descrip: help text for the option.
        """
        self.custom_g.add_arg(name, dtype, default, descrip)

    def build_conf(self, args=None):
        """
        Parse the arguments and return the resulting namespace.

        Args:
            args: optional list of argument strings. When None (the
                default) argparse reads the process command line, which
                is what the original zero-argument call did.

        Returns:
            The ``argparse.Namespace`` produced by the parser.
        """
        return self.parser.parse_args(args)
def print_arguments(args): def print_arguments(args):
""" """
...@@ -314,3 +362,4 @@ def init_checkpoint(exe, init_checkpoint_path, main_program): ...@@ -314,3 +362,4 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
main_program=main_program, main_program=main_program,
predicate=existed_persitables) predicate=existed_persitables)
print("Load model from {}".format(init_checkpoint_path)) print("Load model from {}".format(init_checkpoint_path))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册