Commit e19f4bc7 authored by Dilyar, committed by Yibing Liu

Fix some problems of simnet (#3433)

* update

* update

* Update README.md

* Update run.sh
Parent 107d4e79
@@ -49,7 +49,7 @@ class BOW(object):
right_soft = softsign_layer.ops(right_pool)
# matching layer
if self.task_mode == "pairwise":
bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc")
bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
left_bow = bow_layer.ops(left_soft)
right_bow = bow_layer.ops(right_soft)
cos_sim_layer = layers.CosSimLayer()
@@ -58,7 +58,7 @@ class BOW(object):
else:
concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_soft, right_soft])
bow_layer = layers.FCLayer(self.bow_dim, "relu", "fc")
bow_layer = layers.FCLayer(self.bow_dim, None, "fc")
concat_fc = bow_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc)
......
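A plausible rationale for the pairwise branches switching `FCLayer` from a `"relu"` activation to `None` throughout this commit (an inference from the diff; the commit message does not state it): ReLU makes both projected vectors non-negative, confining their cosine similarity to [0, 1], whereas a linear projection lets the score use the full range

$$\cos(u, v) = \frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert} \in [-1, 1],$$

which matches the `(score + 1) / 2` rescaling applied to pairwise predictions in `run_classifier.py` below.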
@@ -43,23 +43,23 @@ class CNN(object):
left_emb = emb_layer.ops(left)
right_emb = emb_layer.ops(right)
# Presentation context
cnn_layer = layers.SequenceConvPoolLayer(self.filter_size,
self.num_filters, "conv")
cnn_layer = layers.SequenceConvPoolLayer(
self.filter_size, self.num_filters, "conv")
left_cnn = cnn_layer.ops(left_emb)
right_cnn = cnn_layer.ops(right_emb)
# matching layer
if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
left_relu = relu_layer.ops(left_cnn)
right_relu = relu_layer.ops(right_cnn)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_fc = fc_layer.ops(left_cnn)
right_fc = fc_layer.ops(right_cnn)
cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu)
return left_relu, pred
pred = cos_sim_layer.ops(left_fc, right_fc)
return left_fc, pred
else:
concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_cnn, right_cnn])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
concat_fc = relu_layer.ops(concat)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc)
return left_cnn, pred
@@ -50,17 +50,17 @@ class GRU(object):
right_last = last_layer.ops(right_gru)
# matching layer
if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
left_relu = relu_layer.ops(left_last)
right_relu = relu_layer.ops(right_last)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_fc = fc_layer.ops(left_last)
right_fc = fc_layer.ops(right_last)
cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu)
return left_relu, pred
pred = cos_sim_layer.ops(left_fc, right_fc)
return left_fc, pred
else:
concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_last, right_last])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
concat_fc = relu_layer.ops(concat)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc)
return left_last, pred
@@ -49,17 +49,17 @@ class LSTM(object):
right_last = last_layer.ops(right_lstm)
# matching layer
if self.task_mode == "pairwise":
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
left_relu = relu_layer.ops(left_last)
right_relu = relu_layer.ops(right_last)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
left_fc = fc_layer.ops(left_last)
right_fc = fc_layer.ops(right_last)
cos_sim_layer = layers.CosSimLayer()
pred = cos_sim_layer.ops(left_relu, right_relu)
return left_relu, pred
pred = cos_sim_layer.ops(left_fc, right_fc)
return left_fc, pred
else:
concat_layer = layers.ConcatLayer(1)
concat = concat_layer.ops([left_last, right_last])
relu_layer = layers.FCLayer(self.hidden_dim, "relu", "relu")
concat_fc = relu_layer.ops(concat)
fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
concat_fc = fc_layer.ops(concat)
softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
pred = softmax_layer.ops(concat_fc)
return left_last, pred
@@ -6,10 +6,17 @@
Trained on Baidu's massive search data, our SimNet-BOW-Pairwise semantic matching model improves AUC by more than 5% over literal-similarity methods in real FAQ question-answering scenarios. We evaluated it on Baidu's in-house test sets (covering chit-chat, customer service, and other domains) and on the LCQMC semantic matching dataset; results are shown below. LCQMC uses Accuracy as its metric, and since the pairwise model outputs a similarity score, we binarize it at a threshold of 0.958. Our model reaches an accuracy of 0.7532, versus 0.737 for the CBOW baseline of comparable network complexity.
| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM | LCQMC |
|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| | AUC | AUC | AUC | PN ratio | Accuracy |
| BOW_Pairwise | 0.6767 | 0.7329 | 0.7650 | 1.5630 | 0.7532 |
| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM |
|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
| | AUC | AUC | AUC | PN ratio |
| BOW_Pairwise | 0.6767 | 0.7329 | 0.7650 | 1.5630 |
#### Test Set Description
| Dataset | Source | Domain |
|:-----------:|:-------------:|:-------------:|
| Baidu Zhidao | Baidu Zhidao questions | daily life |
| ECOM | commercial queries | finance |
| QQSIM | chit-chat dialogues | daily life |
| UNICOM | China Unicom customer service | customer service |
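Because the pairwise model emits a similarity score rather than a class label, the LCQMC accuracy above is obtained by thresholding that score. A minimal sketch of the computation (function and variable names are illustrative, not from the repo):

```python
import numpy as np

def pairwise_accuracy(sim_scores, labels, threshold=0.958):
    """Binarize pairwise similarity scores at `threshold` and compare
    them against gold 0/1 labels, i.e. LCQMC-style accuracy."""
    preds = (np.asarray(sim_scores) >= threshold).astype(int)
    return float(np.mean(preds == np.asarray(labels)))

# pairwise_accuracy([0.97, 0.41, 0.99], [1, 0, 1]) -> 1.0
```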
## Quick Start
#### Dependencies
This project depends on PaddlePaddle Fluid 1.3.1; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
@@ -24,24 +24,14 @@ cd models/PaddleNLP/similarity_net
#### Data Preparation
Download the preprocessed data. After the command finishes, the data directory will contain sample training, validation, and test sets, together with the corresponding term-to-id vocabulary (term2id.dict).
```shell
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
sh download_data.sh
```
#### Model Preparation
We have open-sourced a ```pairwise``` model (trained with the BOW architecture) on large-scale data. Two download options are provided; the model is saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
##### Option 1: via the PaddleHub command-line tool (PaddleHub [installation](https://github.com/PaddlePaddle/PaddleHub))
```shell
mkdir model_files
hub download simnet_bow_pairwise --output_path ./
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
```
##### Option 2: direct download
We have open-sourced a ```pairwise``` model (trained with the BOW architecture) on large-scale data. Users can download the pretrained model with the command below; it will be saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
```shell
mkdir model_files
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
sh download_pretrained_model.sh
```
#### Evaluation
We have released our in-house test sets, covering Baidu Zhidao, ECOM, QQSIM, and UNICOM. Using the pretrained model above, users can enter the evaluate directory and run the commands below in turn to obtain evaluation results on each test set.
```shell
@@ -162,6 +159,7 @@ python run_classifier.py \
--task_mode ${TASK_MODE} # training mode, pairwise or pointwise; must match the config file.
--compute_accuracy False \ # whether to compute accuracy
--lamda 0.91 \ # threshold for computing accuracy in pairwise mode
--init_checkpoint "" # path of a pretrained model to load
```
### How to Build Your Own Model
Users can assemble a custom model to suit their needs; the basic approach is outlined below:
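The README's detailed steps are truncated in this view. As a hedged illustration, a custom network can follow the same pattern as the BOW/CNN/GRU/LSTM classes in this diff; only `FCLayer`, `CosSimLayer`, `ConcatLayer`, and the `predict` contract come from the diff, while the class body and the `hidden_dim` config key are assumptions:

```python
import models.matching.paddle_layers as layers

class MyNet(object):
    """Minimal custom matching network following the repo's pattern."""

    def __init__(self, conf_dict):
        self.task_mode = conf_dict["task_mode"]
        self.hidden_dim = conf_dict["net"]["hidden_dim"]  # assumed config key

    def predict(self, left, right):
        # A real net would first embed the `left`/`right` token ids,
        # as the BOW/CNN classes above do.
        # Shared (weight-tied) linear projection for both sides.
        fc_layer = layers.FCLayer(self.hidden_dim, None, "fc")
        left_fc = fc_layer.ops(left)
        right_fc = fc_layer.ops(right)
        if self.task_mode == "pairwise":
            # Cosine similarity as the matching score.
            pred = layers.CosSimLayer().ops(left_fc, right_fc)
            return left_fc, pred
        # Pointwise: classify over the concatenated representations.
        concat = layers.ConcatLayer(1).ops([left_fc, right_fc])
        softmax_layer = layers.FCLayer(2, "softmax", "cos_sim")
        return left_fc, softmax_layer.ops(concat)
```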
......
@@ -34,14 +34,12 @@ class SimNetConfig(object):
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing simnet model config file '%s'" %
config_path)
raise IOError("Error in parsing simnet model config file '%s'" % config_path)
else:
if config_dict["task_mode"] != self.task_mode:
raise ValueError(
"the config '{}' does not match the task_mode '{}'".format(
self.config_path, self.task_mode))
"the config '{}' does not match the task_mode '{}'".format(self.config_path, self.task_mode))
return config_dict
def __getitem__(self, key):
......
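For reference, `SimNetConfig` wraps a JSON file along these lines. Only `task_mode`, `net.module_name`, `net.class_name`, and `model_path` are keys actually referenced in this diff; the values shown are illustrative:

```python
# Illustrative contents of a SimNet JSON config (shown as a Python dict).
example_conf_dict = {
    "task_mode": "pairwise",       # must match the --task_mode flag
    "model_path": "bow_pairwise",  # checkpoint subdirectory used when saving
    "net": {
        "module_name": "bow",      # module looked up under ../models/matching
        "class_name": "BOW",       # class whose predict() builds the network
    },
    # "dict_size" is filled in at runtime from the vocabulary.
}
```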
#get data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
@@ -4,13 +4,7 @@ model_files_path="./model_files"
#get pretrained_bow_pairwise_model
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz
if [ ! -d $model_files_path ]; then
mkdir $model_files_path
mkdir $model_files_path
fi
tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path
rm simnet_bow-pairwise-1.0.0.tar.gz
#get data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
rm simnet_bow-pairwise-1.0.0.tar.gz
\ No newline at end of file
@@ -21,7 +21,7 @@ INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
train() {
python run_classifier.py \
--task_name ${TASK_NAME} \
--use_cuda false \
--use_cuda False \
--do_train True \
--do_valid True \
--do_test True \
@@ -34,12 +34,13 @@ train() {
--output_dir ${CKPT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--epoch 10 \
--save_steps 1000 \
--validation_steps 100 \
--epoch 40 \
--save_steps 2000 \
--validation_steps 200 \
--compute_accuracy False \
--lamda 0.958 \
--task_mode ${TASK_MODE}
--task_mode ${TASK_MODE} \
--init_checkpoint ""
}
#run_evaluate
evaluate() {
......
@@ -25,76 +25,67 @@ import argparse
import multiprocessing
import sys
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
reload(sys)
sys.setdefaultencoding(defaultencoding)
sys.path.append("..")
import paddle
import paddle.fluid as fluid
import numpy as np
import codecs
import config
import utils
import reader
import models.matching.paddle_layers as layers
import codecs
from utils import ArgConfig
import logging
parser = argparse.ArgumentParser(__doc__)
model_g = utils.ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("config_path", str, None,
"Path to the json file for SimNet model config.")
model_g.add_arg("init_checkpoint", str, None,
"Init checkpoint to resume training from.")
model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints")
model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise")
train_g = utils.ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 10, "Number of epochs for training.")
train_g.add_arg("save_steps", int, 200,
"The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 100,
"The steps interval to evaluate model performance.")
log_g = utils.ArgumentGroup(parser, "logging", "logging related")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.")
log_g.add_arg("test_result_path", str, "test_result",
"Directory path to test result.")
log_g.add_arg("infer_result_path", str, "infer_result",
"Directory path to infer result.")
data_g = utils.ArgumentGroup(
parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_data_dir", str, None, "Directory path to training data.")
data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.")
data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("batch_size", int, 32,
"Total examples' number in batch for training.")
run_type_g = utils.ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
run_type_g.add_arg("task_name", str, None,
"The name of the task to perform.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
run_type_g.add_arg("compute_accuracy", bool, False,
"Whether to compute accuracy.")
run_type_g.add_arg(
"lamda", float, 0.91,
"When task_mode is pairwise, lamda is the threshold for calculating the accuracy."
)
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args()
def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
"""
Create Model for simnet
"""
if is_inference:
inf_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1,1], [-1,1]),
dtypes=('int64', 'int64'),
lod_levels=(1, 1),
name=pyreader_name,
use_double_buffer=False)
left, pos_right = fluid.layers.read_file(inf_pyreader)
return inf_pyreader, left, pos_right
else:
if is_pointwise:
pointwise_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1,1], [-1,1], [-1,1]),
dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 0),
name=pyreader_name,
use_double_buffer=False)
left, right, label = fluid.layers.read_file(pointwise_pyreader)
return pointwise_pyreader, left, right, label
else:
pairwise_pyreader = fluid.layers.py_reader(
capacity=16,
shapes=([-1,1], [-1,1], [-1,1]),
dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 1),
name=pyreader_name,
use_double_buffer=False)
left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
return pairwise_pyreader, left, pos_right, neg_right
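A condensed sketch of the py_reader life cycle these readers follow (`exe`, `train_program`, `simnet_process`, and `avg_cost` are defined in `train()` below; the decorate/start/EOFException pattern mirrors the training and evaluation loops in this file):

```python
# Sketch: wire a paddle batch reader into the py_reader, then drain it.
train_pyreader, left, pos_right, neg_right = create_model(
    args, pyreader_name="train_reader")
batch_data = paddle.batch(
    simnet_process.get_reader("train"), args.batch_size, drop_last=False)
train_pyreader.decorate_paddle_reader(batch_data)
train_pyreader.start()
while True:
    try:
        # Each run() pulls one batch from the py_reader queue.
        avg_loss = exe.run(program=train_program, fetch_list=[avg_cost.name])
    except fluid.core.EOFException:
        # Reader exhausted: reset it before the next pass.
        train_pyreader.reset()
        break
```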
def train(conf_dict, args):
"""
train process
@@ -129,85 +120,79 @@ def train(conf_dict, args):
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
startup_prog = fluid.Program()
train_program = fluid.Program()
simnet_process = reader.SimNetProcessor(args, vocab)
if args.task_mode == "pairwise":
# Build network
left = data.ops(name="left", shape=[1], dtype="int64", lod_level=1)
pos_right = data.ops(name="right",
shape=[1],
dtype="int64",
lod_level=1)
neg_right = data.ops(name="neg_right",
shape=[1],
dtype="int64",
lod_level=1)
left_feat, pos_score = net.predict(left, pos_right)
# Get Feeder and Reader
train_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, pos_right.name, neg_right.name])
train_reader = simnet_process.get_reader("train")
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, pos_right, neg_right = create_model(
args,
pyreader_name='train_reader')
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
_, neg_score = net.predict(left, neg_right)
avg_cost = loss.compute(pos_score, neg_score)
avg_cost.persistable = True
optimizer.ops(avg_cost)
# Get Reader
get_train_examples = simnet_process.get_reader("train")
if args.do_valid:
valid_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, pos_right.name])
valid_reader = simnet_process.get_reader("valid")
pred = pos_score
# Save Infer model
infer_program = fluid.default_main_program().clone(for_test=True)
_, neg_score = net.predict(left, neg_right)
avg_cost = loss.compute(pos_score, neg_score)
avg_cost.persistable = True
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(args, pyreader_name='test_reader', is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else:
# Build network
left = data.ops(name="left", shape=[1], dtype="int64", lod_level=1)
right = data.ops(name="right", shape=[1], dtype="int64", lod_level=1)
label = data.ops(name="label", shape=[1], dtype="int64", lod_level=0)
left_feat, pred = net.predict(left, right)
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, right, label = create_model(
args,
pyreader_name='train_reader',
is_pointwise=True)
left_feat, pred = net.predict(left, right)
avg_cost = loss.compute(pred, label)
avg_cost.persistable = True
optimizer.ops(avg_cost)
# Get Feeder and Reader
train_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, right.name, label.name])
train_reader = simnet_process.get_reader("train")
get_train_examples = simnet_process.get_reader("train")
if args.do_valid:
valid_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, right.name])
valid_reader = simnet_process.get_reader("valid")
# Save Infer model
infer_program = fluid.default_main_program().clone(for_test=True)
avg_cost = loss.compute(pred, label)
avg_cost.persistable = True
# operate Optimization
optimizer.ops(avg_cost)
executor = fluid.Executor(place)
executor.run(fluid.default_startup_program())
if args.init_checkpoint is not None:
utils.init_checkpoint(executor, args.init_checkpoint,
fluid.default_startup_program())
# Get and run executor
parallel_executor = fluid.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=avg_cost.name,
main_program=fluid.default_main_program())
# Get device number
device_count = parallel_executor.device_count
logging.info("device count: %d" % device_count)
def valid_and_test(program, feeder, reader, process, mode="test"):
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(args, pyreader_name='test_reader', is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
if args.init_checkpoint != "":
utils.init_checkpoint(exe, args.init_checkpoint,
startup_prog)
def valid_and_test(test_program, test_pyreader, get_valid_examples, process, mode, exe, fetch_list):
"""
return auc and acc
"""
# Get Batch Data
batch_data = paddle.batch(reader, args.batch_size, drop_last=False)
batch_data = paddle.batch(get_valid_examples, args.batch_size, drop_last=False)
test_pyreader.decorate_paddle_reader(batch_data)
test_pyreader.start()
pred_list = []
for data in batch_data():
_pred = executor.run(program=program,
feed=feeder.feed(data),
fetch_list=[pred.name])
pred_list += list(_pred)
while True:
try:
_pred = exe.run(program=test_program, fetch_list=[pred.name])
pred_list += list(_pred)
except fluid.core.EOFException:
test_pyreader.reset()
break
pred_list = np.vstack(pred_list)
if mode == "test":
label_list = process.get_test_label()
@@ -232,66 +217,85 @@ def train(conf_dict, args):
# set global step
global_step = 0
ce_info = []
train_exe = exe
for epoch_id in range(args.epoch):
losses = []
# Get batch data iterator
train_batch_data = paddle.batch(
paddle.reader.shuffle(
train_reader, buf_size=10000),
get_train_examples, buf_size=10000),
args.batch_size,
drop_last=False)
train_pyreader.decorate_paddle_reader(train_batch_data)
train_pyreader.start()
exe.run(startup_prog)
losses = []
start_time = time.time()
for iter, data in enumerate(train_batch_data()):
if len(data) < device_count:
logging.info(
"the size of batch data is less than device_count(%d)" %
device_count)
continue
global_step += 1
avg_loss = parallel_executor.run([avg_cost.name],
feed=train_feeder.feed(data))
if args.do_valid and global_step % args.validation_steps == 0:
valid_result = valid_and_test(
program=infer_program,
feeder=valid_feeder,
reader=valid_reader,
process=simnet_process,
mode="valid")
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
"global_steps: %d, valid_auc: %f, valid_acc: %f" %
(global_step, valid_auc, valid_acc))
else:
valid_auc = valid_result
logging.info("global_steps: %d, valid_auc: %f" %
(global_step, valid_auc))
if global_step % args.save_steps == 0:
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
if args.task_mode == "pairwise":
feed_var_names = [left.name, pos_right.name]
target_vars = [left_feat, pos_score]
else:
feed_var_names = [
left.name,
right.name,
]
target_vars = [left_feat, pred]
fluid.io.save_inference_model(model_path, feed_var_names,
target_vars, executor,
infer_program)
logging.info("saving infer model in %s" % model_path)
losses.append(np.mean(avg_loss[0]))
while True:
try:
global_step += 1
fetch_list = [avg_cost.name]
avg_loss = train_exe.run(program=train_program, fetch_list=fetch_list)
if args.do_valid and global_step % args.validation_steps == 0:
get_valid_examples = simnet_process.get_reader("valid")
valid_result = valid_and_test(test_prog, test_pyreader, get_valid_examples, simnet_process, "valid", exe, [pred.name])
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
"global_steps: %d, valid_auc: %f, valid_acc: %f" %
(global_step, valid_auc, valid_acc))
else:
valid_auc = valid_result
logging.info("global_steps: %d, valid_auc: %f" %
(global_step, valid_auc))
if global_step % args.save_steps == 0:
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
if args.task_mode == "pairwise":
feed_var_names = [left.name, pos_right.name]
target_vars = [left_feat, pos_score]
else:
feed_var_names = [
left.name,
right.name,
]
target_vars = [left_feat, pred]
fluid.io.save_inference_model(model_path, feed_var_names,
target_vars, exe,
test_prog)
logging.info("saving infer model in %s" % model_path)
losses.append(np.mean(avg_loss[0]))
except fluid.core.EOFException:
train_pyreader.reset()
break
end_time = time.time()
logging.info("epoch: %d, loss: %f, used time: %d sec" %
(epoch_id, np.mean(losses), end_time - start_time))
ce_info.append([np.mean(losses), end_time - start_time])
#final save
logging.info("the final step is %s" % global_step)
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
if args.task_mode == "pairwise":
feed_var_names = [left.name, pos_right.name]
target_vars = [left_feat, pos_score]
else:
feed_var_names = [
left.name,
right.name,
]
target_vars = [left_feat, pred]
fluid.io.save_inference_model(model_path, feed_var_names,
target_vars, exe,
test_prog)
logging.info("saving infer model in %s" % model_path)
if args.enable_ce:
card_num = get_cards()
ce_loss = 0
@@ -309,20 +313,11 @@
if args.do_test:
if args.task_mode == "pairwise":
# Get Feeder and Reader
test_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, pos_right.name])
test_reader = simnet_process.get_reader("test")
get_test_examples = simnet_process.get_reader("test")
else:
# Get Feeder and Reader
test_feeder = fluid.DataFeeder(
place=place, feed_list=[left.name, right.name])
test_reader = simnet_process.get_reader("test")
test_result = valid_and_test(
program=infer_program,
feeder=test_feeder,
reader=test_reader,
process=simnet_process,
mode="test")
get_test_examples = simnet_process.get_reader("test")
test_result = valid_and_test(test_prog, test_pyreader, get_test_examples, simnet_process, "test", exe, [pred.name])
if args.compute_accuracy:
test_auc, test_acc = test_result
logging.info("AUC of test is %f, Accuracy of test is %f" %
@@ -334,51 +329,83 @@
def test(conf_dict, args):
"""
run predict
Evaluation Function
"""
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
# load auc method
startup_prog = fluid.Program()
get_test_examples = simnet_process.get_reader("test")
batch_data = paddle.batch(get_test_examples, args.batch_size, drop_last=False)
test_prog = fluid.Program()
conf_dict['dict_size'] = len(vocab)
net = utils.import_class("../models/matching",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
metric = fluid.metrics.Auc(name="auc")
with codecs.open("predictions.txt", "w", "utf-8") as predictions_file:
# Get model path
model_path = args.init_checkpoint
# Get device
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
# Get executor
executor = fluid.Executor(place=place)
# Load model
program, feed_var_names, fetch_targets = fluid.io.load_inference_model(
model_path, executor)
if args.task_mode == "pairwise":
# Get Feeder and Reader
feeder = fluid.DataFeeder(
place=place, feed_list=feed_var_names, program=program)
test_reader = simnet_process.get_reader("test")
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(
args,
pyreader_name='test_reader',
is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else:
# Get Feeder and Reader
feeder = fluid.DataFeeder(
place=place, feed_list=feed_var_names, program=program)
test_reader = simnet_process.get_reader("test")
# Get batch data iterator
batch_data = paddle.batch(test_reader, args.batch_size, drop_last=False)
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(
args,
pyreader_name='test_reader',
is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
exe.run(startup_prog)
utils.init_checkpoint(
exe,
args.init_checkpoint,
main_program=test_prog)
test_exe = exe
test_pyreader.decorate_paddle_reader(batch_data)
logging.info("start test process ...")
test_pyreader.start()
pred_list = []
for iter, data in enumerate(batch_data()):
output = executor.run(program,
feed=feeder.feed(data),
fetch_list=fetch_targets)
if args.task_mode == "pairwise":
pred_list += list(map(lambda item: float(item[0]), output[1]))
predictions_file.write("\n".join(
map(lambda item: str((item[0] + 1) / 2), output[1])) + "\n")
else:
pred_list += map(lambda item: item, output[1])
predictions_file.write("\n".join(
map(lambda item: str(np.argmax(item)), output[1])) + "\n")
fetch_list = [pred.name]
output = []
while True:
try:
output = test_exe.run(program=test_prog, fetch_list=fetch_list)
if args.task_mode == "pairwise":
pred_list += list(map(lambda item: float(item[0]), output[0]))
predictions_file.write("\n".join(
map(lambda item: str((item[0] + 1) / 2), output[0])) + "\n")
else:
pred_list += map(lambda item: item, output[0])
predictions_file.write("\n".join(
map(lambda item: str(np.argmax(item)), output[0])) + "\n")
except fluid.core.EOFException:
test_pyreader.reset()
break
if args.task_mode == "pairwise":
pred_list = np.array(pred_list).reshape((-1, 1))
pred_list = (pred_list + 1) / 2
@@ -403,47 +430,72 @@ def test(conf_dict, args):
os.path.join(os.getcwd(), args.test_result_path))
def infer(args):
def infer(conf_dict, args):
"""
run predict
"""
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
# Get model path
model_path = args.init_checkpoint
# Get device
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
# Get executor
executor = fluid.Executor(place=place)
# Load model
program, feed_var_names, fetch_targets = fluid.io.load_inference_model(
model_path, executor)
exe = fluid.Executor(place)
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
startup_prog = fluid.Program()
get_infer_examples = simnet_process.get_infer_reader
batch_data = paddle.batch(get_infer_examples, args.batch_size, drop_last=False)
test_prog = fluid.Program()
conf_dict['dict_size'] = len(vocab)
net = utils.import_class("../models/matching",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
if args.task_mode == "pairwise":
# Get Feeder and Reader
infer_feeder = fluid.DataFeeder(
place=place, feed_list=feed_var_names, program=program)
infer_reader = simnet_process.get_infer_reader
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, pos_right = create_model(args, pyreader_name='infer_reader', is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else:
# Get Feeder and Reader
infer_feeder = fluid.DataFeeder(
place=place, feed_list=feed_var_names, program=program)
infer_reader = simnet_process.get_infer_reader
# Get batch data iterator
batch_data = paddle.batch(infer_reader, args.batch_size, drop_last=False)
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, right = create_model(args, pyreader_name='infer_reader', is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
exe.run(startup_prog)
utils.init_checkpoint(
exe,
args.init_checkpoint,
main_program=test_prog)
test_exe = exe
infer_pyreader.decorate_sample_list_generator(batch_data)
logging.info("start infer process ...")
preds_list = []
for iter, data in enumerate(batch_data()):
output = executor.run(program,
feed=infer_feeder.feed(data),
fetch_list=fetch_targets)
if args.task_mode == "pairwise":
preds_list += list(
map(lambda item: str((item[0] + 1) / 2), output[1]))
else:
preds_list += map(lambda item: str(np.argmax(item)), output[1])
fetch_list = [pred.name]
output = []
infer_pyreader.start()
while True:
try:
output = test_exe.run(program=test_prog, fetch_list=fetch_list)
if args.task_mode == "pairwise":
preds_list += list(
map(lambda item: str((item[0] + 1) / 2), output[0]))
else:
preds_list += map(lambda item: str(np.argmax(item)), output[0])
except fluid.core.EOFException:
infer_pyreader.reset()
break
with codecs.open(args.infer_result_path, "w", "utf-8") as infer_file:
for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
infer_file.write(_data + "\t" + _pred + "\n")
@@ -458,23 +510,11 @@ def get_cards():
num = len(cards.split(","))
return num
if __name__ == "__main__":
def main(conf_dict, args):
"""
main
"""
if args.do_train:
train(conf_dict, args)
elif args.do_test:
test(conf_dict, args)
elif args.do_infer:
infer(args)
else:
raise ValueError(
"one of do_train, do_test, or do_infer must be True")
args = ArgConfig()
args = args.build_conf()
if __name__ == "__main__":
utils.print_arguments(args)
try:
if fluid.is_compiled_with_cuda() != True and args.use_cuda == True:
@@ -487,4 +527,12 @@ if __name__ == "__main__":
pass
utils.init_log("./log/TextSimilarityNet")
conf_dict = config.SimNetConfig(args)
main(conf_dict, args)
if args.do_train:
train(conf_dict, args)
elif args.do_test:
test(conf_dict, args)
elif args.do_infer:
infer(conf_dict, args)
else:
raise ValueError(
"one of do_train, do_test, or do_infer must be True")
\ No newline at end of file
@@ -15,7 +15,7 @@
"""
SimNet utilities.
"""
import argparse
import time
import sys
import re
@@ -26,20 +26,17 @@ import numpy as np
import logging
import logging.handlers
import paddle.fluid as fluid
import io
"""
******functions for file processing******
"""
def load_vocab(file_path):
"""
load the given vocabulary
"""
vocab = {}
if six.PY3:
f = open(file_path, "r", encoding="utf-8")
else:
f = open(file_path, "r")
f = io.open(file_path, "r", encoding="utf-8")
for line in f:
items = line.strip("\n").split("\t")
if items[0] not in vocab:
@@ -61,8 +58,7 @@ def get_result_file(args):
"""
with codecs.open(args.test_data_dir, "r", "utf-8") as test_file:
with codecs.open("predictions.txt", "r", "utf-8") as predictions_file:
with codecs.open(args.test_result_path, "w",
"utf-8") as test_result_file:
with codecs.open(args.test_result_path, "w", "utf-8") as test_result_file:
test_datas = [line.strip("\n") for line in test_file]
predictions = [line.strip("\n") for line in predictions_file]
for test_data, prediction in zip(test_datas, predictions):
@@ -170,6 +166,58 @@ class ArgumentGroup(object):
help=help + ' Default: %(default)s.',
**kwargs)
class ArgConfig(object):
def __init__(self):
parser = argparse.ArgumentParser()
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("config_path", str, None, "Path to the json file for SimNet model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints")
model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 10, "Number of epochs for training.")
train_g.add_arg("save_steps", int, 200, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 100, "The steps interval to evaluate model performance.")
log_g = ArgumentGroup(parser, "logging", "logging related")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.")
log_g.add_arg("test_result_path", str, "test_result", "Directory path to test result.")
log_g.add_arg("infer_result_path", str, "infer_result", "Directory path to infer result.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_data_dir", str, None, "Directory path to training data.")
data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.")
data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
run_type_g.add_arg("task_name", str, None, "The name of the task to perform.")
run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
run_type_g.add_arg("compute_accuracy", bool, False, "Whether to compute accuracy.")
run_type_g.add_arg("lamda", float, 0.91, "When task_mode is pairwise, lamda is the threshold for calculating the accuracy.")
custom_g = ArgumentGroup(parser, "customize", "customized options.")
self.custom_g = custom_g
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
self.parser = parser
def add_arg(self, name, dtype, default, descrip):
self.custom_g.add_arg(name, dtype, default, descrip)
def build_conf(self):
return self.parser.parse_args()
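Typical usage, mirroring run_classifier.py in this commit (the custom "margin" argument is purely illustrative):

```python
# Build the standard argument set, optionally extend it, then parse.
arg_conf = ArgConfig()
arg_conf.add_arg("margin", float, 0.1, "Illustrative custom hyperparameter.")
args = arg_conf.build_conf()
print_arguments(args)
```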
def print_arguments(args):
"""
@@ -302,7 +350,7 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
"""
assert os.path.exists(
init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
@@ -314,3 +362,4 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
main_program=main_program,
predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))