Commit 27730332 authored by 一米半, committed by Yibing Liu

fix double softmax, fix test function and change default config of pairwise (#2303)

Parent 1229fb14
@@ -3,6 +3,7 @@ softmax loss
 """
 import sys
+import paddle.fluid as fluid
 sys.path.append("../../../")
 import models.matching.paddle_layers as layers
@@ -23,8 +24,7 @@ class SoftmaxCrossEntropyLoss(object):
         """
         compute loss
         """
-        softmax_with_cross_entropy = layers.SoftmaxWithCrossEntropyLayer()
         reduce_mean = layers.ReduceMeanLayer()
-        cost = softmax_with_cross_entropy.ops(input, label)
+        cost = fluid.layers.cross_entropy(input=input, label=label)
        avg_cost = reduce_mean.ops(cost)
         return avg_cost
@@ -49,10 +49,10 @@ class MMDNN(object):
             input=input,
             size=[self.vocab_size, self.emb_size],
             padding_idx=(0 if zero_pad else None),
-            param_attr=fluid.ParamAttr(name="word_embedding",
-                                       initializer=fluid.initializer.Xavier()))
+            param_attr=fluid.ParamAttr(
+                name="word_embedding", initializer=fluid.initializer.Xavier()))
         if scale:
-            emb = emb * (self.emb_size ** 0.5)
+            emb = emb * (self.emb_size**0.5)
         return emb

     def bi_dynamic_lstm(self, input, hidden_size):
@@ -64,7 +64,9 @@ class MMDNN(object):
             param_attr=fluid.ParamAttr(name="fw_fc.w"),
             bias_attr=False)
         forward, _ = fluid.layers.dynamic_lstm(
-            input=fw_in_proj, size=4 * hidden_size, is_reverse=False,
+            input=fw_in_proj,
+            size=4 * hidden_size,
+            is_reverse=False,
             param_attr=fluid.ParamAttr(name="forward_lstm.w"),
             bias_attr=fluid.ParamAttr(name="forward_lstm.b"))
@@ -73,7 +75,9 @@ class MMDNN(object):
             param_attr=fluid.ParamAttr(name="rv_fc.w"),
             bias_attr=False)
         reverse, _ = fluid.layers.dynamic_lstm(
-            input=rv_in_proj, size=4 * hidden_size, is_reverse=True,
+            input=rv_in_proj,
+            size=4 * hidden_size,
+            is_reverse=True,
             param_attr=fluid.ParamAttr(name="reverse_lstm.w"),
             bias_attr=fluid.ParamAttr(name="reverse_lstm.b"))
         return [forward, reverse]
@@ -96,7 +100,7 @@ class MMDNN(object):
         if mask is not None:
             cross_mask = fluid.layers.stack(x=[mask] * self.kernel_size, axis=1)
-            conv = cross_mask * conv + (1 - cross_mask) * (-2 ** 32 + 1)
+            conv = cross_mask * conv + (1 - cross_mask) * (-2**32 + 1)
         # valid padding
         pool = fluid.layers.pool2d(
             input=conv,
@@ -157,6 +161,8 @@ class MMDNN(object):
             act="tanh",
             size=self.hidden_size)
-        pred = fluid.layers.fc(input=relu_hid1, size=self.out_size)
+        pred = fluid.layers.fc(input=relu_hid1,
+                               size=self.out_size,
+                               act="softmax")
         return left_seq_encoder, pred
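The `act="softmax"` added to the final fc is the other half of the double-softmax fix: the network now normalizes its scores exactly once, and the loss above no longer re-normalizes them. As for the reformatted masking line, `-2**32 + 1` is simply a very large negative sentinel written into padded positions so they can never win the subsequent max-pool. A toy NumPy illustration (assumed shapes, not the repo's code):

```python
import numpy as np

# Toy cross-similarity map: the second column is padding.
conv = np.array([[0.7, 0.9],
                 [0.2, 0.8]])
mask = np.array([[1.0, 0.0],
                 [1.0, 0.0]])  # 0 marks padded positions

NEG_SENTINEL = -2**32 + 1
masked = mask * conv + (1 - mask) * NEG_SENTINEL
print(masked.max(axis=1))  # [0.7, 0.2] -- padding never survives max-pooling
```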
@@ -3,13 +3,13 @@
 ### Task description
 Short Text Semantic Matching (SimilarityNet, SimNet) is a framework for computing the similarity of short texts: given two input texts, it returns a similarity score. SimNet is widely deployed across Baidu products; it covers core network structures including BOW, CNN, RNN, and MMDNN, and provides a training and prediction framework for semantic similarity computation. It suits scenarios such as information retrieval, news recommendation, and intelligent customer service, helping enterprises solve semantic matching problems. An online demo is available at [Baidu AI Open Platform - Short Text Similarity](https://ai.baidu.com/tech/nlp_basic/simnet).
 ### Performance
-Trained on massive Baidu search data, our SimNet-BOW-Pairwise semantic matching model improves AUC by more than 5% over literal-match similarity baselines in real FAQ scenarios. We evaluated it on Baidu's in-house test sets (covering chat, customer-service, and other data) and on the semantic matching dataset LCQMC; results are listed below. LCQMC is scored by Accuracy, and since the pairwise model outputs a similarity score, we binarize at a threshold of 0.91; our model reaches an accuracy of 0.7517, versus 0.737 for the CBOW baseline of comparable network complexity.
+Trained on massive Baidu search data, our SimNet-BOW-Pairwise semantic matching model improves AUC by more than 5% over literal-match similarity baselines in real FAQ scenarios. We evaluated it on Baidu's in-house test sets (covering chat, customer-service, and other data) and on the semantic matching dataset LCQMC; results are listed below. LCQMC is scored by Accuracy, and since the pairwise model outputs a similarity score, we binarize at a threshold of 0.958; our model reaches an accuracy of 0.7532, versus 0.737 for the CBOW baseline of comparable network complexity.
 | Model | Baidu Zhidao | ECOM | QQSIM | UNICOM | LCQMC |
 |:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|
 | | AUC | AUC | AUC | PNR | Accuracy |
-|BOW_Pairwise|0.6766|0.7308|0.7643|1.5630|0.7517|
+|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|0.7532|
 ## Quick start
 #### Dependencies
 This project requires PaddlePaddle Fluid 1.3.1; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start).
@@ -46,7 +46,7 @@ tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C ./model_files
 We have released our in-house test sets, covering the Baidu Zhidao, ECOM, QQSIM, and UNICOM datasets. With the pretrained model above, enter the evaluate directory and run the following commands to get evaluation results on each test set.
 ```shell
 sh evaluate_ecom.sh
 sh evaluate_qqsim.sh
 sh evaluate_zhidao.sh
 sh evaluate_unicom.sh
 ```
@@ -141,7 +141,7 @@ python tokenizer.py --test_data_dir ./test.txt.utf8 --batch_size 1 > test.txt.ut
 ### How to train
 ```shell
 python run_classifier.py \
     --task_name ${TASK_NAME} \
     --use_cuda false \    # whether to use the GPU
     --do_train True \     # whether to run training
     --do_valid True \     # whether to evaluate on the dev set during training
......
@@ -10,8 +10,11 @@
         "class_name": "SoftmaxCrossEntropyLoss"
     },
     "optimizer": {
-        "class_name": "SGDOptimizer",
-        "learning_rate": 0.001
+        "class_name": "AdamOptimizer",
+        "learning_rate": 0.001,
+        "beta1": 0.9,
+        "beta2": 0.999,
+        "epsilon": 1e-08
     },
     "task_mode": "pointwise",
     "model_path": "bow_pointwise"
......
@@ -12,8 +12,11 @@
         "class_name": "SoftmaxCrossEntropyLoss"
     },
     "optimizer": {
-        "class_name": "SGDOptimizer",
-        "learning_rate": 0.001
+        "class_name": "AdamOptimizer",
+        "learning_rate": 0.001,
+        "beta1": 0.9,
+        "beta2": 0.999,
+        "epsilon": 1e-08
     },
     "task_mode": "pointwise",
     "model_path": "cnn_pointwise"
......
@@ -11,8 +11,11 @@
         "class_name": "SoftmaxCrossEntropyLoss"
     },
     "optimizer": {
-        "class_name": "SGDOptimizer",
-        "learning_rate" : 0.001
+        "class_name": "AdamOptimizer",
+        "learning_rate": 0.001,
+        "beta1": 0.9,
+        "beta2": 0.999,
+        "epsilon": 1e-08
     },
     "task_mode": "pointwise",
     "model_path": "gru_pointwise"
......
@@ -11,8 +11,11 @@
         "class_name": "SoftmaxCrossEntropyLoss"
     },
     "optimizer": {
-        "class_name": "SGDOptimizer",
-        "learning_rate": 0.001
+        "class_name": "AdamOptimizer",
+        "learning_rate": 0.001,
+        "beta1": 0.9,
+        "beta2": 0.999,
+        "epsilon": 1e-08
     },
     "task_mode": "pointwise",
     "model_path": "lstm_pointwise"
......
@@ -38,7 +38,7 @@ train() {
         --save_steps 1000 \
         --validation_steps 100 \
         --compute_accuracy False \
-        --lamda 0.91 \
+        --lamda 0.958 \
         --task_mode ${TASK_MODE}
 }
 #run_evaluate
@@ -55,7 +55,7 @@ evaluate() {
         --vocab_path ${VOCAB_PATH} \
         --task_mode ${TASK_MODE} \
         --compute_accuracy False \
-        --lamda 0.91 \
+        --lamda 0.958 \
         --init_checkpoint ${INIT_CHECKPOINT}
 }
 # run_infer
......
@@ -26,40 +26,52 @@ import logging
 parser = argparse.ArgumentParser(__doc__)
 model_g = utils.ArgumentGroup(parser, "model", "model configuration and paths.")
-model_g.add_arg("config_path", str, None, "Path to the json file for EmoTect model config.")
-model_g.add_arg("init_checkpoint", str, "examples/cnn_pointwise.json", "Init checkpoint to resume training from.")
+model_g.add_arg("config_path", str, None,
+                "Path to the json file for EmoTect model config.")
+model_g.add_arg("init_checkpoint", str, "examples/cnn_pointwise.json",
+                "Init checkpoint to resume training from.")
 model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints")
 model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise")
 train_g = utils.ArgumentGroup(parser, "training", "training options.")
 train_g.add_arg("epoch", int, 10, "Number of epoches for training.")
-train_g.add_arg("save_steps", int, 200, "The steps interval to save checkpoints.")
-train_g.add_arg("validation_steps", int, 100, "The steps interval to evaluate model performance.")
+train_g.add_arg("save_steps", int, 200,
+                "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps", int, 100,
+                "The steps interval to evaluate model performance.")
 log_g = utils.ArgumentGroup(parser, "logging", "logging related")
 log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
 log_g.add_arg("verbose_result", bool, True, "Whether to output verbose result.")
-log_g.add_arg("test_result_path", str, "test_result", "Directory path to test result.")
-log_g.add_arg("infer_result_path", str, "infer_result", "Directory path to infer result.")
+log_g.add_arg("test_result_path", str, "test_result",
+              "Directory path to test result.")
+log_g.add_arg("infer_result_path", str, "infer_result",
+              "Directory path to infer result.")
-data_g = utils.ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g = utils.ArgumentGroup(
+    parser, "data", "Data paths, vocab paths and data processing options")
 data_g.add_arg("train_data_dir", str, None, "Directory path to training data.")
 data_g.add_arg("valid_data_dir", str, None, "Directory path to valid data.")
 data_g.add_arg("test_data_dir", str, None, "Directory path to testing data.")
 data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
-data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.")
+data_g.add_arg("batch_size", int, 32,
+               "Total examples' number in batch for training.")
 run_type_g = utils.ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
-run_type_g.add_arg("task_name", str, None, "The name of task to perform sentiment classification.")
+run_type_g.add_arg("task_name", str, None,
+                   "The name of task to perform sentiment classification.")
 run_type_g.add_arg("do_train", bool, False, "Whether to perform training.")
 run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.")
 run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.")
 run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
-run_type_g.add_arg("compute_accuracy", bool, False, "Whether to compute accuracy.")
-run_type_g.add_arg("lamda", float, 0.91,
-                   "When task_mode is pairwise, lamda is the threshold for calculating the accuracy.")
+run_type_g.add_arg("compute_accuracy", bool, False,
+                   "Whether to compute accuracy.")
+run_type_g.add_arg(
+    "lamda", float, 0.91,
+    "When task_mode is pairwise, lamda is the threshold for calculating the accuracy."
+)
 args = parser.parse_args()
@@ -75,14 +87,17 @@ def train(conf_dict, args):
     # Get data layer
     data = layers.DataLayer()
     # Load network structure dynamically
-    net = utils.import_class(
-        "../models/matching", conf_dict["net"]["module_name"], conf_dict["net"]["class_name"])(conf_dict)
+    net = utils.import_class("../models/matching",
+                             conf_dict["net"]["module_name"],
+                             conf_dict["net"]["class_name"])(conf_dict)
     # Load loss function dynamically
-    loss = utils.import_class(
-        "../models/matching/losses", conf_dict["loss"]["module_name"], conf_dict["loss"]["class_name"])(conf_dict)
+    loss = utils.import_class("../models/matching/losses",
+                              conf_dict["loss"]["module_name"],
+                              conf_dict["loss"]["class_name"])(conf_dict)
     # Load Optimization method
     optimizer = utils.import_class(
-        "../models/matching/optimizers", "paddle_optimizers", conf_dict["optimizer"]["class_name"])(conf_dict)
+        "../models/matching/optimizers", "paddle_optimizers",
+        conf_dict["optimizer"]["class_name"])(conf_dict)
     # load auc method
     metric = fluid.metrics.Auc(name="auc")
     # Get device
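`utils.import_class` loads the network, loss, and optimizer classes by name from the JSON config. A plausible sketch of such a helper using importlib; the repo's actual implementation may differ:

```python
import importlib
import sys

def import_class(module_dir, module_name, class_name):
    # Hypothetical re-implementation for illustration: put the directory on
    # sys.path, import the module, then fetch the class by name.
    if module_dir not in sys.path:
        sys.path.append(module_dir)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

# usage mirroring the diff (module/class names come from conf_dict):
#   net_cls = import_class("../models/matching", module_name, class_name)
#   net = net_cls(conf_dict)
```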
@@ -95,15 +110,23 @@ def train(conf_dict, args):
     if args.task_mode == "pairwise":
         # Build network
         left = data.ops(name="left", shape=[1], dtype="int64", lod_level=1)
-        pos_right = data.ops(name="right", shape=[1], dtype="int64", lod_level=1)
-        neg_right = data.ops(name="neg_right", shape=[1], dtype="int64", lod_level=1)
+        pos_right = data.ops(name="right",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
+        neg_right = data.ops(name="neg_right",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
         left_feat, pos_score = net.predict(left, pos_right)
         # Get Feeder and Reader
-        train_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, pos_right.name, neg_right.name])
+        train_feeder = fluid.DataFeeder(
+            place=place, feed_list=[left.name, pos_right.name, neg_right.name])
         train_reader = simnet_process.get_reader("train")
         if args.do_valid:
-            valid_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, pos_right.name])
+            valid_feeder = fluid.DataFeeder(
+                place=place, feed_list=[left.name, pos_right.name])
             valid_reader = simnet_process.get_reader("valid")
         pred = pos_score
         # Save Infer model
@@ -119,10 +142,12 @@ def train(conf_dict, args):
         left_feat, pred = net.predict(left, right)
         # Get Feeder and Reader
-        train_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, right.name, label.name])
+        train_feeder = fluid.DataFeeder(
+            place=place, feed_list=[left.name, right.name, label.name])
         train_reader = simnet_process.get_reader("train")
         if args.do_valid:
-            valid_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, right.name])
+            valid_feeder = fluid.DataFeeder(
+                place=place, feed_list=[left.name, right.name])
             valid_reader = simnet_process.get_reader("valid")
         # Save Infer model
         infer_program = fluid.default_main_program().clone(for_test=True)
@@ -134,8 +159,10 @@ def train(conf_dict, args):
     executor = fluid.Executor(place)
     executor.run(fluid.default_startup_program())
     # Get and run executor
-    parallel_executor = fluid.ParallelExecutor(use_cuda=args.use_cuda, loss_name=avg_cost.name,
-                                               main_program=fluid.default_main_program())
+    parallel_executor = fluid.ParallelExecutor(
+        use_cuda=args.use_cuda,
+        loss_name=avg_cost.name,
+        main_program=fluid.default_main_program())
     # Get device number
     device_count = parallel_executor.device_count
     logging.info("device count: %d" % device_count)
@@ -148,22 +175,25 @@ def train(conf_dict, args):
         batch_data = paddle.batch(reader, args.batch_size, drop_last=False)
         pred_list = []
         for data in batch_data():
-            _pred = executor.run(program=program, feed=feeder.feed(data), fetch_list=[pred.name])
+            _pred = executor.run(program=program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[pred.name])
             pred_list += list(_pred)
         pred_list = np.vstack(pred_list)
         if mode == "test":
             label_list = process.get_test_label()
         elif mode == "valid":
             label_list = process.get_valid_label()
-        if conf_dict['net']['class_name'] == 'MMDNN':
-            pred_list = utils.deal_preds_of_mmdnn(conf_dict, pred_list)
         if args.task_mode == "pairwise":
-            pred_list = np.hstack((np.ones_like(pred_list) - pred_list, pred_list))
+            pred_list = (pred_list + 1) / 2
+            pred_list = np.hstack(
+                (np.ones_like(pred_list) - pred_list, pred_list))
         metric.reset()
         metric.update(pred_list, label_list)
         auc = metric.eval()
         if args.compute_accuracy:
-            acc = utils.get_accuracy(pred_list, label_list, args.task_mode, args.lamda)
+            acc = utils.get_accuracy(pred_list, label_list, args.task_mode,
+                                     args.lamda)
             return auc, acc
         else:
             return auc
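Two behavioral changes land in valid_and_test: the MMDNN-specific post-processing (utils.deal_preds_of_mmdnn) is dropped, since the network now outputs softmax probabilities directly, and pairwise scores are rescaled before being handed to fluid.metrics.Auc. Assuming the pairwise score is a cosine-style similarity in [-1, 1], which the `(pred_list + 1) / 2` rescaling suggests, the mapping produces the two-column [P(neg), P(pos)] layout the Auc metric expects:

```python
import numpy as np

cos_sim = np.array([[-0.2], [0.9]])    # raw pairwise scores in [-1, 1]
prob = (cos_sim + 1) / 2               # rescaled into [0, 1]
two_col = np.hstack((np.ones_like(prob) - prob, prob))
print(two_col)
# [[0.6  0.4 ]
#  [0.05 0.95]] -- column 0 = P(neg), column 1 = P(pos)
```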
@@ -175,27 +205,41 @@ def train(conf_dict, args):
     for epoch_id in range(args.epoch):
         losses = []
         # Get batch data iterator
-        train_batch_data = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=10000),
-                                        args.batch_size, drop_last=False)
+        train_batch_data = paddle.batch(
+            paddle.reader.shuffle(
+                train_reader, buf_size=10000),
+            args.batch_size,
+            drop_last=False)
         start_time = time.time()
         for iter, data in enumerate(train_batch_data()):
             if len(data) < device_count:
-                logging.info("the size of batch data is less than device_count(%d)" % device_count)
+                logging.info(
+                    "the size of batch data is less than device_count(%d)" %
+                    device_count)
                 continue
             global_step += 1
-            avg_loss = parallel_executor.run([avg_cost.name], feed=train_feeder.feed(data))
+            avg_loss = parallel_executor.run([avg_cost.name],
+                                             feed=train_feeder.feed(data))
             if args.do_valid and global_step % args.validation_steps == 0:
-                valid_result = valid_and_test(program=infer_program, feeder=valid_feeder, reader=valid_reader,
-                                              process=simnet_process, mode="valid")
+                valid_result = valid_and_test(
+                    program=infer_program,
+                    feeder=valid_feeder,
+                    reader=valid_reader,
+                    process=simnet_process,
+                    mode="valid")
                 if args.compute_accuracy:
                     valid_auc, valid_acc = valid_result
-                    logging.info("global_steps: %d, valid_auc: %f, valid_acc: %f" % (global_step, valid_auc, valid_acc))
+                    logging.info(
+                        "global_steps: %d, valid_auc: %f, valid_acc: %f" %
+                        (global_step, valid_auc, valid_acc))
                 else:
                     valid_auc = valid_result
-                    logging.info("global_steps: %d, valid_auc: %f" % (global_step, valid_auc))
+                    logging.info("global_steps: %d, valid_auc: %f" %
+                                 (global_step, valid_auc))
             if global_step % args.save_steps == 0:
-                model_save_dir = os.path.join(args.output_dir, conf_dict["model_path"])
+                model_save_dir = os.path.join(args.output_dir,
+                                              conf_dict["model_path"])
                 model_path = os.path.join(model_save_dir, str(global_step))
                 if not os.path.exists(model_save_dir):
@@ -204,28 +248,40 @@ def train(conf_dict, args):
                     feed_var_names = [left.name, pos_right.name]
                     target_vars = [left_feat, pos_score]
                 else:
-                    feed_var_names = [left.name, right.name, ]
+                    feed_var_names = [
+                        left.name,
+                        right.name,
+                    ]
                     target_vars = [left_feat, pred]
-                fluid.io.save_inference_model(
-                    model_path, feed_var_names, target_vars, executor, infer_program)
+                fluid.io.save_inference_model(model_path, feed_var_names,
+                                              target_vars, executor,
+                                              infer_program)
                 logging.info("saving infer model in %s" % model_path)
             losses.append(np.mean(avg_loss[0]))
         end_time = time.time()
-        logging.info("epoch: %d, loss: %f, used time: %d sec" % (epoch_id, np.mean(losses), end_time - start_time))
+        logging.info("epoch: %d, loss: %f, used time: %d sec" %
+                     (epoch_id, np.mean(losses), end_time - start_time))
     if args.do_test:
         if args.task_mode == "pairwise":
             # Get Feeder and Reader
-            test_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, pos_right.name])
+            test_feeder = fluid.DataFeeder(
+                place=place, feed_list=[left.name, pos_right.name])
             test_reader = simnet_process.get_reader("test")
         else:
             # Get Feeder and Reader
-            test_feeder = fluid.DataFeeder(place=place, feed_list=[left.name, right.name])
+            test_feeder = fluid.DataFeeder(
+                place=place, feed_list=[left.name, right.name])
             test_reader = simnet_process.get_reader("test")
-        test_result = valid_and_test(program=infer_program, feeder=test_feeder, reader=test_reader,
-                                     process=simnet_process, mode="test")
+        test_result = valid_and_test(
+            program=infer_program,
+            feeder=test_feeder,
+            reader=test_reader,
+            process=simnet_process,
+            mode="test")
         if args.compute_accuracy:
             test_auc, test_acc = test_result
-            logging.info("AUC of test is %f, Accuracy of test is %f" % (test_auc, test_acc))
+            logging.info("AUC of test is %f, Accuracy of test is %f" %
+                         (test_auc, test_acc))
         else:
             test_auc = test_result
             logging.info("AUC of test is %f" % test_auc)
@@ -250,46 +306,56 @@ def test(conf_dict, args):
     # Get executor
     executor = fluid.Executor(place=place)
     # Load model
-    program, feed_var_names, fetch_targets = fluid.io.load_inference_model(model_path, executor)
+    program, feed_var_names, fetch_targets = fluid.io.load_inference_model(
+        model_path, executor)
     if args.task_mode == "pairwise":
         # Get Feeder and Reader
-        feeder = fluid.DataFeeder(place=place, feed_list=feed_var_names, program=program)
+        feeder = fluid.DataFeeder(
+            place=place, feed_list=feed_var_names, program=program)
         test_reader = simnet_process.get_reader("test")
     else:
         # Get Feeder and Reader
-        feeder = fluid.DataFeeder(place=place, feed_list=feed_var_names, program=program)
+        feeder = fluid.DataFeeder(
+            place=place, feed_list=feed_var_names, program=program)
         test_reader = simnet_process.get_reader("test")
     # Get batch data iterator
     batch_data = paddle.batch(test_reader, args.batch_size, drop_last=False)
     logging.info("start test process ...")
     pred_list = []
     for iter, data in enumerate(batch_data()):
-        output = executor.run(program, feed=feeder.feed(data), fetch_list=fetch_targets)
+        output = executor.run(program,
+                              feed=feeder.feed(data),
+                              fetch_list=fetch_targets)
         if args.task_mode == "pairwise":
             pred_list += list(map(lambda item: float(item[0]), output[1]))
-            predictions_file.write("\n".join(map(lambda item: str(item[0]), output[1])) + "\n")
+            predictions_file.write("\n".join(
+                map(lambda item: str((item[0] + 1) / 2), output[1])) + "\n")
         else:
             pred_list += map(lambda item: item, output[1])
-            predictions_file.write("\n".join(map(lambda item: str(np.argmax(item)), output[1])) + "\n")
+            predictions_file.write("\n".join(
+                map(lambda item: str(np.argmax(item)), output[1])) + "\n")
-    if conf_dict['net']['class_name'] == 'MMDNN':
-        pred_list = utils.deal_preds_of_mmdnn(conf_dict, pred_list)
     if args.task_mode == "pairwise":
         pred_list = np.array(pred_list).reshape((-1, 1))
-        pred_list = np.hstack((np.ones_like(pred_list) - pred_list, pred_list))
+        pred_list = (pred_list + 1) / 2
+        pred_list = np.hstack(
+            (np.ones_like(pred_list) - pred_list, pred_list))
     else:
         pred_list = np.array(pred_list)
     labels = simnet_process.get_test_label()
     metric.update(pred_list, labels)
     if args.compute_accuracy:
-        acc = utils.get_accuracy(pred_list, labels, args.task_mode, args.lamda)
-        logging.info("AUC of test is %f, Accuracy of test is %f" % (metric.eval(), acc))
+        acc = utils.get_accuracy(pred_list, labels, args.task_mode,
+                                 args.lamda)
+        logging.info("AUC of test is %f, Accuracy of test is %f" %
+                     (metric.eval(), acc))
    else:
         logging.info("AUC of test is %f" % metric.eval())
     if args.verbose_result:
         utils.get_result_file(args)
         logging.info("test result saved in %s" %
                      os.path.join(os.getcwd(), args.test_result_path))


 def infer(args):
@@ -308,29 +374,36 @@ def infer(args):
     # Get executor
     executor = fluid.Executor(place=place)
     # Load model
-    program, feed_var_names, fetch_targets = fluid.io.load_inference_model(model_path, executor)
+    program, feed_var_names, fetch_targets = fluid.io.load_inference_model(
+        model_path, executor)
     if args.task_mode == "pairwise":
         # Get Feeder and Reader
-        infer_feeder = fluid.DataFeeder(place=place, feed_list=feed_var_names, program=program)
+        infer_feeder = fluid.DataFeeder(
+            place=place, feed_list=feed_var_names, program=program)
         infer_reader = simnet_process.get_infer_reader
     else:
         # Get Feeder and Reader
-        infer_feeder = fluid.DataFeeder(place=place, feed_list=feed_var_names, program=program)
+        infer_feeder = fluid.DataFeeder(
+            place=place, feed_list=feed_var_names, program=program)
         infer_reader = simnet_process.get_infer_reader
     # Get batch data iterator
     batch_data = paddle.batch(infer_reader, args.batch_size, drop_last=False)
     logging.info("start test process ...")
     preds_list = []
     for iter, data in enumerate(batch_data()):
-        output = executor.run(program, feed=infer_feeder.feed(data), fetch_list=fetch_targets)
+        output = executor.run(program,
+                              feed=infer_feeder.feed(data),
+                              fetch_list=fetch_targets)
         if args.task_mode == "pairwise":
-            preds_list += list(map(lambda item: str(item[0]), output[1]))
+            preds_list += list(
+                map(lambda item: str((item[0] + 1) / 2), output[1]))
         else:
             preds_list += map(lambda item: str(np.argmax(item)), output[1])
     with open(args.infer_result_path, "w") as infer_file:
         for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
             infer_file.write(_data + "\t" + _pred + "\n")
-    logging.info("infer result saved in %s" % os.path.join(os.getcwd(), args.infer_result_path))
+    logging.info("infer result saved in %s" %
+                 os.path.join(os.getcwd(), args.infer_result_path))


 def main(conf_dict, args):
@@ -344,7 +417,8 @@ def main(conf_dict, args):
     elif args.do_infer:
         infer(args)
     else:
-        raise ValueError("one of do_train and do_test and do_infer must be True")
+        raise ValueError(
+            "one of do_train and do_test and do_infer must be True")


 if __name__ == "__main__":
......
@@ -11,7 +11,6 @@ import six
 import numpy as np
 import logging
 import logging.handlers
-
 """
 ******functions for file processing******
 """
...@@ -165,9 +164,13 @@ def print_arguments(args): ...@@ -165,9 +164,13 @@ def print_arguments(args):
print('------------------------------------------------') print('------------------------------------------------')
def init_log(log_path, level=logging.INFO, when="D", backup=7, def init_log(
format="%(levelname)s: %(asctime)s - %(filename)s:%(lineno)d * %(thread)d %(message)s", log_path,
datefmt=None): level=logging.INFO,
when="D",
backup=7,
format="%(levelname)s: %(asctime)s - %(filename)s:%(lineno)d * %(thread)d %(message)s",
datefmt=None):
""" """
init_log - initialize log module init_log - initialize log module
@@ -209,16 +212,14 @@ def init_log(log_path, level=logging.INFO, when="D", backup=7,
     if not os.path.isdir(dir):
         os.makedirs(dir)

-    handler = logging.handlers.TimedRotatingFileHandler(log_path + ".log",
-                                                        when=when,
-                                                        backupCount=backup)
+    handler = logging.handlers.TimedRotatingFileHandler(
+        log_path + ".log", when=when, backupCount=backup)
     handler.setLevel(level)
     handler.setFormatter(formatter)
     logger.addHandler(handler)

-    handler = logging.handlers.TimedRotatingFileHandler(log_path + ".log.wf",
-                                                        when=when,
-                                                        backupCount=backup)
+    handler = logging.handlers.TimedRotatingFileHandler(
+        log_path + ".log.wf", when=when, backupCount=backup)
     handler.setLevel(logging.WARNING)
     handler.setFormatter(formatter)
     logger.addHandler(handler)
@@ -241,7 +242,7 @@ def get_level():
     return logger.level

-def get_accuracy(preds, labels, mode, lamda=0.91):
+def get_accuracy(preds, labels, mode, lamda=0.958):
     """
     compute accuracy
     """
......