Commit e5b9110c, authored by zhoushiyu, committed by Thunderbrook

[PaddleRec]reset auc more elegant and fix bug and README in ctr/dnn (#3906)

* reset auc more elegant and fix bug and README in ctr/dnn

* change ctr/dnn README
Parent: 4a5b4c10
@@ -67,11 +67,8 @@ def infer():
             dirname=cur_model_path,
             main_program=fluid.default_main_program())
-        auc_states_names = ['_generated_var_2', '_generated_var_3']
-        for name in auc_states_names:
-            param = inference_scope.var(name).get_tensor()
-            param_array = np.zeros(param._get_dims()).astype("int64")
-            param.set(param_array, place)
+        for var in dcn_model.auc_states:  # reset auc states
+            set_zero(var.name, scope=inference_scope, place=place)
         loss_all = 0
         num_ins = 0
@@ -93,6 +90,23 @@ def infer():
     )
+
+
+def set_zero(var_name,
+             scope=fluid.global_scope(),
+             place=fluid.CPUPlace(),
+             param_type="int64"):
+    """
+    Set tensor of a Variable to zero.
+    Args:
+        var_name(str): name of Variable
+        scope(Scope): Scope object, default is fluid.global_scope()
+        place(Place): Place object, default is fluid.CPUPlace()
+        param_type(str): param data type, default is int64
+    """
+    param = scope.var(var_name).get_tensor()
+    param_array = np.zeros(param._get_dims()).astype(param_type)
+    param.set(param_array, place)
+
+
 if __name__ == '__main__':
     utils.check_version()
     infer()
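The `set_zero` helper added above replaces the old reliance on auto-generated state names (`_generated_var_2`, `_generated_var_3`), which breaks as soon as graph construction order changes. A minimal, self-contained sketch of what it does, using the Paddle 1.x scope and tensor APIs; the variable name `stat_pos` and its shape are hypothetical stand-ins for a real AUC accumulator:

```python
import numpy as np
import paddle.fluid as fluid


def set_zero(var_name,
             scope=fluid.global_scope(),
             place=fluid.CPUPlace(),
             param_type="int64"):
    """Zero out the tensor behind a scope variable, as in the diff above."""
    param = scope.var(var_name).get_tensor()
    param_array = np.zeros(param._get_dims()).astype(param_type)
    param.set(param_array, place)


place = fluid.CPUPlace()
scope = fluid.Scope()
# Simulate an AUC accumulator still holding counts from a previous evaluation.
tensor = scope.var("stat_pos").get_tensor()  # "stat_pos" is an illustrative name
tensor.set(np.ones((1, 4097), dtype="int64"), place)
set_zero("stat_pos", scope=scope, place=place)
print(np.array(tensor).sum())  # 0: the old statistics no longer leak
```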
@@ -62,9 +62,8 @@ class DCN(object):
         # auc
         prob_2d = fluid.layers.concat([1 - self.prob, self.prob], 1)
         label_int = fluid.layers.cast(self.target_input, 'int64')
-        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=prob_2d,
-                                                              label=label_int,
-                                                              slide_steps=0)
+        auc_var, batch_auc_var, self.auc_states = fluid.layers.auc(
+            input=prob_2d, label=label_int, slide_steps=0)
         self.auc_var = auc_var
         # logloss
...
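For context, `fluid.layers.auc` in Paddle 1.x returns three values: the accumulated AUC, a sliding-window batch AUC, and the list of internal state variables (bucketed positive/negative counts) that the op updates on every run. Keeping that list on the model, as the hunk above does with `self.auc_states`, is what lets infer.py iterate `dcn_model.auc_states` instead of hard-coding generated names. A small sketch of inspecting the returned states; the two `data` layers are illustrative:

```python
import paddle.fluid as fluid

# Build a throwaway graph just to look at the AUC op's state variables.
prob_2d = fluid.layers.data(name="prob_2d", shape=[2], dtype="float32")
label_int = fluid.layers.data(name="label", shape=[1], dtype="int64")
auc_var, batch_auc_var, auc_states = fluid.layers.auc(
    input=prob_2d, label=label_int, slide_steps=0)
for var in auc_states:
    # Prints auto-generated names such as _generated_var_* with their shapes,
    # exactly what the old hard-coded list tried to guess.
    print(var.name, var.shape)
```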
@@ -115,9 +115,9 @@ def train():
     if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
-    loss, auc, data_list = ctr_deepfm_model(args.embedding_size, args.num_field,
-                                            args.num_feat, args.layer_sizes,
-                                            args.act, args.reg, args.is_sparse)
+    loss, auc, data_list, auc_states = ctr_deepfm_model(
+        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
+        args.act, args.reg, args.is_sparse)
     optimizer = fluid.optimizer.SGD(
         learning_rate=args.lr,
         regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
@@ -152,8 +152,8 @@ def train():
         exe.train_from_dataset(
             program=main_program,
             dataset=dataset,
-            fetch_list=[loss],
-            fetch_info=['epoch %d batch loss' % (epoch_id + 1)],
+            fetch_list=[loss, auc],
+            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
             print_period=5,
             debug=False)
         model_dir = os.path.join(args.model_output_dir,
...
@@ -40,7 +40,7 @@ def infer():
     with fluid.scope_guard(inference_scope):
         with fluid.framework.program_guard(test_program, startup_program):
-            loss, auc, data_list = ctr_deepfm_model(
+            loss, auc, data_list, auc_states = ctr_deepfm_model(
                 args.embedding_size, args.num_field, args.num_feat,
                 args.layer_sizes, args.act, args.reg)
@@ -51,11 +51,8 @@ def infer():
                 dirname=cur_model_path,
                 main_program=fluid.default_main_program())
-            auc_states_names = ['_generated_var_2', '_generated_var_3']
-            for name in auc_states_names:
-                param = inference_scope.var(name).get_tensor()
-                param_array = np.zeros(param._get_dims()).astype("int64")
-                param.set(param_array, place)
+            for var in auc_states:  # reset auc states
+                set_zero(var.name, scope=inference_scope, place=place)
             loss_all = 0
             num_ins = 0
@@ -73,6 +70,23 @@ def infer():
     )
+
+
+def set_zero(var_name,
+             scope=fluid.global_scope(),
+             place=fluid.CPUPlace(),
+             param_type="int64"):
+    """
+    Set tensor of a Variable to zero.
+    Args:
+        var_name(str): name of Variable
+        scope(Scope): Scope object, default is fluid.global_scope()
+        place(Place): Place object, default is fluid.CPUPlace()
+        param_type(str): param data type, default is int64
+    """
+    param = scope.var(var_name).get_tensor()
+    param_array = np.zeros(param._get_dims()).astype(param_type)
+    param.set(param_array, place)
+
+
 if __name__ == '__main__':
     utils.check_version()
     infer()
@@ -18,9 +18,9 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
-    loss, auc, data_list = ctr_deepfm_model(args.embedding_size, args.num_field,
-                                            args.num_feat, args.layer_sizes,
-                                            args.act, args.reg)
+    loss, auc, data_list, auc_states = ctr_deepfm_model(
+        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
+        args.act, args.reg)
     optimizer = fluid.optimizer.SGD(
         learning_rate=args.lr,
         regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
@@ -47,8 +47,8 @@ def train():
         exe.train_from_dataset(
             program=fluid.default_main_program(),
             dataset=dataset,
-            fetch_list=[loss],
-            fetch_info=['epoch %d batch loss' % (epoch_id + 1)],
+            fetch_list=[loss, auc],
+            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
             print_period=1000,
             debug=False)
         model_dir = os.path.join(args.model_output_dir,
...
@@ -111,4 +111,5 @@ def ctr_deepfm_model(embedding_size,
         label=label_int,
         slide_steps=0)
-    return batch_cost, auc_var, [raw_feat_idx, raw_feat_value, label]
+    return batch_cost, auc_var, [raw_feat_idx, raw_feat_value,
+                                 label], auc_states
@@ -49,8 +49,6 @@ python train.py \
     2>&1 | tee train.log
 ```
-After training to batch 40000 of the first epoch, the test AUC is 0.801178 and the cost is 0.445196.
-
 ### Distributed training
 Launch a local distributed training job with 2 trainers and 2 pservers. In the distributed setting, the training data is sharded by trainer id so that the trainers' data does not overlap, which improves training efficiency.
@@ -66,10 +64,13 @@ sh cluster_train.sh
 Run prediction on the test set:
 ```bash
 python infer.py \
-    --model_path models/pass-0/ \
-    --data_path data/raw/valid.txt
+    --model_path models/pass-2/ \
+    --data_path data/raw/train.txt
 ```
-Note: only the AUC printed at the very end of infer.py is the overall AUC for the whole prediction file.
+Loading the pass-2 model, the expected test AUC is `0.794`.
+
+Note: only the AUC printed at the very end of infer.py is the overall AUC for the whole prediction file. train.txt is split into training and test parts inside reader.py, so the data here does not overlap with the training data.
+
 ## Run cluster training on Baidu Cloud
 1. Follow the doc [Launch Fluid distributed training on Baidu Cloud](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst) to deploy a CPU cluster on Baidu Cloud.
...
@@ -64,9 +64,6 @@ python train.py \
     2>&1 | tee train.log
 ```
-After training pass 1 batch 40000, the testing AUC is `0.801178` and the testing
-cost is `0.445196`.
-
 ### Distributed Train
 Run a 2-pserver, 2-trainer distributed training job on a single machine.
 In the distributed training setting, training data is split by trainer_id, so that training data
@@ -83,9 +80,12 @@ The command line options for inferring can be listed by `python infer.py -h`.
 To make inference for the test dataset:
 ```bash
 python infer.py \
-    --model_path models/ \
+    --model_path models/pass-2 \
     --data_path data/raw/train.txt
 ```
+Loading the model in `models/pass-2`, the expected testing AUC is `0.794`.
+
 Note: the AUC value in the last log line is the overall AUC for the whole test dataset. Here, train.txt is split inside reader.py so that the validation data does not overlap with the training data.
 ## Train on Baidu Cloud
...
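Both READMEs now stress that only the final printed AUC covers the whole file. That follows from how `fluid.layers.auc` works: its state variables accumulate bucketed label counts across batches, so the value fetched on the last batch reflects every example seen so far. A pure-Python sketch of that streaming computation, illustrative rather than Paddle's actual kernel:

```python
def streaming_auc(batches, num_thresholds=4096):
    """Yield the running AUC after each batch; state persists across batches."""
    stat_pos = [0] * (num_thresholds + 1)  # positive labels per score bucket
    stat_neg = [0] * (num_thresholds + 1)  # negative labels per score bucket
    for scores, labels in batches:
        for s, y in zip(scores, labels):
            bucket = min(int(s * num_thresholds), num_thresholds)
            if y == 1:
                stat_pos[bucket] += 1
            else:
                stat_neg[bucket] += 1
        yield _auc_from_stats(stat_pos, stat_neg)


def _auc_from_stats(stat_pos, stat_neg):
    """Trapezoidal area under the ROC curve built from bucketed counts."""
    tot_pos = tot_neg = 0.0
    area = 0.0
    for i in range(len(stat_pos) - 1, -1, -1):  # sweep thresholds high to low
        new_pos = tot_pos + stat_pos[i]
        new_neg = tot_neg + stat_neg[i]
        area += (new_neg - tot_neg) * (tot_pos + new_pos) / 2.0
        tot_pos, tot_neg = new_pos, new_neg
    if tot_pos == 0 or tot_neg == 0:
        return 0.5
    return area / (tot_pos * tot_neg)


# Example: the value after the second batch covers both batches.
batches = [([0.9, 0.2], [1, 0]), ([0.7, 0.4], [1, 0])]
for auc in streaming_auc(batches):
    print(auc)  # 1.0 each time: the toy data is perfectly separable
```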
@@ -60,35 +60,35 @@ def infer():
     startup_program = fluid.framework.Program()
     test_program = fluid.framework.Program()
-    with fluid.framework.program_guard(test_program, startup_program):
-        loss, auc_var, batch_auc_var, _, data_list = ctr_dnn_model(
-            args.embedding_size, args.sparse_feature_dim, False)
-
-        exe = fluid.Executor(place)
-
-        feeder = fluid.DataFeeder(feed_list=data_list, place=place)
-
-        fluid.io.load_persistables(
-            executor=exe,
-            dirname=args.model_path,
-            main_program=fluid.default_main_program())
-
-        def set_zero(var_name):
-            param = inference_scope.var(var_name).get_tensor()
-            param_array = np.zeros(param._get_dims()).astype("int64")
-            param.set(param_array, place)
-
-        auc_states_names = ['_generated_var_2', '_generated_var_3']
-        for name in auc_states_names:
-            set_zero(name)
-
-        for batch_id, data in enumerate(test_reader()):
-            loss_val, auc_val = exe.run(test_program,
-                                        feed=feeder.feed(data),
-                                        fetch_list=[loss, auc_var])
-            if batch_id % 100 == 0:
-                logger.info("TEST --> batch: {} loss: {} auc: {}".format(
-                    batch_id, loss_val / args.batch_size, auc_val))
+    with fluid.scope_guard(inference_scope):
+        with fluid.framework.program_guard(test_program, startup_program):
+            loss, auc_var, batch_auc_var, _, data_list, auc_states = ctr_dnn_model(
+                args.embedding_size, args.sparse_feature_dim, False)
+
+            exe = fluid.Executor(place)
+
+            feeder = fluid.DataFeeder(feed_list=data_list, place=place)
+
+            fluid.io.load_persistables(
+                executor=exe,
+                dirname=args.model_path,
+                main_program=fluid.default_main_program())
+
+            def set_zero(var_name):
+                param = inference_scope.var(var_name).get_tensor()
+                param_array = np.zeros(param._get_dims()).astype("int64")
+                param.set(param_array, place)
+
+            for var in auc_states:
+                set_zero(var.name)
+
+            for batch_id, data in enumerate(test_reader()):
+                loss_val, auc_val = exe.run(test_program,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[loss, auc_var])
+                if batch_id % 100 == 0:
+                    logger.info("TEST --> batch: {} loss: {} auc: {}".format(
+                        batch_id, loss_val / args.batch_size, auc_val))

 if __name__ == '__main__':
...
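This hunk is the actual ctr/dnn bug fix: loading and resetting must happen in the same scope. Without the outer `fluid.scope_guard(inference_scope)`, `load_persistables` materializes the parameters in `fluid.global_scope()` while `set_zero` pokes `inference_scope`, so the zeros never touch the tensors the program actually reads. A small sketch of that scoping behavior; the constant and its value are illustrative, not code from the repo:

```python
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
inference_scope = fluid.Scope()

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.fill_constant(shape=[1], dtype="int64", value=7)
    x.persistable = True  # keep the tensor alive after the run

with fluid.scope_guard(inference_scope):  # the fix: pin the executor's scope
    exe = fluid.Executor(place)
    exe.run(startup)
    exe.run(main)

# The variable lives in inference_scope, not in the default global scope:
print(fluid.global_scope().find_var(x.name))                    # None
print(np.array(inference_scope.find_var(x.name).get_tensor()))  # [7]
```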
@@ -36,7 +36,8 @@ def ctr_deepfm_model(factor_size, sparse_feature_dim, dense_feature_dim,
         dtype='float32',
         size=[emb_dict_size, 1],
         is_sparse=True)
-    first_embeddings = fluid.layers.squeeze(input=first_embeddings, axes=[1])
+    first_embeddings = fluid.layers.squeeze(
+        input=first_embeddings, axes=[1])
     first_order = fluid.layers.sequence_pool(
         input=first_embeddings, pool_type='sum')
@@ -46,7 +47,8 @@ def ctr_deepfm_model(factor_size, sparse_feature_dim, dense_feature_dim,
         size=[emb_dict_size, factor_size],
         param_attr=fm_param_attr,
         is_sparse=True)
-    nonzero_embeddings = fluid.layers.squeeze(input=nonzero_embeddings, axes=[1])
+    nonzero_embeddings = fluid.layers.squeeze(
+        input=nonzero_embeddings, axes=[1])
     summed_features_emb = fluid.layers.sequence_pool(
         input=nonzero_embeddings, pool_type='sum')
     summed_features_emb_square = fluid.layers.square(summed_features_emb)
@@ -211,4 +213,4 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim, use_py_reader=True):
     auc_var, batch_auc_var, auc_states = \
         fluid.layers.auc(input=predict, label=words[-1], num_thresholds=2 ** 12, slide_steps=20)
-    return avg_cost, auc_var, batch_auc_var, py_reader, words
+    return avg_cost, auc_var, batch_auc_var, py_reader, words, auc_states
@@ -215,7 +215,7 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
-    loss, auc_var, batch_auc_var, py_reader, _ = ctr_dnn_model(
+    loss, auc_var, batch_auc_var, py_reader, _, auc_states = ctr_dnn_model(
         args.embedding_size, args.sparse_feature_dim)
     optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
     optimizer.minimize(loss)
...
@@ -122,7 +122,7 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
-    loss, auc, data_list = eval('network_conf.' + args.model_name)(
+    loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
         args.embedding_size, args.num_field, args.num_feat,
         args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin,
         args.is_sparse)
...
@@ -40,9 +40,11 @@ def infer():
     with fluid.scope_guard(inference_scope):
         with fluid.framework.program_guard(test_program, startup_program):
-            loss, auc, data_list = eval('network_conf.' + args.model_name)(
-                args.embedding_size, args.num_field, args.num_feat,
-                args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
+            loss, auc, data_list, auc_states = eval(
+                'network_conf.' + args.model_name)(
+                    args.embedding_size, args.num_field, args.num_feat,
+                    args.layer_sizes_dnn, args.act, args.reg,
+                    args.layer_sizes_cin)

             exe = fluid.Executor(place)
             feeder = fluid.DataFeeder(feed_list=data_list, place=place)
@@ -51,11 +53,8 @@ def infer():
                 dirname=cur_model_path,
                 main_program=fluid.default_main_program())
-            auc_states_names = ['_generated_var_2', '_generated_var_3']
-            for name in auc_states_names:
-                param = inference_scope.var(name).get_tensor()
-                param_array = np.zeros(param._get_dims()).astype("int64")
-                param.set(param_array, place)
+            for var in auc_states:  # reset auc states
+                set_zero(var.name, scope=inference_scope, place=place)
             loss_all = 0
             num_ins = 0
@@ -74,6 +73,23 @@ def infer():
     )
+
+
+def set_zero(var_name,
+             scope=fluid.global_scope(),
+             place=fluid.CPUPlace(),
+             param_type="int64"):
+    """
+    Set tensor of a Variable to zero.
+    Args:
+        var_name(str): name of Variable
+        scope(Scope): Scope object, default is fluid.global_scope()
+        place(Place): Place object, default is fluid.CPUPlace()
+        param_type(str): param data type, default is int64
+    """
+    param = scope.var(var_name).get_tensor()
+    param_array = np.zeros(param._get_dims()).astype(param_type)
+    param.set(param_array, place)
+
+
 if __name__ == '__main__':
     utils.check_version()
     infer()
@@ -13,7 +13,7 @@ def train():
     if not os.path.isdir(args.model_output_dir):
         os.mkdir(args.model_output_dir)
-    loss, auc, data_list = eval('network_conf.' + args.model_name)(
+    loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
         args.embedding_size, args.num_field, args.num_feat,
         args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
     optimizer = fluid.optimizer.SGD(
...
@@ -134,4 +134,5 @@ def ctr_xdeepfm_model(embedding_size,
         label=label_int,
         slide_steps=0)
-    return batch_cost, auc_var, [raw_feat_idx, raw_feat_value, label]
+    return batch_cost, auc_var, [raw_feat_idx, raw_feat_value,
+                                 label], auc_states
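Taken together, the commit applies one pattern across dcn, deepfm, dnn, and xdeepfm: every model-building function also returns `auc_states`, and inference resets those states before evaluating a checkpoint. A runnable end-to-end sketch of that pattern in Paddle 1.x, with a toy two-example batch standing in for a real dataset:

```python
import numpy as np
import paddle.fluid as fluid


def set_zero(var_name, scope, place, param_type="int64"):
    """Zero one AUC state tensor inside the given scope."""
    param = scope.var(var_name).get_tensor()
    param.set(np.zeros(param._get_dims()).astype(param_type), place)


place = fluid.CPUPlace()
scope = fluid.Scope()
main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    prob = fluid.layers.data(name="prob", shape=[2], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # A real model function would return auc_states alongside cost and auc_var.
    auc_var, batch_auc_var, auc_states = fluid.layers.auc(
        input=prob, label=label, slide_steps=0)

with fluid.scope_guard(scope):
    exe = fluid.Executor(place)
    exe.run(startup)
    feed = {
        "prob": np.array([[0.3, 0.7], [0.8, 0.2]], dtype="float32"),
        "label": np.array([[1], [0]], dtype="int64"),
    }
    print(exe.run(main, feed=feed, fetch_list=[auc_var])[0])  # first pass AUC
    for var in auc_states:  # reset before evaluating the next checkpoint
        set_zero(var.name, scope, place)
    print(exe.run(main, feed=feed, fetch_list=[auc_var])[0])  # fresh AUC
```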