From 8e400d2257d3ff26add5efa8b09494b34cdea6e0 Mon Sep 17 00:00:00 2001
From: zhoushiyu <31816202+wilhelmzh@users.noreply.github.com>
Date: Fri, 27 Sep 2019 13:12:56 +0800
Subject: [PATCH] Fix bug in deepfm (#3429)

* fix deepfm failing in 2*1 local distributed training
* fix deepfm preprocess bug
* fix deepfm preprocess bug and README
---
 PaddleRec/ctr/dcn/README.md                 | 10 +++++++---
 PaddleRec/ctr/deepfm/README.md              |  9 ++++++---
 PaddleRec/ctr/deepfm/cluster_train.py       |  4 ++--
 PaddleRec/ctr/deepfm/data/preprocess.py     | 18 ++++--------------
 .../ctr/deepfm/dist_data/preprocess_dist.py | 19 ++++---------------
 PaddleRec/ctr/xdeepfm/README.md             |  7 +++++--
 6 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/PaddleRec/ctr/dcn/README.md b/PaddleRec/ctr/dcn/README.md
index 6fbf0ad9..560acee8 100644
--- a/PaddleRec/ctr/dcn/README.md
+++ b/PaddleRec/ctr/dcn/README.md
@@ -23,7 +23,7 @@
 For an introduction to the DCN model, see the paper [Deep & Cross Network for Ad Click Predictions](https://arxiv.org/abs/1708.05123).
 
 ## Environment
-- PaddlePaddle 1.5.2
+- PaddlePaddle 1.6
 
 ## Download data
@@ -72,6 +72,9 @@ loss: [0.44703564] auc_val: [0.80654419]
 cd dist_data && sh dist_download.sh && cd ..
 ```
 Run the command to emulate a multi-machine setup locally. By default a 2 x 2 layout is used, i.e. 2 pservers and 2 trainers.
+
+**Note: for distributed training, we recommend Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev).**
+
 ```bash
 sh cluster_train.sh
 ```
@@ -98,6 +101,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --test_valid_da
 - Trainer 0 saves the model parameters.
 - After each training run, the pserver processes must be stopped manually. Use the following command to find them:
-  >ps -ef | grep python
-
+
+>ps -ef | grep python
+
 - Data loading uses dataset mode, which is currently only supported on Linux.
diff --git a/PaddleRec/ctr/deepfm/README.md b/PaddleRec/ctr/deepfm/README.md
index 78061d32..a9847c80 100644
--- a/PaddleRec/ctr/deepfm/README.md
+++ b/PaddleRec/ctr/deepfm/README.md
@@ -15,7 +15,7 @@ This model implementation reproduces the result of the paper "DeepFM: A Factoriz
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Download and preprocess data
@@ -53,6 +53,8 @@ When the training set is iterated to the 22nd round, the testing Logloss is `0.4
 ## Distributed Train
 We emulate distributed training on a local machine. By default, we use a 2 x 2 layout, i.e. 2 pservers and 2 trainers.
+**Note: we suggest using Paddle >= 1.6 or [the latest Paddle](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) for distributed training.**
+
 ### Download and preprocess distributed demo dataset
 This small demo dataset (a few lines from the Criteo dataset) is only used to verify that distributed training runs.
 ```bash
@@ -78,7 +80,7 @@ other params explained in cluster_train.py
 Infer
 ```bash
-python infer.py --model_output_dir cluster_model --test_epoch 50 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
+python infer.py --model_output_dir cluster_model --test_epoch 10 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
 ```
 
 Notes:
@@ -87,7 +89,8 @@ Notes:
 - The first trainer (with trainer_id 0) saves the model params.
 - After each training run, pserver processes should be stopped manually. You can use the command below to find them:
-  >ps -ef | grep python
+
+>ps -ef | grep python
 
 - We use the Dataset API to load data; it is only supported on Linux for now.
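The 2 pserver x 2 trainer layout referenced in these READMEs is wired up by cluster_train.sh and cluster_train.py, which the patch only touches for default values. As a rough orientation, the sketch below shows how a Paddle 1.x program is typically split into pserver and trainer roles with `fluid.DistributeTranspiler`; the endpoints, role handling, and training loop are illustrative assumptions, not the exact code of this repository.

```python
# Minimal sketch (assumed structure, not the repo's cluster_train.py): splitting a
# Paddle 1.x program into pserver and trainer roles for a 2 pserver x 2 trainer run.
import paddle.fluid as fluid

def run_role(role, trainer_id, current_endpoint,
             pserver_endpoints="127.0.0.1:6170,127.0.0.1:6171", trainers=2):
    # The model and optimizer are assumed to have been built into the default
    # main/startup programs before this point.
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

    exe = fluid.Executor(fluid.CPUPlace())
    if role == "PSERVER":
        # Each pserver serves a shard of the parameters and blocks until killed,
        # which is why the READMEs say to stop the pserver processes manually.
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    else:
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())
        # ... feed batches and run trainer_prog for num_epoch epochs ...
```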
diff --git a/PaddleRec/ctr/deepfm/cluster_train.py b/PaddleRec/ctr/deepfm/cluster_train.py
index 95d7b5e3..23985ebe 100644
--- a/PaddleRec/ctr/deepfm/cluster_train.py
+++ b/PaddleRec/ctr/deepfm/cluster_train.py
@@ -38,7 +38,7 @@ def parse_args():
     parser.add_argument(
         '--num_epoch',
         type=int,
-        default=50,
+        default=10,
         help="The number of epochs to train (default: 50)")
     parser.add_argument(
         '--model_output_dir',
@@ -73,7 +73,7 @@ def parse_args():
     parser.add_argument(
         '--reg', type=float, default=1e-4, help=' (default: 1e-4)')
     parser.add_argument('--num_field', type=int, default=39)
-    parser.add_argument('--num_feat', type=int, default=135483)
+    parser.add_argument('--num_feat', type=int, default=141443)
     parser.add_argument('--use_gpu', type=int, default=1)
 
     # dist params
diff --git a/PaddleRec/ctr/deepfm/data/preprocess.py b/PaddleRec/ctr/deepfm/data/preprocess.py
index 81fb751d..1fa4a5fe 100644
--- a/PaddleRec/ctr/deepfm/data/preprocess.py
+++ b/PaddleRec/ctr/deepfm/data/preprocess.py
@@ -59,7 +59,7 @@ def get_feat_dict():
         for line_idx, line in enumerate(fin):
             if line_idx % 100000 == 0:
                 print('generating feature dict', line_idx / 45000000)
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -77,19 +77,9 @@ def get_feat_dict():
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open('train.txt', 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in dis_feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
+    for feat in dis_feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     # Save dictionary
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
diff --git a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
index 3a02db61..fd9739e0 100644
--- a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
+++ b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
@@ -33,7 +33,7 @@ def get_feat_dict():
     feat_cnt = Counter()
     with open(INPUT_FILE, 'r') as fin:
         for line_idx, line in enumerate(fin):
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -53,20 +53,9 @@ def get_feat_dict():
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open(INPUT_FILE, 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
-    # Save dictionary
+    for feat in feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
     print('args.num_feat ', len(feat_dict) + 1)
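Both preprocess scripts above drop the second pass over the raw file and instead assign feature ids directly from the frequency-filtered value set, which is what makes the dictionary size (and therefore the new --num_feat default of 141443) stable. The sketch below shows the resulting dictionary-building logic end to end; the field ranges follow the Criteo format and the frequency cutoff is an assumption, since those definitions sit outside the hunks shown here.

```python
# Sketch of the feature-dictionary build after this patch. Variable names follow the
# diff; the field ranges (Criteo format) and the cutoff value are assumptions, since
# their definitions are outside the hunks above.
from collections import Counter
import pickle

continuous_range_ = range(1, 14)    # 13 numeric fields
categorical_range_ = range(14, 40)  # 26 categorical fields
freq_cutoff = 10                    # assumed minimum count for a categorical value

def build_feat_dict(input_file, output_file):
    # Single pass: count every categorical value. rstrip fixes the earlier lstrip
    # typo, which left the trailing '\n' attached to the last field of every line.
    feat_cnt = Counter()
    with open(input_file, 'r') as fin:
        for line in fin:
            features = line.rstrip('\n').split('\t')
            for idx in categorical_range_:
                if features[idx] == '':
                    continue
                feat_cnt.update([features[idx]])

    # Keep only values that occur often enough.
    dis_feat_set = set(v for v, cnt in feat_cnt.items() if cnt >= freq_cutoff)

    # Assign ids: one per continuous field, then one per surviving categorical value.
    # The second scan of the raw file in the old code is no longer needed.
    feat_dict, tc = {}, 1
    for idx in continuous_range_:
        feat_dict[idx] = tc
        tc += 1
    for feat in dis_feat_set:
        feat_dict[feat] = tc
        tc += 1

    with open(output_file, 'wb') as fout:
        pickle.dump(feat_dict, fout, protocol=2)
    return len(feat_dict) + 1  # reported as args.num_feat by preprocess_dist.py
```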
diff --git a/PaddleRec/ctr/xdeepfm/README.md b/PaddleRec/ctr/xdeepfm/README.md
index fed81a06..9b2475cd 100644
--- a/PaddleRec/ctr/xdeepfm/README.md
+++ b/PaddleRec/ctr/xdeepfm/README.md
@@ -12,7 +12,7 @@
 sh download.sh
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Single-machine training
 ```bash
@@ -35,6 +35,8 @@ test_epoch is set to load the model from the 10th training epoch.
 
 ## Distributed training
 Run the command to emulate a multi-machine setup locally. By default a 2 x 2 layout is used, i.e. 2 pservers and 2 trainers.
+**Note: for distributed training, we recommend Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev).**
+
 The data is downloaded with the same command as above.
 ```bash
 sh cluster_train.sh
 ```
@@ -62,6 +64,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --use_gpu=0
 - Trainer 0 saves the model parameters.
 - After each training run, the pserver processes must be stopped manually. Use the following command to find them:
-  >ps -ef | grep python
+
+>ps -ef | grep python
 
 - Data loading uses dataset mode, which is currently only supported on Linux.
--
GitLab
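The notes above, like those in the dcn and deepfm READMEs, state that data loading uses Paddle's dataset mode and therefore only runs on Linux. For orientation, here is a minimal sketch of how that API is typically driven in Paddle 1.x; the slot variables, pipe command, batch size, thread count, and file list are placeholders rather than values taken from this repository.

```python
# Minimal sketch of Paddle 1.x "dataset mode" data loading (placeholder values;
# the real configuration lives in this repo's training scripts).
import paddle.fluid as fluid

def train_with_dataset(feed_vars, main_program, filelist):
    dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    dataset.set_use_var(feed_vars)        # variables the model reads each batch
    dataset.set_pipe_command("cat")       # placeholder; usually a reader script
    dataset.set_batch_size(1000)
    dataset.set_thread(10)
    dataset.set_filelist(filelist)        # shard files consumed by worker threads

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    # Dataset mode is the data-loading path the READMEs describe as Linux-only.
    exe.train_from_dataset(program=main_program, dataset=dataset, debug=False)
```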