From 8e400d2257d3ff26add5efa8b09494b34cdea6e0 Mon Sep 17 00:00:00 2001
From: zhoushiyu <31816202+wilhelmzh@users.noreply.github.com>
Date: Fri, 27 Sep 2019 13:12:56 +0800
Subject: [PATCH] Fix bug in deepfm (#3429)

* fix deepfm failing in 2*1 local distributed training
* fix deepfm preprocess bug
* fix deepfm preprocess bug and README
---
 PaddleRec/ctr/dcn/README.md                 | 10 +++++++---
 PaddleRec/ctr/deepfm/README.md              |  9 ++++++---
 PaddleRec/ctr/deepfm/cluster_train.py       |  4 ++--
 PaddleRec/ctr/deepfm/data/preprocess.py     | 18 ++++--------------
 .../ctr/deepfm/dist_data/preprocess_dist.py | 19 ++++---------------
 PaddleRec/ctr/xdeepfm/README.md             |  7 +++++--
 6 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/PaddleRec/ctr/dcn/README.md b/PaddleRec/ctr/dcn/README.md
index 6fbf0ad9..560acee8 100644
--- a/PaddleRec/ctr/dcn/README.md
+++ b/PaddleRec/ctr/dcn/README.md
@@ -23,7 +23,7 @@
 For an introduction to the DCN model, see the paper [Deep & Cross Network for Ad Click Predictions](https://arxiv.org/abs/1708.05123).
 
 ## Environment
-- PaddlePaddle 1.5.2
+- PaddlePaddle 1.6
 
 ## Download data
@@ -72,6 +72,9 @@ loss: [0.44703564] auc_val: [0.80654419]
 cd dist_data && sh dist_download.sh && cd ..
 ```
 Run the command to emulate a multi-machine setup locally. By default a 2 x 2 layout is used, i.e. 2 pservers and 2 trainers.
+
+**Note: for distributed training, we recommend Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev).**
+
 ```bash
 sh cluster_train.sh
 ```
@@ -98,6 +101,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --test_valid_da
 - Trainer 0 saves the model parameters.
 - After each training run, the pserver processes must be stopped manually. Use the following command to find them:
-  >ps -ef | grep python
-
+
+>ps -ef | grep python
+
 - Data loading uses dataset mode, which is currently only supported on Linux.
diff --git a/PaddleRec/ctr/deepfm/README.md b/PaddleRec/ctr/deepfm/README.md
index 78061d32..a9847c80 100644
--- a/PaddleRec/ctr/deepfm/README.md
+++ b/PaddleRec/ctr/deepfm/README.md
@@ -15,7 +15,7 @@ This model implementation reproduces the result of the paper "DeepFM: A Factoriz
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Download and preprocess data
@@ -53,6 +53,8 @@ When the training set is iterated to the 22nd round, the testing Logloss is `0.4
 ## Distributed Train
 We emulate distributed training on a local machine. By default, we use a 2 x 2 layout, i.e. 2 pservers and 2 trainers.
+**Note: we suggest using Paddle >= 1.6 or [the latest Paddle](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) for distributed training.**
+
 ### Download and preprocess distributed demo dataset
 This small demo dataset (a few lines from the Criteo dataset) is only used to verify that distributed training runs.
 ```bash
@@ -78,7 +80,7 @@ other params explained in cluster_train.py
 Infer
 ```bash
-python infer.py --model_output_dir cluster_model --test_epoch 50 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
+python infer.py --model_output_dir cluster_model --test_epoch 10 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
 ```
 
 Notes:
@@ -87,7 +89,8 @@ Notes:
 - The first trainer (with trainer_id 0) saves the model params.
 - After each training run, pserver processes should be stopped manually. You can use the command below to find them:
-  >ps -ef | grep python
+
+>ps -ef | grep python
 
 - We use the Dataset API to load data; it is only supported on Linux for now.
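The 2 pserver x 2 trainer layout referenced in these READMEs is wired up by cluster_train.sh and cluster_train.py, which the patch only touches for default values. As a rough orientation, the sketch below shows how a Paddle 1.x program is typically split into pserver and trainer roles with `fluid.DistributeTranspiler`; the endpoints, role handling, and training loop are illustrative assumptions, not the exact code of this repository.

```python
# Minimal sketch (assumed structure, not the repo's cluster_train.py): splitting a
# Paddle 1.x program into pserver and trainer roles for a 2 pserver x 2 trainer run.
import paddle.fluid as fluid

def run_role(role, trainer_id, current_endpoint,
             pserver_endpoints="127.0.0.1:6170,127.0.0.1:6171", trainers=2):
    # The model and optimizer are assumed to have been built into the default
    # main/startup programs before this point.
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

    exe = fluid.Executor(fluid.CPUPlace())
    if role == "PSERVER":
        # Each pserver serves a shard of the parameters and blocks until killed,
        # which is why the READMEs say to stop the pserver processes manually.
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    else:
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())
        # ... feed batches and run trainer_prog for num_epoch epochs ...
```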
diff --git a/PaddleRec/ctr/deepfm/cluster_train.py b/PaddleRec/ctr/deepfm/cluster_train.py
index 95d7b5e3..23985ebe 100644
--- a/PaddleRec/ctr/deepfm/cluster_train.py
+++ b/PaddleRec/ctr/deepfm/cluster_train.py
@@ -38,7 +38,7 @@ def parse_args():
     parser.add_argument(
         '--num_epoch',
         type=int,
-        default=50,
+        default=10,
         help="The number of epochs to train (default: 50)")
     parser.add_argument(
         '--model_output_dir',
@@ -73,7 +73,7 @@ def parse_args():
     parser.add_argument(
         '--reg', type=float, default=1e-4, help=' (default: 1e-4)')
     parser.add_argument('--num_field', type=int, default=39)
-    parser.add_argument('--num_feat', type=int, default=135483)
+    parser.add_argument('--num_feat', type=int, default=141443)
     parser.add_argument('--use_gpu', type=int, default=1)
 
     # dist params
diff --git a/PaddleRec/ctr/deepfm/data/preprocess.py b/PaddleRec/ctr/deepfm/data/preprocess.py
index 81fb751d..1fa4a5fe 100644
--- a/PaddleRec/ctr/deepfm/data/preprocess.py
+++ b/PaddleRec/ctr/deepfm/data/preprocess.py
@@ -59,7 +59,7 @@ def get_feat_dict():
         for line_idx, line in enumerate(fin):
             if line_idx % 100000 == 0:
                 print('generating feature dict', line_idx / 45000000)
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -77,19 +77,9 @@ def get_feat_dict():
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open('train.txt', 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in dis_feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
+    for feat in dis_feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     # Save dictionary
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
diff --git a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
index 3a02db61..fd9739e0 100644
--- a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
+++ b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
@@ -33,7 +33,7 @@ def get_feat_dict():
     feat_cnt = Counter()
     with open(INPUT_FILE, 'r') as fin:
         for line_idx, line in enumerate(fin):
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -53,20 +53,9 @@ def get_feat_dict():
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open(INPUT_FILE, 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
-    # Save dictionary
+    for feat in feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
     print('args.num_feat ', len(feat_dict) + 1)
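Both preprocess scripts above drop the second pass over the raw file and instead assign feature ids directly from the frequency-filtered value set, which is what makes the dictionary size (and therefore the new --num_feat default of 141443) stable. The sketch below shows the resulting dictionary-building logic end to end; the field ranges follow the Criteo format and the frequency cutoff is an assumption, since those definitions sit outside the hunks shown here.

```python
# Sketch of the feature-dictionary build after this patch. Variable names follow the
# diff; the field ranges (Criteo format) and the cutoff value are assumptions, since
# their definitions are outside the hunks above.
from collections import Counter
import pickle

continuous_range_ = range(1, 14)    # 13 numeric fields
categorical_range_ = range(14, 40)  # 26 categorical fields
freq_cutoff = 10                    # assumed minimum count for a categorical value

def build_feat_dict(input_file, output_file):
    # Single pass: count every categorical value. rstrip fixes the earlier lstrip
    # typo, which left the trailing '\n' attached to the last field of every line.
    feat_cnt = Counter()
    with open(input_file, 'r') as fin:
        for line in fin:
            features = line.rstrip('\n').split('\t')
            for idx in categorical_range_:
                if features[idx] == '':
                    continue
                feat_cnt.update([features[idx]])

    # Keep only values that occur often enough.
    dis_feat_set = set(v for v, cnt in feat_cnt.items() if cnt >= freq_cutoff)

    # Assign ids: one per continuous field, then one per surviving categorical value.
    # The second scan of the raw file in the old code is no longer needed.
    feat_dict, tc = {}, 1
    for idx in continuous_range_:
        feat_dict[idx] = tc
        tc += 1
    for feat in dis_feat_set:
        feat_dict[feat] = tc
        tc += 1

    with open(output_file, 'wb') as fout:
        pickle.dump(feat_dict, fout, protocol=2)
    return len(feat_dict) + 1  # reported as args.num_feat by preprocess_dist.py
```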
diff --git a/PaddleRec/ctr/xdeepfm/README.md b/PaddleRec/ctr/xdeepfm/README.md
index fed81a06..9b2475cd 100644
--- a/PaddleRec/ctr/xdeepfm/README.md
+++ b/PaddleRec/ctr/xdeepfm/README.md
@@ -12,7 +12,7 @@
 sh download.sh
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Single-machine training
 ```bash
@@ -35,6 +35,8 @@ test_epoch is set to load the model from the 10th training epoch.
 
 ## Distributed training
 Run the command to emulate a multi-machine setup locally. By default a 2 x 2 layout is used, i.e. 2 pservers and 2 trainers.
+**Note: for distributed training, we recommend Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev).**
+
 The data is downloaded with the same command as above.
 ```bash
 sh cluster_train.sh
 ```
@@ -62,6 +64,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --use_gpu=0
 - Trainer 0 saves the model parameters.
 - After each training run, the pserver processes must be stopped manually. Use the following command to find them:
-  >ps -ef | grep python
+
+>ps -ef | grep python
 
 - Data loading uses dataset mode, which is currently only supported on Linux.
--
GitLab
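The notes above, like those in the dcn and deepfm READMEs, state that data loading uses Paddle's dataset mode and therefore only runs on Linux. For orientation, here is a minimal sketch of how that API is typically driven in Paddle 1.x; the slot variables, pipe command, batch size, thread count, and file list are placeholders rather than values taken from this repository.

```python
# Minimal sketch of Paddle 1.x "dataset mode" data loading (placeholder values;
# the real configuration lives in this repo's training scripts).
import paddle.fluid as fluid

def train_with_dataset(feed_vars, main_program, filelist):
    dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    dataset.set_use_var(feed_vars)        # variables the model reads each batch
    dataset.set_pipe_command("cat")       # placeholder; usually a reader script
    dataset.set_batch_size(1000)
    dataset.set_thread(10)
    dataset.set_filelist(filelist)        # shard files consumed by worker threads

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    # Dataset mode is the data-loading path the READMEs describe as Linux-only.
    exe.train_from_dataset(program=main_program, dataset=dataset, debug=False)
```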