diff --git a/PaddleRec/ctr/dcn/README.md b/PaddleRec/ctr/dcn/README.md
index 6fbf0ad9cf16cb2dbbbe154a2818e7704f68dc29..560acee8ab535d05831275e036d1fcfe541bd555 100644
--- a/PaddleRec/ctr/dcn/README.md
+++ b/PaddleRec/ctr/dcn/README.md
@@ -23,7 +23,7 @@
 The DCN model is described in the paper [Deep & Cross Network for Ad Click Predictions](https://arxiv.org/abs/1708.05123)
 
 ## Environment
-- PaddlePaddle 1.5.2
+- PaddlePaddle 1.6
 
 ## Download data
 
@@ -72,6 +72,9 @@ loss: [0.44703564] auc_val: [0.80654419]
 cd dist_data && sh dist_download.sh && cd ..
 ```
 Run the command below to simulate distributed training locally. By default a 2 X 2 layout is used, i.e. 2 pservers and 2 trainers.
+
+**Note: for distributed training, Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev), is recommended.**
+
 ```bash
 sh cluster_train.sh
 ```
@@ -98,6 +101,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --test_valid_da
 
 - Trainer 0 saves the model parameters
 - After each training run, the pserver processes must be stopped manually. Use the following command to check the pserver processes:
- >ps -ef | grep python
-
+
+>ps -ef | grep python
+
 - Data loading uses the dataset mode, which is currently supported only on Linux
diff --git a/PaddleRec/ctr/deepfm/README.md b/PaddleRec/ctr/deepfm/README.md
index 78061d324aea64289d1a4780d720d2cc102e6942..a9847c8073913cd102d3b18a19ea52b53a1922af 100644
--- a/PaddleRec/ctr/deepfm/README.md
+++ b/PaddleRec/ctr/deepfm/README.md
@@ -15,7 +15,7 @@ This model implementation reproduces the result of the paper "DeepFM: A Factoriz
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Download and preprocess data
 
@@ -53,6 +53,8 @@ When the training set is iterated to the 22nd round, the testing Logloss is `0.4
 
 ## Distributed Train
 We emulate distributed training on a local machine. By default, we use 2 X 2, i.e. 2 pservers X 2 trainers.
+**Note: we recommend using Paddle >= 1.6 or [the latest Paddle](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) for distributed training.**
+
 ### Download and preprocess distributed demo dataset
 This small demo dataset (a few lines from the Criteo dataset) is only used to verify that distributed training runs.
 ```bash
@@ -78,7 +80,7 @@ other params explained in cluster_train.py
 
 Infer
 ```bash
-python infer.py --model_output_dir cluster_model --test_epoch 50 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
+python infer.py --model_output_dir cluster_model --test_epoch 10 --test_data_dir=dist_data/dist_test_data --feat_dict='dist_data/aid_data/feat_dict_10.pkl2'
 ```
 
 Notes:
@@ -87,7 +89,8 @@ Notes:
 
 - The first trainer (with trainer_id 0) saves the model params.
 - After each training run, pserver processes should be stopped manually. You can use the command below:
- >ps -ef | grep python
+
+>ps -ef | grep python
 
 - We use the Dataset API to load data; it is only supported on Linux for now.
 
diff --git a/PaddleRec/ctr/deepfm/cluster_train.py b/PaddleRec/ctr/deepfm/cluster_train.py
index 95d7b5e3e7653433d47fb88a398a226329a36b2e..23985ebefab5d2ae0f1e92a32bb86dc0fd63c14e 100644
--- a/PaddleRec/ctr/deepfm/cluster_train.py
+++ b/PaddleRec/ctr/deepfm/cluster_train.py
@@ -38,7 +38,7 @@ def parse_args():
     parser.add_argument(
         '--num_epoch',
         type=int,
-        default=50,
+        default=10,
         help="The number of epochs to train (default: 50)")
     parser.add_argument(
         '--model_output_dir',
@@ -73,7 +73,7 @@
     parser.add_argument(
         '--reg', type=float, default=1e-4, help=' (default: 1e-4)')
     parser.add_argument('--num_field', type=int, default=39)
-    parser.add_argument('--num_feat', type=int, default=135483)
+    parser.add_argument('--num_feat', type=int, default=141443)
     parser.add_argument('--use_gpu', type=int, default=1)
 
     # dist params
diff --git a/PaddleRec/ctr/deepfm/data/preprocess.py b/PaddleRec/ctr/deepfm/data/preprocess.py
index 81fb751dbb1cee11be639ff6e9766e053cea398c..1fa4a5feae17bde64463d2f05beb3d053284dcda 100644
--- a/PaddleRec/ctr/deepfm/data/preprocess.py
+++ b/PaddleRec/ctr/deepfm/data/preprocess.py
@@ -59,7 +59,7 @@ def get_feat_dict():
         for line_idx, line in enumerate(fin):
             if line_idx % 100000 == 0:
                 print('generating feature dict', line_idx / 45000000)
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -77,19 +77,9 @@
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open('train.txt', 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in dis_feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
+    for feat in dis_feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     # Save dictionary
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
diff --git a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
index 3a02db6199ba5cc9f13d4085a1c0b756e9bbb9fe..fd9739e0cb7071f9689e04731d6b70b7413840f6 100644
--- a/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
+++ b/PaddleRec/ctr/deepfm/dist_data/preprocess_dist.py
@@ -33,7 +33,7 @@ def get_feat_dict():
     feat_cnt = Counter()
     with open(INPUT_FILE, 'r') as fin:
         for line_idx, line in enumerate(fin):
-            features = line.lstrip('\n').split('\t')
+            features = line.rstrip('\n').split('\t')
             for idx in categorical_range_:
                 if features[idx] == '': continue
                 feat_cnt.update([features[idx]])
@@ -53,20 +53,9 @@
     for idx in continuous_range_:
         feat_dict[idx] = tc
         tc += 1
-    # Discrete features
-    cnt_feat_set = set()
-    with open(INPUT_FILE, 'r') as fin:
-        for line_idx, line in enumerate(fin):
-            features = line.rstrip('\n').split('\t')
-            for idx in categorical_range_:
-                if features[idx] == '' or features[idx] not in feat_set:
-                    continue
-                if features[idx] not in cnt_feat_set:
-                    cnt_feat_set.add(features[idx])
-                    feat_dict[features[idx]] = tc
-                    tc += 1
-
-    # Save dictionary
+    for feat in feat_set:
+        feat_dict[feat] = tc
+        tc += 1
     with open(dir_feat_dict_, 'wb') as fout:
         pickle.dump(feat_dict, fout, protocol=2)
     print('args.num_feat ', len(feat_dict) + 1)
diff --git a/PaddleRec/ctr/xdeepfm/README.md b/PaddleRec/ctr/xdeepfm/README.md
index fed81a06ec1425a93fc6e5270c024e1dc4571a79..9b2475cd789db6e298db31194713e27064849d8b 100644
--- a/PaddleRec/ctr/xdeepfm/README.md
+++ b/PaddleRec/ctr/xdeepfm/README.md
@@ -12,7 +12,7 @@ sh download.sh
 ```
 
 ## Environment
-- PaddlePaddle 1.5
+- PaddlePaddle 1.6
 
 ## Single-machine training
 ```bash
@@ -35,6 +35,8 @@ test_epoch is set to load the model saved after the 10th training epoch.
 ## Distributed training
 Run the command below to simulate distributed training locally. By default the 2 X 2 mode is used, i.e. 2 pservers and 2 trainers.
+**Note: for distributed training, Paddle 1.6 or later, or the [latest build](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev), is recommended.**
+
 Download the data with the same commands as above.
 ```bash
 sh cluster_train.sh
 ```
@@ -62,6 +64,7 @@ python infer.py --model_output_dir cluster_model --test_epoch 10 --use_gpu=0
 
 - Trainer 0 saves the model parameters
 - After each training run, the pserver processes must be stopped manually. Use the following command to check the pserver processes:
- >ps -ef | grep python
+
+>ps -ef | grep python
 
 - Data loading uses the dataset mode, which is currently supported only on Linux
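
For reference, the feature-dictionary construction that `preprocess.py` and `preprocess_dist.py` perform after this change can be sketched as a small standalone script. The step that builds the frequent-feature set from `feat_cnt` and the exact paths are outside the hunks above, so `TRAIN_FILE`, `FEAT_DICT_FILE` and `FREQ_THRESHOLD` below are assumptions (the threshold is inferred from the `feat_dict_10.pkl2` file name):

```python
# Minimal standalone sketch, not the repository's exact script: paths and the
# frequency threshold are assumptions; the real scripts hard-code their own.
import pickle
from collections import Counter

TRAIN_FILE = 'train.txt'             # Criteo format: label + 13 numeric + 26 categorical, tab-separated
FEAT_DICT_FILE = 'feat_dict_10.pkl2'
FREQ_THRESHOLD = 10                  # assumed, inferred from the feat_dict_10.pkl2 file name

continuous_range_ = range(1, 14)     # columns 1..13: continuous features
categorical_range_ = range(14, 40)   # columns 14..39: categorical features

# 1. Count how often each categorical value occurs in the training file.
feat_cnt = Counter()
with open(TRAIN_FILE, 'r') as fin:
    for line in fin:
        features = line.rstrip('\n').split('\t')
        for idx in categorical_range_:
            if features[idx] != '':
                feat_cnt.update([features[idx]])

# 2. Keep only the categorical values seen at least FREQ_THRESHOLD times.
dis_feat_set = {feat for feat, cnt in feat_cnt.items() if cnt >= FREQ_THRESHOLD}

# 3. Assign one index per continuous column, then one per retained categorical value.
feat_dict = {}
tc = 1
for idx in continuous_range_:
    feat_dict[idx] = tc
    tc += 1
for feat in dis_feat_set:
    feat_dict[feat] = tc
    tc += 1

# 4. Save the dictionary; the printed size is what --num_feat is expected to match.
with open(FEAT_DICT_FILE, 'wb') as fout:
    pickle.dump(feat_dict, fout, protocol=2)
print('args.num_feat ', len(feat_dict) + 1)
```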