From 246a353560176da84983bcc11100bee7d70a0677 Mon Sep 17 00:00:00 2001 From: yinhaofeng <1841837261@qq.com> Date: Sun, 27 Sep 2020 17:47:15 +0000 Subject: [PATCH] wide_deep --- models/rank/wide_deep/README.md | 193 ++++++++++++------ models/rank/wide_deep/config.yaml | 20 +- models/rank/wide_deep/data/args.py | 59 ++++++ models/rank/wide_deep/data/create_data.sh | 6 +- .../rank/wide_deep/data/data_preparation.py | 21 +- models/rank/wide_deep/data/get_slot_data.py | 2 +- models/rank/wide_deep/data/run.sh | 1 + 7 files changed, 218 insertions(+), 84 deletions(-) create mode 100644 models/rank/wide_deep/data/args.py diff --git a/models/rank/wide_deep/README.md b/models/rank/wide_deep/README.md index c32047c0..55c7cdd5 100644 --- a/models/rank/wide_deep/README.md +++ b/models/rank/wide_deep/README.md @@ -3,16 +3,19 @@ 以下是本例的简要目录结构及说明: ``` -├── data # 文档 - ├── train #训练数据 - ├── train_data.txt - ├── create_data.sh - ├── data_preparation.py - ├── get_slot_data.py - ├── run.sh +├── data # 数据 + ├── sample_data #示例数据 + ├── train #训练数据 + ├── train_data.txt + ├── create_data.sh #数据下载脚本 + ├── data_preparation.py #数据处理程序 + ├── get_slot_data.py #数据处理程序 + ├── run.sh #一键数据下载脚本 + ├── args.py ## 脚本参数 ├── __init__.py ├── config.yaml #配置文件 ├── model.py #模型文件 +├── README.md #文档 ``` 注:在阅读该示例前,建议您先了解以下内容: @@ -21,13 +24,13 @@ ## 内容 -- [模型简介](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#模型简介) -- [数据准备](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#数据准备) -- [运行环境](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#运行环境) -- [快速开始](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#快速开始) -- [论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#论文复现) -- [进阶使用](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#进阶使用) -- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#FAQ) +- [模型简介](#模型简介) +- 
[数据准备](#数据准备) +- [运行环境](#运行环境) +- [快速开始](#快速开始) +- [论文复现](#论文复现) +- [进阶使用](#进阶使用) +- [FAQ](#FAQ) ## 模型简介 @@ -36,12 +39,6 @@ 1. 效果上,在Google Play 进行线上A/B实验,wide&deep模型相比高度优化的Wide浅层模型,app下载率+3.9%。相比deep模型也有一定提升。 2. 性能上,通过切分一次请求需要处理的app 的Batch size为更小的size,并利用多线程并行请求达到提高处理效率的目的。单次响应耗时从31ms下降到14ms。 -本例在paddlepaddle上实现wide&deep并在开源数据集Census-income Data上验证模型效果,在测试集上的平均acc和auc分别为: - -> mean_acc: 0.76195 -> -> mean_auc: 0.90577 - 若进行精度验证,请参考[论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#论文复现)部分。 本项目支持功能 @@ -52,12 +49,44 @@ ## 数据准备 +本例在paddlerec上实现wide&deep并在开源数据集Census-income Data上验证模型效果 数据地址: [adult.data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data) [adult.test](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test) +您可以在进入models/rank/wide_deep/data目录,直接运行一键数据生成脚本run.sh获取数据。 +``` +sh run.sh +``` +在本例中需要调用pandas库,如环境中没有提前安装,可以使用命令 pip install pandas 安装。 + +运行的结果示例如下: +``` +--2020-09-27 16:57:38-- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data +Resolving archive.ics.uci.edu... 128.195.10.252 +Connecting to archive.ics.uci.edu|128.195.10.252|:443... connected. +HTTP request sent, awaiting response... 200 OK +Length: 3974305 (3.8M) [application/x-httpd-php] +Saving to: data/adult.data + +100%[===================================================================================================================>] 3,974,305 12.6K/s in 6m 17s + +2020-09-27 17:03:57 (10.3 KB/s) - data/adult.data saved [3974305/3974305] + +--2020-09-27 17:03:57-- https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test +Resolving archive.ics.uci.edu... 128.195.10.252 +Connecting to archive.ics.uci.edu|128.195.10.252|:443... connected. +HTTP request sent, awaiting response... 
200 OK +Length: 2003153 (1.9M) [application/x-httpd-php] +Saving to: data/adult.test + +100%[==================================================================================================================>] 2,003,153 12.7K/s in 51s + +2020-09-27 17:08:04 (13.5 KB/s) - data/adult.test saved [2003153/2003153] +``` + ## 运行环境 PaddlePaddle>=1.7.2 @@ -68,60 +97,98 @@ PaddleRec >=0.1 os : windows/linux/macos + ## 快速开始 +本文提供了样例数据可以供您快速体验,在paddlerec目录下执行下面的命令即可快速启动训练: -### 单机训练 - -CPU环境 - -在config.yaml文件中设置好设备,epochs等。 - -```sh -dataset: - - name: sample_1 - type: QueueDataset - batch_size: 5 - data_path: "{workspace}/data/sample_data/train" - sparse_slots: "label" - dense_slots: "wide_input:8 deep_input:58" - - name: infer_sample - type: QueueDataset - batch_size: 5 - data_path: "{workspace}/data/sample_data/train" - sparse_slots: "label" - dense_slots: "wide_input:8 deep_input:58" ``` - -### 单机预测 - -CPU环境 - -在config.yaml文件中设置好epochs、device等参数。 - +python -m paddlerec.run -m models/rank/wide_deep/config.yaml ``` - - name: infer_runner - class: infer - device: cpu - init_model_path: "increment/0" +使用样例数据快速跑通的结果实例: +``` +PaddleRec: Runner train_runner Begin +Executor Mode: train +processor_register begin +Running SingleInstance. +Running SingleNetwork. +Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:[] +Running SingleStartup. +Running SingleRunner. +I0927 17:16:18.305258 3437 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel. 
+I0927 17:16:18.310783 3437 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1 +I0927 17:16:18.314724 3437 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True +I0927 17:16:18.317752 3437 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0 +2020-09-27 17:16:18,475-INFO: [Train] batch: 20, time_each_interval: 0.18s, ACC: [0.6], BATCH_AUC: [0.41666667], AUC: [0.61538462] +2020-09-27 17:16:18,583-INFO: [Train] batch: 40, time_each_interval: 0.11s, ACC: [0.8], BATCH_AUC: [0.875], AUC: [0.59693471] +2020-09-27 17:16:18,625-INFO: [Train] batch: 60, time_each_interval: 0.04s, ACC: [0.4], BATCH_AUC: [1.], AUC: [0.59405999] +2020-09-27 17:16:18,666-INFO: [Train] batch: 80, time_each_interval: 0.04s, ACC: [0.8], BATCH_AUC: [0.5], AUC: [0.56687606] +epoch 0 done, use time: 0.503633022308, global metrics: ACC=[1.], BATCH_AUC=[0.], AUC=[0.56696623] +PaddleRec Finish ``` - ## 论文复现 - -用原论文的完整数据复现论文效果需要在config.yaml中修改batch_size=40, thread_num=8, epoch_num=40 - -本例在paddlepaddle上实现wide&deep并在开源数据集Census-income Data上验证模型效果,在测试集上的平均acc和auc分别为: - -mean_acc: 0.76195 , mean_auc: 0.90577 - - -修改后运行方案:修改config.yaml中的'workspace'为config.yaml的目录位置,执行 +为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。 +在全量数据下模型的指标如下: + +| 模型 | auc | acc | batch_size | thread_num| epoch_num| Time of each epoch | +| :------| :------ | :------ | :------| :------ | :------| :------ | +| wide_deep | 0.8987 | 0.775 | 40 | 1 | 80 | 约10s | +1. 确认您当前所在目录为PaddleRec/models/rank/wide_deep +2. 在data目录下运行数据一键处理脚本,命令如下: +``` +cd data +sh run.sh +cd .. +``` +3. 退回wide_deep目录中,打开文件config.yaml,更改其中的参数 +将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径) +将sample_1中的batch_size从5改为40 +将sample_1中的data_path改为{workspace}/data/slot_train_data +将infer_sample中的batch_size从5改为40 +将infer_sample中的data_path改为{workspace}/data/slot_test_data +将train_runner中的epochs改为80 +将infer_runner中的init_model_path改为increment/79 +4. 
运行命令,模型会进行80个epoch的训练,然后预测最后一个epoch,并获得相应auc和acc指标 +``` +python -m paddlerec.run -m ./config.yaml +``` +5. 经过全量数据训练后,执行预测的结果示例如下: ``` -python -m paddlerec.run -m /home/your/dir/config.yaml #调试模式 直接指定本地config的绝对路径 +PaddleRec: Runner infer_runner Begin +Executor Mode: infer +processor_register begin +Running SingleInstance. +Running SingleNetwork. +Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:[] +Running SingleInferStartup. +Running SingleInferRunner. +load persistables from increment/79 +2020-09-27 17:37:17,679-INFO: [Infer] batch: 20, time_each_interval: 0.77s, ACC: [0.8], AUC: [0.89880283] +2020-09-27 17:37:18,452-INFO: [Infer] batch: 40, time_each_interval: 0.77s, ACC: [0.825], AUC: [0.89879974] +2020-09-27 17:37:19,023-INFO: [Infer] batch: 60, time_each_interval: 0.57s, ACC: [0.7], AUC: [0.89880376] +2020-09-27 17:37:19,591-INFO: [Infer] batch: 80, time_each_interval: 0.57s, ACC: [0.925], AUC: [0.89879592] +2020-09-27 17:37:20,195-INFO: [Infer] batch: 100, time_each_interval: 0.60s, ACC: [0.725], AUC: [0.89879213] +2020-09-27 17:37:20,822-INFO: [Infer] batch: 120, time_each_interval: 0.63s, ACC: [0.775], AUC: [0.89879757] +2020-09-27 17:37:21,303-INFO: [Infer] batch: 140, time_each_interval: 0.48s, ACC: [0.775], AUC: [0.89879296] +2020-09-27 17:37:21,798-INFO: [Infer] batch: 160, time_each_interval: 0.49s, ACC: [0.875], AUC: [0.89879267] +2020-09-27 17:37:22,265-INFO: [Infer] batch: 180, time_each_interval: 0.47s, ACC: [0.85], AUC: [0.89879272] +2020-09-27 17:37:22,835-INFO: [Infer] batch: 200, time_each_interval: 0.57s, ACC: [0.725], AUC: [0.89878928] +2020-09-27 17:37:23,364-INFO: [Infer] batch: 220, time_each_interval: 0.53s, ACC: [0.825], AUC: [0.89878807] +2020-09-27 17:37:23,859-INFO: [Infer] batch: 240, time_each_interval: 0.49s, ACC: [0.7], AUC: [0.8987825] +2020-09-27 17:37:24,337-INFO: [Infer] batch: 260, time_each_interval: 0.48s, ACC: [0.775], AUC: [0.89878314] +2020-09-27 17:37:24,877-INFO: 
[Infer] batch: 280, time_each_interval: 0.54s, ACC: [0.875], AUC: [0.89877827] +2020-09-27 17:37:25,410-INFO: [Infer] batch: 300, time_each_interval: 0.53s, ACC: [0.75], AUC: [0.89877518] +2020-09-27 17:37:25,985-INFO: [Infer] batch: 320, time_each_interval: 0.57s, ACC: [0.75], AUC: [0.89876936] +2020-09-27 17:37:26,447-INFO: [Infer] batch: 340, time_each_interval: 0.46s, ACC: [0.775], AUC: [0.89876268] +2020-09-27 17:37:26,725-INFO: [Infer] batch: 360, time_each_interval: 0.28s, ACC: [0.75], AUC: [0.8987574] +2020-09-27 17:37:26,889-INFO: [Infer] batch: 380, time_each_interval: 0.16s, ACC: [0.8], AUC: [0.89874688] +2020-09-27 17:37:27,065-INFO: [Infer] batch: 400, time_each_interval: 0.18s, ACC: [0.8], AUC: [0.89875484] +Infer infer_phase of epoch increment/79 done, use time: 10.2139520645, global metrics: ACC=[0.775], AUC=[0.89875217] +PaddleRec Finish ``` ## 进阶使用 -## FAQ +## FAQ diff --git a/models/rank/wide_deep/config.yaml b/models/rank/wide_deep/config.yaml index d1da03e7..2b3bf5ec 100755 --- a/models/rank/wide_deep/config.yaml +++ b/models/rank/wide_deep/config.yaml @@ -19,13 +19,13 @@ workspace: "models/rank/wide_deep" dataset: - name: sample_1 - type: QueueDataset + type: DataLoader # or QueueDataset batch_size: 5 data_path: "{workspace}/data/sample_data/train" sparse_slots: "label" dense_slots: "wide_input:8 deep_input:58" - name: infer_sample - type: QueueDataset + type: DataLoader # or QueueDataset batch_size: 5 data_path: "{workspace}/data/sample_data/train" sparse_slots: "label" @@ -33,14 +33,14 @@ dataset: hyper_parameters: optimizer: - class: SGD - learning_rate: 0.0001 + class: adam + learning_rate: 0.001 hidden1_units: 75 hidden2_units: 50 hidden3_units: 25 -mode: train_runner +mode: train_runner # if infer, change mode to "infer_runner" and change phase to "infer_phase" runner: @@ -53,17 +53,19 @@ runner: save_inference_interval: 1 save_checkpoint_path: "increment" save_inference_path: "inference" + phases: phase1 - name: infer_runner class: 
infer device: cpu init_model_path: "increment/0" + phases: infer_phase phase: - name: phase1 model: "{workspace}/model.py" dataset_name: sample_1 thread_num: 1 -#- name: infer_phase -# model: "{workspace}/model.py" -# dataset_name: infer_sample -# thread_num: 1 +- name: infer_phase + model: "{workspace}/model.py" + dataset_name: infer_sample + thread_num: 1 diff --git a/models/rank/wide_deep/data/args.py b/models/rank/wide_deep/data/args.py new file mode 100644 index 00000000..297e9050 --- /dev/null +++ b/models/rank/wide_deep/data/args.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util +import sys + + +def parse_args(): + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=40, help="epochs") + parser.add_argument( + "--batch_size", type=int, default=40, help="batch_size") + parser.add_argument( + '--use_gpu', type=int, default=0, help='whether using gpu') + parser.add_argument( + '--test_epoch', type=str, default='39', help='test_epoch') + parser.add_argument( + '--train_path', type=str, default='data/adult.data', help='train_path') + parser.add_argument( + '--test_path', type=str, default='data/adult.test', help='test_path') + parser.add_argument( + '--train_data_path', + type=str, + default='train_data/train_data.csv', + help='train_data_path') + parser.add_argument( + '--test_data_path', + type=str, + default='test_data/test_data.csv', + help='test_data_path') + parser.add_argument( + '--model_dir', type=str, default='model_dir', help='model_dir') + parser.add_argument( + '--hidden1_units', type=int, default=75, help='hidden1_units') + parser.add_argument( + '--hidden2_units', type=int, default=50, help='hidden2_units') + parser.add_argument( + '--hidden3_units', type=int, default=25, help='hidden3_units') + + args = parser.parse_args() + + return args diff --git a/models/rank/wide_deep/data/create_data.sh b/models/rank/wide_deep/data/create_data.sh index daf60cea..dd161d5e 100755 --- a/models/rank/wide_deep/data/create_data.sh +++ b/models/rank/wide_deep/data/create_data.sh @@ -1,11 +1,11 @@ mkdir train_data mkdir test_data -train_path="adult.data" -test_path="adult.test" +train_path="data/adult.data" +test_path="data/adult.test" train_data_path="./train_data/train_data.csv" test_data_path="./test_data/test_data.csv" -pip install -r requirements.txt +#pip install -r requirements.txt wget -P data/ 
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test diff --git a/models/rank/wide_deep/data/data_preparation.py b/models/rank/wide_deep/data/data_preparation.py index 88507009..51661398 100644 --- a/models/rank/wide_deep/data/data_preparation.py +++ b/models/rank/wide_deep/data/data_preparation.py @@ -109,6 +109,7 @@ def build_model_columns(train_data_path, test_data_path): train_df[categorical_columns], columns=categorical_columns) test_df_temp = pd.get_dummies( test_df[categorical_columns], columns=categorical_columns) + train_df = train_df.join(train_df_temp) test_df = test_df.join(test_df_temp) @@ -121,22 +122,26 @@ def build_model_columns(train_data_path, test_data_path): lambda x: 1 if x == '>50K' else 0) test_df['label'] = test_df['income_bracket'].apply( lambda x: 1 if x == '>50K' else 0) - + ''' with io.open('train_data/columns.txt', 'w') as f: write_str = str(len(wide_columns)) + '\n' + str(len( deep_columns)) + '\n' - f.write(write_str) + f.write(u"{}".format(write_str)) f.close() with io.open('test_data/columns.txt', 'w') as f: write_str = str(len(wide_columns)) + '\n' + str(len( deep_columns)) + '\n' - f.write(write_str) - f.close() + f.write(u"{}".format(write_str)) - train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv( - train_data_path, index=False) - test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv( - test_data_path, index=False) + f.close() + ''' + + train_df = train_df[wide_columns + deep_columns + ['label']] + train_df = train_df.fillna(0).astype(int) + train_df.to_csv(train_data_path, index=False, header=0) + test_df = test_df[wide_columns + deep_columns + ['label']] + test_df = test_df.fillna(0).astype(int) + test_df.to_csv(test_data_path, index=False, header=0) def clean_file(train_path, test_path, train_data_path, test_data_path): diff --git a/models/rank/wide_deep/data/get_slot_data.py 
b/models/rank/wide_deep/data/get_slot_data.py index 5f873a30..962cd880 100755 --- a/models/rank/wide_deep/data/get_slot_data.py +++ b/models/rank/wide_deep/data/get_slot_data.py @@ -29,7 +29,7 @@ class Reader(dg.MultiSlotDataGenerator): def _process_line(self, line): line = line.strip().split(',') - features = list(map(float, line)) + features = list(map(int, map(float, line))) wide_feat = features[0:8] deep_feat = features[8:58 + 8] label = features[-1] diff --git a/models/rank/wide_deep/data/run.sh b/models/rank/wide_deep/data/run.sh index 7b4fb849..e6a3363c 100644 --- a/models/rank/wide_deep/data/run.sh +++ b/models/rank/wide_deep/data/run.sh @@ -7,6 +7,7 @@ do done mkdir slot_test_data + for i in `ls ./test_data` do cat test_data/$i | python get_slot_data.py > slot_test_data/$i -- GitLab