From 99a02c0364c143249f32bcd89aace119d84da3f2 Mon Sep 17 00:00:00 2001
From: overlordmax <37664905+overlordmax@users.noreply.github.com>
Date: Wed, 22 Apr 2020 13:38:26 +0800
Subject: [PATCH] Wide deep 04221156 (#4556)

* fix bugs

* fix bugs

* add wide_deep

* fix code style

* fix code style
---
 PaddleRec/wide_deep/README.md           | 189 ++++++++++++++++++++++++
 PaddleRec/wide_deep/args.py             |  42 ++++++
 PaddleRec/wide_deep/create_data.sh      |  17 +++
 PaddleRec/wide_deep/data_preparation.py | 109 ++++++++++++++
 PaddleRec/wide_deep/infer.py            |  75 ++++++++++
 PaddleRec/wide_deep/infer_cpu.sh        |   9 ++
 PaddleRec/wide_deep/infer_gpu.sh        |   9 ++
 PaddleRec/wide_deep/net.py              |  66 +++++++++
 PaddleRec/wide_deep/requirements.txt    | 132 +++++++++++++++++
 PaddleRec/wide_deep/train.py            |  47 ++++++
 PaddleRec/wide_deep/train_cpu.sh        |   9 ++
 PaddleRec/wide_deep/train_gpu.sh        |   8 +
 PaddleRec/wide_deep/utils.py            |  31 ++++
 13 files changed, 743 insertions(+)
 create mode 100644 PaddleRec/wide_deep/README.md
 create mode 100644 PaddleRec/wide_deep/args.py
 create mode 100644 PaddleRec/wide_deep/create_data.sh
 create mode 100644 PaddleRec/wide_deep/data_preparation.py
 create mode 100644 PaddleRec/wide_deep/infer.py
 create mode 100644 PaddleRec/wide_deep/infer_cpu.sh
 create mode 100644 PaddleRec/wide_deep/infer_gpu.sh
 create mode 100644 PaddleRec/wide_deep/net.py
 create mode 100644 PaddleRec/wide_deep/requirements.txt
 create mode 100644 PaddleRec/wide_deep/train.py
 create mode 100644 PaddleRec/wide_deep/train_cpu.sh
 create mode 100644 PaddleRec/wide_deep/train_gpu.sh
 create mode 100644 PaddleRec/wide_deep/utils.py

diff --git a/PaddleRec/wide_deep/README.md b/PaddleRec/wide_deep/README.md
new file mode 100644
index 00000000..885aceb9
--- /dev/null
+++ b/PaddleRec/wide_deep/README.md
@@ -0,0 +1,189 @@
+# wide&deep
+
+Directory layout of this example:
+
+```
+├── README.md             # this document
+├── requirements.txt      # required packages
+├── net.py                # wide&deep network definition
+├── utils.py              # shared helpers
+├── args.py               # argument parsing
+├── create_data.sh        # training-data generation script
+├── data_preparation.py   # data preprocessing script
+├── train.py              # training entry point
+├── infer.py              # inference entry point
+├── train_gpu.sh          # GPU training script
+├── train_cpu.sh          # CPU training script
+├── infer_gpu.sh          # GPU inference script
+├── infer_cpu.sh          # CPU inference script
+```
+
+## Introduction
+
+[Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792.pdf) is a recommendation framework published by Google in 2016. wide&deep jointly trains a shallow (wide) linear model together with a deep neural network, combining the memorization ability of the wide part with the generalization ability of the deep part, so that a single model serves both the accuracy and the scalability needs of a recommender system. The paper evaluates it from two angles, recommendation quality and serving performance:
+
+1. Quality: in an online A/B experiment on Google Play, wide&deep raised the app acquisition rate by 3.9% relative to a highly optimized wide-only model, and it also outperformed a deep-only model.
+2. Performance: by splitting the apps scored in one request into smaller batches and issuing those batches in parallel across multiple threads, single-request latency dropped from 31 ms to 14 ms.
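+
+The gist of the fusion as implemented in net.py: the wide part and the deep part each produce a scalar logit, the two logits are summed, and a sigmoid maps the sum to a probability. The sketch below is illustrative only; the tensor shapes (8 wide features, 58 deep features) and hidden sizes match this example's defaults, while the full definition with initializers, regularization, and metrics lives in net.py:
+
+```python
+import paddle.fluid as fluid
+
+# Feature placeholders, sized for this example's preprocessed data.
+wide_input = fluid.data(name='wide_input', shape=[None, 8], dtype='float32')
+deep_input = fluid.data(name='deep_input', shape=[None, 58], dtype='float32')
+
+# Wide part: one linear layer over the base and crossed features.
+wide_logit = fluid.layers.fc(input=wide_input, size=1, act=None)
+
+# Deep part: a three-layer MLP over the dense and one-hot features.
+h = fluid.layers.fc(input=deep_input, size=75, act='relu')
+h = fluid.layers.fc(input=h, size=50, act='relu')
+h = fluid.layers.fc(input=h, size=25, act='relu')
+deep_logit = fluid.layers.fc(input=h, size=1, act=None)
+
+# Joint prediction: sum the two logits, then squash with a sigmoid.
+prediction = fluid.layers.sigmoid(fluid.layers.elementwise_add(wide_logit, deep_logit))
+```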
+
+This example implements wide&deep on PaddlePaddle and validates it on the public Census-Income dataset. Average accuracy and AUC on the test set:
+
+> mean_acc: 0.76195
+>
+> mean_auc: 0.90577
+
+## Data download and preprocessing
+
+Data locations:
+
+[adult.data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
+
+[adult.test](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test)
+
+Set the file paths in create_data.sh, then run the script:
+
+```sh
+mkdir train_data
+mkdir test_data
+mkdir data
+train_path="data/adult.data"                 # raw training data
+test_path="data/adult.test"                  # raw test data
+train_data_path="train_data/train_data.csv"  # preprocessed training data
+test_data_path="test_data/test_data.csv"     # preprocessed test data
+
+pip install -r requirements.txt  # install the required packages
+
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
+
+python data_preparation.py --train_path ${train_path} \
+                           --test_path ${test_path} \
+                           --train_data_path ${train_data_path} \
+                           --test_data_path ${test_data_path}
+
+```
+
+## Environment
+
+PaddlePaddle 1.7.0
+
+python3.7
+
+## Single-machine training
+
+GPU environment
+
+Set the data paths and hyperparameters in train_gpu.sh: epochs is the number of training epochs, batch_size the mini-batch size, use_gpu selects GPU training, train_data_path points at the preprocessed training data, model_dir is where checkpoints are saved, and hidden1_units/hidden2_units/hidden3_units are the hidden-layer sizes of the deep part.
+
+```sh
+CUDA_VISIBLE_DEVICES=0 python train.py --epochs 40 \
+                                       --batch_size 40 \
+                                       --use_gpu 1 \
+                                       --train_data_path 'train_data/train_data.csv' \
+                                       --model_dir 'model_dir' \
+                                       --hidden1_units 75 \
+                                       --hidden2_units 50 \
+                                       --hidden3_units 25
+
+```
+
+Make the script executable and run it:
+
+```sh
+./train_gpu.sh
+```
+
+CPU environment
+
+Set the data paths and hyperparameters in train_cpu.sh; the flags are the same except that use_gpu is 0.
+
+```sh
+python train.py --epochs 40 \
+                --batch_size 40 \
+                --use_gpu 0 \
+                --train_data_path 'train_data/train_data.csv' \
+                --model_dir 'model_dir' \
+                --hidden1_units 75 \
+                --hidden2_units 50 \
+                --hidden3_units 25
+
+```
+
+Make the script executable and run it:
+
+```sh
+./train_cpu.sh
+```
+
+## Single-machine inference
+
+GPU environment
+
+Set the data paths and hyperparameters in infer_gpu.sh. test_epoch selects which epoch's checkpoint to evaluate.
+
+```sh
+CUDA_VISIBLE_DEVICES=0 python infer.py --batch_size 40 \
+                                       --use_gpu 1 \
+                                       --test_epoch 39 \
+                                       --test_data_path 'test_data/test_data.csv' \
+                                       --model_dir 'model_dir' \
+                                       --hidden1_units 75 \
+                                       --hidden2_units 50 \
+                                       --hidden3_units 25
+
+```
+
+Make the script executable and run it:
+
+```sh
+./infer_gpu.sh
+```
+
+CPU environment
+
+Set the data paths and hyperparameters in infer_cpu.sh.
+
+```sh
+python infer.py --batch_size 40 \
+                --use_gpu 0 \
+                --test_epoch 39 \
+                --test_data_path 'test_data/test_data.csv' \
+                --model_dir 'model_dir' \
+                --hidden1_units 75 \
+                --hidden2_units 50 \
+                --hidden3_units 25
+
+```
+
+Make the script executable and run it:
+
+```sh
+./infer_cpu.sh
+```
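+
+For reference, each row of the preprocessed CSV holds 8 wide features, then 58 deep features, then a binary label, and the reader in utils.py slices rows in exactly that order. A minimal sketch of the row layout (the file path is just an example):
+
+```python
+import csv
+
+with open('test_data/test_data.csv') as f:
+    reader = csv.reader(f)
+    next(reader)  # the first row is the pandas column header
+    row = list(map(float, next(reader)))
+    # 8 wide features, 58 deep features, label last.
+    wide_feat, deep_feat, label = row[:8], row[8:66], row[-1]
+    print(len(wide_feat), len(deep_feat), label)  # 8 58 0.0 (or 1.0)
+```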
+
+## Model performance
+
+Results on the test set:
+
+```
+W0422 11:44:50.891095  1573 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 9.2, Runtime API Version: 9.0
+W0422 11:44:50.895593  1573 device_context.cc:245] device: 0, cuDNN Version: 7.3.
+2020-04-22 11:44:52,236-INFO: batch_id: 0, batch_time: 0.00613s, acc: 0.72500, auc: 0.92790
+2020-04-22 11:44:52,242-INFO: batch_id: 1, batch_time: 0.00467s, acc: 0.80000, auc: 0.93356
+2020-04-22 11:44:52,247-INFO: batch_id: 2, batch_time: 0.00462s, acc: 0.82500, auc: 0.93372
+2020-04-22 11:44:52,252-INFO: batch_id: 3, batch_time: 0.00445s, acc: 0.75000, auc: 0.94198
+2020-04-22 11:44:52,257-INFO: batch_id: 4, batch_time: 0.00449s, acc: 0.67500, auc: 0.93222
+2020-04-22 11:44:52,262-INFO: batch_id: 5, batch_time: 0.00444s, acc: 0.80000, auc: 0.92254
+......
+2020-04-22 11:44:54,439-INFO: batch_id: 400, batch_time: 0.00507s, acc: 0.80000, auc: 0.90650
+2020-04-22 11:44:54,445-INFO: batch_id: 401, batch_time: 0.00512s, acc: 0.67500, auc: 0.90658
+2020-04-22 11:44:54,452-INFO: batch_id: 402, batch_time: 0.00591s, acc: 0.72500, auc: 0.90638
+2020-04-22 11:44:54,458-INFO: batch_id: 403, batch_time: 0.00518s, acc: 0.80000, auc: 0.90634
+2020-04-22 11:44:54,464-INFO: batch_id: 404, batch_time: 0.00513s, acc: 0.72500, auc: 0.90619
+2020-04-22 11:44:54,470-INFO: batch_id: 405, batch_time: 0.00497s, acc: 0.77500, auc: 0.90597
+2020-04-22 11:44:54,476-INFO: batch_id: 406, batch_time: 0.00554s, acc: 0.77500, auc: 0.90606
+2020-04-22 11:44:54,481-INFO: batch_id: 407, batch_time: 0.00471s, acc: 0.00000, auc: 0.90608
+2020-04-22 11:44:54,481-INFO: mean_acc:0.76195, mean_auc:0.90577
+```
+
diff --git a/PaddleRec/wide_deep/args.py b/PaddleRec/wide_deep/args.py
new file mode 100644
index 00000000..128e9f01
--- /dev/null
+++ b/PaddleRec/wide_deep/args.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import distutils.util
+import sys
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--epochs", type=int, default=40, help="epochs")
+    parser.add_argument("--batch_size", type=int, default=40, help="batch_size")
+    parser.add_argument('--use_gpu', type=int, default=0, help='whether to use gpu')
+    parser.add_argument('--test_epoch', type=str, default='1', help='epoch of the checkpoint used for inference')
+    parser.add_argument('--train_path', type=str, default='data/adult.data', help='train_path')
+    parser.add_argument('--test_path', type=str, default='data/adult.test', help='test_path')
+    parser.add_argument('--train_data_path', type=str, default='train_data/train_data.csv', help='train_data_path')
+    parser.add_argument('--test_data_path', type=str, default='test_data/test_data.csv', help='test_data_path')
+    parser.add_argument('--model_dir', type=str, default='model_dir', help='model_dir')
+    parser.add_argument('--hidden1_units', type=int, default=75, help='hidden1_units')
+    parser.add_argument('--hidden2_units', type=int, default=50, help='hidden2_units')
+    parser.add_argument('--hidden3_units', type=int, default=25, help='hidden3_units')
+
+    args = parser.parse_args()
+
+    return args
diff --git a/PaddleRec/wide_deep/create_data.sh b/PaddleRec/wide_deep/create_data.sh
new file mode 100644
index 00000000..28f894bb
--- /dev/null
+++ b/PaddleRec/wide_deep/create_data.sh
@@ -0,0 +1,17 @@
+mkdir train_data
+mkdir test_data
+mkdir data
+train_path="data/adult.data"
+test_path="data/adult.test"
+train_data_path="train_data/train_data.csv"
+test_data_path="test_data/test_data.csv"
+
+pip install -r requirements.txt
+
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
+
+python data_preparation.py --train_path ${train_path} \
+                           --test_path ${test_path} \
+                           --train_data_path ${train_data_path} \
+                           --test_data_path ${test_data_path}
diff --git a/PaddleRec/wide_deep/data_preparation.py b/PaddleRec/wide_deep/data_preparation.py
new file mode 100644
index 00000000..928424b3
--- /dev/null
+++ b/PaddleRec/wide_deep/data_preparation.py
@@ -0,0 +1,109 @@
+import os
+import io
+import args
+import pandas as pd
+from sklearn import preprocessing
+
+
+def _clean_file(source_path, target_path):
+    """Make the raw file match the CSV format."""
+    with io.open(source_path, 'r') as temp_eval_file:
+        with io.open(target_path, 'w') as eval_file:
+            for line in temp_eval_file:
+                line = line.strip()
+                line = line.replace(', ', ',')
+                if not line or ',' not in line:
+                    continue
+                if line[-1] == '.':
+                    line = line[:-1]
+                line += '\n'
+                eval_file.write(line)
+
+
+def build_model_columns(train_data_path, test_data_path):
+    # The column names are from
+    # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
+    column_names = [
+        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
+        'marital_status', 'occupation', 'relationship', 'race', 'gender',
+        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
+        'income_bracket'
+    ]
+
+    # Load the dataset with pandas.
+    train_df = pd.read_csv(
+        train_data_path,
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names)
+    test_df = pd.read_csv(
+        test_data_path,
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names)
+
+    # Integer-encode the categorical columns.
+    # NOTE: the encoders are fit independently on the train and test splits,
+    # so the same category may map to different ids in the two files.
+    categorical_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation']
+    for col in categorical_columns:
+        label_train = preprocessing.LabelEncoder()
+        train_df[col] = label_train.fit_transform(train_df[col])
+        label_test = preprocessing.LabelEncoder()
+        test_df[col] = label_test.fit_transform(test_df[col])
+
+    # Bucketize age; ages outside [18, 65] fall into no bucket (NaN, filled with 0 later).
+    bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
+    train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins, labels=False)
+    test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins, labels=False)
+
+    base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
+
+    # Build the crossed columns for the wide part.
+    train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
+    test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
+    train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
+    test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
+    crossed_columns = ['education_occupation', 'age_buckets_education_occupation']
+
+    for col in crossed_columns:
+        label_train = preprocessing.LabelEncoder()
+        train_df[col] = label_train.fit_transform(train_df[col])
+        label_test = preprocessing.LabelEncoder()
+        test_df[col] = label_test.fit_transform(test_df[col])
+
+    wide_columns = base_columns + crossed_columns
+
+    # One-hot encode the categorical columns for the deep part.
+    train_df_temp = pd.get_dummies(train_df[categorical_columns], columns=categorical_columns)
+    test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
+    train_df = train_df.join(train_df_temp)
+    test_df = test_df.join(test_df_temp)
+
+    # Deep features: one-hot categoricals plus the continuous columns.
+    deep_columns = list(train_df_temp.columns) + ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
+
+    train_df['label'] = train_df['income_bracket'].apply(lambda x: 1 if x == '>50K' else 0)
+    test_df['label'] = test_df['income_bracket'].apply(lambda x: 1 if x == '>50K' else 0)
+
+    # Record the wide/deep feature counts for downstream consumers.
+    with io.open('train_data/columns.txt', 'w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+        f.write(write_str)
+    with io.open('test_data/columns.txt', 'w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+        f.write(write_str)
+
+    train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path, index=False)
+    test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path, index=False)
+
+
+def clean_file(train_path, test_path, train_data_path, test_data_path):
+    _clean_file(train_path, train_data_path)
+    _clean_file(test_path, test_data_path)
+
+
+if __name__ == '__main__':
+    args = args.parse_args()
+    clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
+    build_model_columns(args.train_data_path, args.test_data_path)
\ No newline at end of file
diff --git a/PaddleRec/wide_deep/infer.py b/PaddleRec/wide_deep/infer.py
new file mode 100644
index 00000000..4ad524a8
--- /dev/null
+++ b/PaddleRec/wide_deep/infer.py
@@ -0,0 +1,75 @@
+import numpy as np
+import os
+import paddle.fluid as fluid
+from net import wide_deep
+import logging
+import paddle
+import args
+import utils
+import time
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+
+
+def set_zero(var_name, scope=fluid.global_scope(), place=fluid.CPUPlace(), param_type="int64"):
+    """
+    Set the tensor of a Variable to zero.
+
+    Args:
+        var_name(str): name of the Variable
+        scope(Scope): Scope object, default is fluid.global_scope()
+        place(Place): Place object, default is fluid.CPUPlace()
+        param_type(str): data type of the parameter, default is int64
+    """
+    param = scope.var(var_name).get_tensor()
+    param_array = np.zeros(param._get_dims()).astype(param_type)
+    param.set(param_array, place)
+
+
+def run_infer(args, test_data_path):
+    wide_deep_model = wide_deep()
+
+    test_data_generator = utils.CriteoDataset()
+    test_reader = paddle.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
+
+    inference_scope = fluid.Scope()
+    startup_program = fluid.framework.Program()
+    test_program = fluid.framework.Program()
+
+    cur_model_path = os.path.join(args.model_dir, 'epoch_' + str(args.test_epoch), "checkpoint")
+
+    with fluid.scope_guard(inference_scope):
+        with fluid.framework.program_guard(test_program, startup_program):
+            inputs = wide_deep_model.input_data()
+            place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+            loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
+            exe = fluid.Executor(place)
+            exe.run(startup_program)
+
+            fluid.load(fluid.default_main_program(), cur_model_path, exe)
+            feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+
+            # Reset the AUC accumulator states before evaluation.
+            for var in auc_states:
+                set_zero(var.name, scope=inference_scope, place=place)
+
+            mean_acc = []
+            mean_auc = []
+            for batch_id, data in enumerate(test_reader()):
+                begin = time.time()
+                acc_val, auc_val = exe.run(program=test_program,
+                                           feed=feeder.feed(data),
+                                           fetch_list=[acc.name, auc.name],
+                                           return_numpy=True)
+                mean_acc.append(np.array(acc_val)[0])
+                mean_auc.append(np.array(auc_val)[0])
+                end = time.time()
+                logger.info("batch_id: {}, batch_time: {:.5f}s, acc: {:.5f}, auc: {:.5f}".format(
+                    batch_id, end - begin, np.array(acc_val)[0], np.array(auc_val)[0]))
+
+            logger.info("mean_acc:{:.5f}, mean_auc:{:.5f}".format(np.mean(mean_acc), np.mean(mean_auc)))
+
+
+if __name__ == "__main__":
+    args = args.parse_args()
+    run_infer(args, args.test_data_path)
\ No newline at end of file
diff --git a/PaddleRec/wide_deep/infer_cpu.sh b/PaddleRec/wide_deep/infer_cpu.sh
new file mode 100644
index 00000000..3f60125f
--- /dev/null
+++ b/PaddleRec/wide_deep/infer_cpu.sh
@@ -0,0 +1,9 @@
+python infer.py --batch_size 40 \
+                --use_gpu 0 \
+                --test_epoch 39 \
+                --test_data_path 'test_data/test_data.csv' \
+                --model_dir 'model_dir' \
+                --hidden1_units 75 \
+                --hidden2_units 50 \
+                --hidden3_units 25
\ No newline at end of file
diff --git a/PaddleRec/wide_deep/infer_gpu.sh b/PaddleRec/wide_deep/infer_gpu.sh
new file mode 100644
index 00000000..4b93e384
--- /dev/null
+++ b/PaddleRec/wide_deep/infer_gpu.sh
@@ -0,0 +1,9 @@
+CUDA_VISIBLE_DEVICES=0 python infer.py --batch_size 40 \
+                                       --use_gpu 1 \
+                                       --test_epoch 39 \
+                                       --test_data_path 'test_data/test_data.csv' \
+                                       --model_dir 'model_dir' \
+                                       --hidden1_units 75 \
+                                       --hidden2_units 50 \
+                                       --hidden3_units 25
\ No newline at end of file
diff --git a/PaddleRec/wide_deep/net.py b/PaddleRec/wide_deep/net.py
new file mode 100644
index 00000000..f598bdc9
--- /dev/null
+++ b/PaddleRec/wide_deep/net.py
@@ -0,0 +1,66 @@
+import paddle
+import io
+import math
+import paddle.fluid as fluid
+
+
+class wide_deep(object):
+    def wide_part(self, data):
+        # Wide part: a single linear layer over the wide features.
+        out = fluid.layers.fc(input=data,
+                              size=1,
+                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
+                                                         regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
+                              act=None,
+                              name='wide')
+        return out
+
+    def fc(self, data, hidden_units, active, tag):
+        output = fluid.layers.fc(input=data,
+                                 size=hidden_units,
+                                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
+                                 act=active,
+                                 name=tag)
+        return output
+
+    def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units):
+        # Deep part: a three-layer MLP with ReLU activations.
+        l1 = self.fc(data, hidden1_units, 'relu', 'l1')
+        l2 = self.fc(l1, hidden2_units, 'relu', 'l2')
+        l3 = self.fc(l2, hidden3_units, 'relu', 'l3')
+        return l3
+
+    def input_data(self):
+        wide_input = fluid.data(name='wide_input', shape=[None, 8], dtype='float32')
+        deep_input = fluid.data(name='deep_input', shape=[None, 58], dtype='float32')
+        label = fluid.data(name='label', shape=[None, 1], dtype='float32')
+        inputs = [wide_input, deep_input, label]
+        return inputs
+
+    def model(self, inputs, hidden1_units, hidden2_units, hidden3_units):
+        wide_output = self.wide_part(inputs[0])
+        deep_output = self.deep_part(inputs[1], hidden1_units, hidden2_units, hidden3_units)
+
+        # Project each part to a scalar logit before fusing them.
+        wide_model = fluid.layers.fc(input=wide_output,
+                                     size=1,
+                                     param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
+                                     act=None,
+                                     name='w_wide')
+
+        deep_model = fluid.layers.fc(input=deep_output,
+                                     size=1,
+                                     param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)),
+                                     act=None,
+                                     name='w_deep')
+
+        # Sum the two logits; clip before the sigmoid for numerical stability.
+        prediction = fluid.layers.elementwise_add(wide_model, deep_model)
+        pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction")
+
+        num_seqs = fluid.layers.create_tensor(dtype='int64')
+        acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=inputs[2], dtype='int64'), total=num_seqs)
+        auc_val, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=inputs[2], dtype='int64'))
+
+        # The loss is computed on the raw logits, not the clipped sigmoid output.
+        cost = fluid.layers.sigmoid_cross_entropy_with_logits(x=prediction, label=inputs[2])
+        avg_cost = fluid.layers.mean(cost)
+
+        return avg_cost, acc, auc_val, batch_auc, auc_states
diff --git a/PaddleRec/wide_deep/requirements.txt b/PaddleRec/wide_deep/requirements.txt
new file mode 100644
index 00000000..1f4c4696
--- /dev/null
+++ b/PaddleRec/wide_deep/requirements.txt
@@ -0,0 +1,132 @@
+absl-py==0.8.1
+aspy.yaml==1.3.0
+attrs==19.2.0
+audioread==2.1.8
+backcall==0.1.0
+bleach==3.1.0
+cachetools==4.0.0
+certifi==2019.9.11
+cffi==1.14.0
+cfgv==2.0.1
+chardet==3.0.4
+Click==7.0
+cloudpickle==1.2.1
+cma==2.7.0
+colorlog==4.1.0
+cycler==0.10.0
+Cython==0.29
+decorator==4.4.0
+entrypoints==0.3
+flake8==3.7.9
+Flask==1.1.1
+funcsigs==1.0.2
+future==0.18.0
+google-auth==1.10.0
+google-auth-oauthlib==0.4.1
+graphviz==0.13
+grpcio==1.26.0
+gunicorn==20.0.4
+gym==0.12.1
+h5py==2.9.0
+identify==1.4.10
+idna==2.8
+imageio==2.6.1
+imageio-ffmpeg==0.3.0
+importlib-metadata==0.23
+ipykernel==5.1.0
+ipython==7.0.1
+ipython-genutils==0.2.0
+itsdangerous==1.1.0
+jedi==0.15.1
+jieba==0.42.1
+Jinja2==2.10.1
+joblib==0.14.1
+jsonschema==3.1.1
+jupyter-client==5.3.3
+jupyter-core==4.5.0
+kiwisolver==1.1.0
+librosa==0.7.2
+llvmlite==0.31.0
+Markdown==3.1.1
+MarkupSafe==1.1.1
+matplotlib==2.2.3
+mccabe==0.6.1
+mistune==0.8.4
+more-itertools==7.2.0
+moviepy==1.0.1
+nbconvert==5.3.1
+nbformat==4.4.0
+networkx==2.4
+nltk==3.4.5
+nodeenv==1.3.4
+notebook==5.7.0
+numba==0.48.0
+numpy==1.16.4
+oauthlib==3.1.0
+objgraph==3.4.1
+opencv-python==4.1.1.26
+paddlehub==1.5.0
+paddlepaddle-gpu==1.7.1.post97
+pandas==0.23.4
+pandocfilters==1.4.2
+parl==1.1.2
+parso==0.5.1
+pexpect==4.7.0
+pickleshare==0.7.5
+Pillow==6.2.0
+pre-commit==1.21.0
+prettytable==0.7.2
+proglog==0.1.9
+prometheus-client==0.5.0
+prompt-toolkit==2.0.10
+protobuf==3.10.0
+ptyprocess==0.6.0
+pyarrow==0.13.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pycodestyle==2.5.0
+pycparser==2.19
+pyflakes==2.1.1
+pyglet==1.4.5
+Pygments==2.4.2
+pyparsing==2.4.2
+pyrsistent==0.15.4
+python-dateutil==2.8.0
+pytz==2019.3
+PyYAML==5.1.2
+pyzmq==18.0.1
+rarfile==3.1
+recordio==0.1.7
+requests==2.22.0
+requests-oauthlib==1.3.0
+resampy==0.2.2
+rsa==4.0
+scikit-learn==0.20.0
+scipy==1.3.0
+seaborn==0.10.0
+Send2Trash==1.5.0
+sentencepiece==0.1.85
+simplegeneric==0.8.1
+six==1.12.0
+sklearn==0.0
+SoundFile==0.10.3.post1
+tb-nightly==1.15.0a20190801
+tb-paddle==0.3.6
+tensorboard==2.1.0
+tensorboardX==1.8
+termcolor==1.1.0
+terminado==0.8.2
+testpath==0.4.2
+toml==0.10.0
+tornado==5.1.1
+tqdm==4.36.1
+traitlets==4.3.3
+urllib3==1.25.6
+virtualenv==16.7.9
+visualdl==1.3.0
+wcwidth==0.1.7
+webencodings==0.5.1
+Werkzeug==0.16.0
+xgboost==1.0.1
+yapf==0.26.0
+zipp==0.6.0
diff --git a/PaddleRec/wide_deep/train.py b/PaddleRec/wide_deep/train.py
new file mode 100644
index 00000000..1d07c7cc
--- /dev/null
+++ b/PaddleRec/wide_deep/train.py
@@ -0,0 +1,47 @@
+import numpy as np
+import os
+import paddle.fluid as fluid
+from net import wide_deep
+import logging
+import paddle
+import args
+import utils
+import time
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+
+
+def train(args, train_data_path):
+    wide_deep_model = wide_deep()
+    inputs = wide_deep_model.input_data()
+    train_data_generator = utils.CriteoDataset()
+    train_reader = paddle.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
+
+    loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
+    optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.01)
+    optimizer.minimize(loss)
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+
+    for epoch in range(args.epochs):
+        for batch_id, data in enumerate(train_reader()):
+            begin = time.time()
+            loss_val, acc_val, auc_val = exe.run(program=fluid.default_main_program(),
+                                                 feed=feeder.feed(data),
+                                                 fetch_list=[loss.name, acc.name, auc.name],
+                                                 return_numpy=True)
+            end = time.time()
+            logger.info("epoch:{}, batch_time:{:.5f}s, loss:{:.5f}, acc:{:.5f}, auc:{:.5f}".format(epoch, end - begin, np.array(loss_val)[0],
+                        np.array(acc_val)[0], np.array(auc_val)[0]))
+
+        # Save a checkpoint after every epoch; epoch_N holds the weights after N epochs.
+        model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint")
+        main_program = fluid.default_main_program()
+        fluid.io.save(main_program, model_dir)
+
+
+if __name__ == "__main__":
+    args = args.parse_args()
+    train(args, args.train_data_path)
diff --git a/PaddleRec/wide_deep/train_cpu.sh b/PaddleRec/wide_deep/train_cpu.sh
new file mode 100644
index 00000000..eccadedb
--- /dev/null
+++ b/PaddleRec/wide_deep/train_cpu.sh
@@ -0,0 +1,9 @@
+python train.py --epochs 40 \
+                --batch_size 40 \
+                --use_gpu 0 \
+                --train_data_path 'train_data/train_data.csv' \
+                --model_dir 'model_dir' \
+                --hidden1_units 75 \
+                --hidden2_units 50 \
+                --hidden3_units 25
\ No newline at end of file
diff --git a/PaddleRec/wide_deep/train_gpu.sh b/PaddleRec/wide_deep/train_gpu.sh
new file mode 100644
index 00000000..c55a8305
--- /dev/null
+++ b/PaddleRec/wide_deep/train_gpu.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0 python train.py --epochs 40 \
+                                       --batch_size 40 \
+                                       --use_gpu 1 \
+                                       --train_data_path 'train_data/train_data.csv' \
+                                       --model_dir 'model_dir' \
+                                       --hidden1_units 75 \
+                                       --hidden2_units 50 \
+                                       --hidden3_units 25
diff --git a/PaddleRec/wide_deep/utils.py b/PaddleRec/wide_deep/utils.py
new file mode 100644
index 00000000..fc239624
--- /dev/null
+++ b/PaddleRec/wide_deep/utils.py
@@ -0,0 +1,31 @@
+import numpy as np
+import os
+import paddle.fluid as fluid
+
+
+class CriteoDataset(object):
+    """Reader for the preprocessed Census-Income CSV files."""
+
+    def _reader_creator(self, file):
+        def reader():
+            with open(file, 'r') as f:
+                for i, line in enumerate(f):
+                    if i == 0:
+                        # Skip the CSV header row.
+                        continue
+                    line = line.strip().split(',')
+                    features = list(map(float, line))
+                    wide_feat = features[0:8]       # 8 wide features
+                    deep_feat = features[8:8 + 58]  # 58 deep features
+                    label = features[-1]
+                    output = []
+                    output.append(wide_feat)
+                    output.append(deep_feat)
+                    output.append([label])
+
+                    yield output
+
+        return reader
+
+    def train(self, file):
+        return self._reader_creator(file)
+
+    def test(self, file):
+        return self._reader_creator(file)
\ No newline at end of file
-- 
GitLab