Commit b5e95a80 authored by frankwhzhang

Merge branch 'master' of https://github.com/PaddlePaddle/PaddleRec into log_print

@@ -69,7 +69,7 @@ class Metric(object):
         global_metrics = dict()
         for key in self._global_metric_state_vars:
             varname, dtype = self._global_metric_state_vars[key]
-            global_metrics[key] = self.get_global_metric_state(fleet, scope,
+            global_metrics[key] = self._get_global_metric_state(fleet, scope,
                                                                varname)
         return self._calculate(global_metrics)
...
@@ -99,7 +99,8 @@ class SingleNetwork(NetworkBase):
         context["dataset"] = {}
         for dataset in context["env"]["dataset"]:
             type = envs.get_global_env("dataset." + dataset["name"] + ".type")
-            if type != "DataLoader":
+            if type == "QueueDataset":
                 dataset_class = QueueDataset(context)
                 context["dataset"][dataset[
                     "name"]] = dataset_class.create_dataset(dataset["name"],
@@ -133,9 +134,7 @@ class PSNetwork(NetworkBase):
         if envs.get_global_env("dataset." + dataset_name +
                                ".type") == "DataLoader":
             model._init_dataloader(is_infer=False)
-            data_loader = DataLoader(context)
-            data_loader.get_dataloader(context, dataset_name,
-                                       model._data_loader)
         model.net(model._data_var, False)
         optimizer = model.optimizer()
         strategy = self._build_strategy(context)
@@ -160,7 +159,11 @@ class PSNetwork(NetworkBase):
         for dataset in context["env"]["dataset"]:
             type = envs.get_global_env("dataset." + dataset["name"] +
                                        ".type")
-            if type != "DataLoader":
+            if type == "DataLoader":
+                data_loader = DataLoader(context)
+                data_loader.get_dataloader(context, dataset_name,
+                                           model._data_loader)
+            elif type == "QueueDataset":
                 dataset_class = QueueDataset(context)
                 context["dataset"][dataset[
                     "name"]] = dataset_class.create_dataset(
@@ -229,9 +232,6 @@ class PslibNetwork(NetworkBase):
         if envs.get_global_env("dataset." + dataset_name +
                                ".type") == "DataLoader":
             model._init_dataloader(is_infer=False)
-            data_loader = DataLoader(context)
-            data_loader.get_dataloader(context, dataset_name,
-                                       model._data_loader)
         model.net(model._data_var, False)
         optimizer = model.optimizer()
@@ -257,7 +257,11 @@ class PslibNetwork(NetworkBase):
         for dataset in context["env"]["dataset"]:
             type = envs.get_global_env("dataset." + dataset["name"] +
                                        ".type")
-            if type != "DataLoader":
+            if type == "DataLoader":
+                data_loader = DataLoader(context)
+                data_loader.get_dataloader(context, dataset_name, context[
+                    "model"][model_dict["name"]]["model"]._data_loader)
+            elif type == "QueueDataset":
                 dataset_class = QueueDataset(context)
                 context["dataset"][dataset[
                     "name"]] = dataset_class.create_dataset(
@@ -323,7 +327,10 @@ class CollectiveNetwork(NetworkBase):
         context["dataset"] = {}
         for dataset in context["env"]["dataset"]:
             type = envs.get_global_env("dataset." + dataset["name"] + ".type")
-            if type != "DataLoader":
+            if type == "QueueDataset":
+                raise ValueError(
+                    "Collective don't support QueueDataset training, please use DataLoader."
+                )
                 dataset_class = QueueDataset(context)
                 context["dataset"][dataset[
                     "name"]] = dataset_class.create_dataset(dataset["name"],
...
@@ -537,7 +537,6 @@ class SingleInferRunner(RunnerBase):
     def run(self, context):
         self._dir_check(context)
-        self.epoch_model_name_list.sort()
         for index, epoch_name in enumerate(self.epoch_model_name_list):
             for model_dict in context["phases"]:
                 model_class = context["model"][model_dict["name"]]["model"]
...
# ESMM
Below is a brief directory structure and description of this example:
```
├── data # data
    ├── train # training data
        ├── small.txt
    ├── test # test data
        ├── small.txt
    ├── run.sh
├── __init__.py
├── config.yaml # configuration file
├── esmm_reader.py # data reader
├── model.py # model definition
```
Note: before reading this example, we suggest you first go through the following:
[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents
- [Model introduction](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#模型简介)
- [Data preparation](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#数据准备)
- [Runtime environment](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#运行环境)
- [Quick start](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#快速开始)
- [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#论文复现)
- [Advanced usage](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#FAQ)
## Model introduction
Unlike CTR estimation, CVR estimation faces two key problems:
1. **Sample Selection Bias (SSB)**: a conversion can only "possibly" happen after a click. Conventional CVR models are trained on click data, taking clicks without conversion as negatives and clicks with conversion as positives. At serving time, however, the model scores samples from the entire exposure space, not only clicked samples. Training data and inference data therefore come from different distributions, which severely challenges the model's ability to generalize.
2. **Data Sparsity (DS)**: the click samples available for CVR training are far fewer than the exposure samples used for CTR training.
ESMM was published at SIGIR'2018 in [Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate](https://arxiv.org/abs/1804.07931). Built on a multi-task learning formulation, it proposes a new CVR model, ESMM, which effectively addresses both the data sparsity and the sample selection bias encountered in real-world CVR estimation.
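The key idea, restated here from the paper for reference, is to model pCVR over the entire exposure space through two auxiliary tasks (CTR and CTCVR) using the decomposition below, where y denotes click and z denotes conversion:
```
pCTCVR = p(y=1, z=1 \mid x) = p(y=1 \mid x) \times p(z=1 \mid y=1, x) = pCTR \times pCVR
```
Both pCTR and pCTCVR are trained on the full exposure sample, so pCVR is obtained without fitting a model on the biased click-only sample.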
This project implements the ESMM network in PaddlePaddle and validates it on the open-source dataset [Ali-CCP: Alibaba Click and Conversion Prediction](https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408). The default configuration uses the demo dataset; to verify accuracy, please refer to [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/esmm#论文复现).
Supported features
Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
## Data preparation
Dataset: [Ali-CCP: Alibaba Click and Conversion Prediction](https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408)
```
cd data
sh run.sh
```
See the demo data under data/train for the data format.
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
### Single-machine training
CPU environment
Set the device, number of epochs, etc. in config.yaml.
```
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/esmm_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/test"
data_converter: "{workspace}/esmm_reader.py"
```
### Single-machine inference
CPU environment
Set epochs, device, etc. in config.yaml.
```
- name: infer_runner
class: infer
init_model_path: "increment/1"
device: cpu
print_interval: 1
phases: [infer]
```
## Reproducing the paper
To reproduce the paper's results on the full dataset, set batch_size=1000, thread_num=8 and epoch_num=4 in config.yaml.
After the changes, set 'workspace' in config.yaml to the directory containing config.yaml and run:
```
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: pass the absolute path of the local config directly
```
## Advanced usage
## FAQ
mkdir train_data
mkdir test_data
mkdir vocab
mkdir data
train_source_path="./data/sample_train.tar.gz"
train_target_path="train_data"
test_source_path="./data/sample_test.tar.gz"
test_target_path="test_data"
cd data
echo "downloading sample_train.tar.gz......"
curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_train.tar.gz?Expires=1586435769&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=ahUDqhvKT1cGjC4%2FIER2EWtq7o4%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_train.tar.gz
cd ..
echo "unzipping sample_train.tar.gz......"
tar -xzvf ${train_source_path} -C ${train_target_path} && rm -rf ${train_source_path}
cd data
echo "downloading sample_test.tar.gz......"
curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_test.tar.gz?Expires=1586435821&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=OwLMPjt1agByQtRVi8pazsAliNk%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_test.tar.gz
cd ..
echo "unzipping sample_test.tar.gz......"
tar -xzvf ${test_source_path} -C ${test_target_path} && rm -rf ${test_source_path}
echo "preprocessing data......"
python reader.py --train_data_path ${train_target_path} \
--test_data_path ${test_target_path} \
--vocab_path vocab/vocab_size.txt \
--train_sample_size 6400 \
--test_sample_size 6400
# MMOE
Below is a brief directory structure and description of this example:
```
├── data # data
    ├── train # training data
        ├── train_data.txt
    ├── test # test data
        ├── test_data.txt
├── run.sh
├── data_preparation.py
├── __init__.py
├── config.yaml # configuration file
├── census_reader.py # data reader
├── model.py # model definition
```
Note: before reading this example, we suggest you first go through the following:
[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents
- [Model introduction](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#模型简介)
- [Data preparation](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#数据准备)
- [Runtime environment](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#运行环境)
- [Quick start](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#快速开始)
- [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现)
- [Advanced usage](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#FAQ)
## Model introduction
Multi-task models can improve the learning efficiency and quality of every task by exploiting the relationships and differences between tasks. Multi-task learning commonly uses a shared-bottom structure, where different tasks share the bottom hidden layers. This structure inherently reduces the risk of overfitting, but its effectiveness can suffer when the tasks differ substantially or their data distributions diverge. The paper [Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://www.kdd.org/kdd2018/accepted-papers/view/modeling-task-relationships-in-multi-task-learning-with-multi-gate-mixture-) proposes the Multi-gate Mixture-of-Experts (MMoE) multi-task structure. MMoE models task relationships explicitly and learns task-specific functions on top of shared representations, without a significant increase in parameters.
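As a minimal illustration of the gating idea described above (a NumPy sketch only; it is not the model.py implementation in this repo, and all shapes and names are made up for the example):
```
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, dim, hidden, n_experts, n_tasks = 4, 16, 8, 3, 2
rng = np.random.default_rng(0)
x = rng.normal(size=(batch, dim))

# Every expert is a small shared sub-network (here a single linear layer).
expert_w = rng.normal(size=(n_experts, dim, hidden))
expert_out = np.stack([x @ expert_w[i] for i in range(n_experts)], axis=1)  # [batch, n_experts, hidden]

# Each task has its own gate: a softmax over experts, conditioned on the input.
gate_w = rng.normal(size=(n_tasks, dim, n_experts))
for k in range(n_tasks):
    gates = softmax(x @ gate_w[k])                         # [batch, n_experts]
    tower_input = (gates[:, :, None] * expert_out).sum(1)  # weighted sum of expert outputs
    # tower_input then feeds task k's own tower network (omitted here).
    print("task", k, "tower input shape:", tower_input.shape)
```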
We implement the MMoE network in PaddlePaddle and validate it on the open-source Census-income dataset. The AUC of the two tasks:
1. income
> max_mmoe_test_auc_income: 0.94937
>
> mean_mmoe_test_auc_income: 0.94465
2. marital
> max_mmoe_test_auc_marital: 0.99419
>
> mean_mmoe_test_auc_marital: 0.99324
To verify accuracy, please refer to [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现).
Supported features
Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
## Data preparation
Dataset: [Census-income Data](https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz)
After extracting the data, fill in the file paths in run.sh and run the script:
```sh
mkdir train_data
mkdir test_data
mkdir data
train_path="data/census-income.data"
test_path="data/census-income.test"
train_data_path="train_data/"
test_data_path="test_data/"
pip install -r requirements.txt
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz
tar -zxvf data/census.tar.gz -C data/
python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \
--train_data_path ${train_data_path}\
--test_data_path ${test_data_path}
```
The generated data is comma-separated:
```
0,0,73,0,0,0,0,1700.09,0,0
```
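A minimal sketch of how such a line can be split (assuming, per the column reordering done in data_preparation.py included later on this page, that the first two fields are the marital_stat and income_50k labels and the rest are features):
```
line = "0,0,73,0,0,0,0,1700.09,0,0"
fields = line.strip().split(',')
marital_label, income_label = int(fields[0]), int(fields[1])  # the two task labels
features = [float(v) for v in fields[2:]]                     # remaining columns are features
```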
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
### Single-machine training
CPU environment
Set the device, number of epochs, etc. in config.yaml.
```
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
```
### Single-machine inference
CPU environment
Set epochs, device, etc. in config.yaml.
```
- name: infer_runner
class: infer
init_model_path: "increment/0"
device: cpu
```
## Reproducing the paper
To reproduce the paper's results on the full dataset, set batch_size=1000, thread_num=8 and epoch_num=4 in config.yaml.
Training on a single P100 GPU takes about 6.5 hours; test AUC: best 0.9940, mean 0.9932.
After the changes, set 'workspace' in config.yaml to the directory containing config.yaml and run:
```
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: pass the absolute path of the local config directly
```
## Advanced usage
## FAQ
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import numpy as np
import paddle.fluid as fluid
from args import *
def fun1(x):
if x == ' 50000+.':
return 1
else:
return 0
def fun2(x):
if x == ' Never married':
return 1
else:
return 0
def data_preparation(train_path, test_path, train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education',
'wage_per_hour', 'hs_college', 'marital_stat', 'major_ind_code',
'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses',
'stock_dividends', 'tax_filer_stat', 'region_prev_res',
'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ', 'instance_weight',
'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same',
'mig_prev_sunbelt', 'num_emp', 'fam_under_18', 'country_father',
'country_mother', 'country_self', 'citizenship', 'own_or_self',
'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k'
]
# Load the dataset in Pandas
train_df = pd.read_csv(
train_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
other_df = pd.read_csv(
test_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
# First group of tasks according to the paper
label_columns = ['income_50k', 'marital_stat']
# One-hot encoding categorical columns
categorical_columns = [
'class_worker', 'det_ind_code', 'det_occ_code', 'education',
'hs_college', 'major_ind_code', 'major_occ_code', 'race',
'hisp_origin', 'sex', 'union_member', 'unemp_reason',
'full_or_part_emp', 'tax_filer_stat', 'region_prev_res',
'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ', 'mig_chg_msa',
'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
'fam_under_18', 'country_father', 'country_mother', 'country_self',
'citizenship', 'vet_question'
]
train_raw_labels = train_df[label_columns]
other_raw_labels = other_df[label_columns]
transformed_train = pd.get_dummies(train_df, columns=categorical_columns)
transformed_other = pd.get_dummies(other_df, columns=categorical_columns)
# Filling the missing column in the other set
transformed_other[
'det_hh_fam_stat_ Grandchild <18 ever marr not in subfamily'] = 0
# get label
transformed_train['income_50k'] = transformed_train['income_50k'].apply(
lambda x: fun1(x))
transformed_train['marital_stat'] = transformed_train[
'marital_stat'].apply(lambda x: fun2(x))
transformed_other['income_50k'] = transformed_other['income_50k'].apply(
lambda x: fun1(x))
transformed_other['marital_stat'] = transformed_other[
'marital_stat'].apply(lambda x: fun2(x))
# Split the other dataset into 1:1 validation to test according to the paper
validation_indices = transformed_other.sample(
frac=0.5, replace=False, random_state=1).index
test_indices = list(set(transformed_other.index) - set(validation_indices))
validation_data = transformed_other.iloc[validation_indices]
test_data = transformed_other.iloc[test_indices]
cols = transformed_train.columns.tolist()
cols.insert(0, cols.pop(cols.index('income_50k')))
cols.insert(0, cols.pop(cols.index('marital_stat')))
transformed_train = transformed_train[cols]
test_data = test_data[cols]
validation_data = validation_data[cols]
print(transformed_train.shape, transformed_other.shape,
validation_data.shape, test_data.shape)
transformed_train.to_csv(train_data_path + 'train_data.csv', index=False)
test_data.to_csv(test_data_path + 'test_data.csv', index=False)
args = data_preparation_args()
data_preparation(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
# Share_bottom
Below is a brief directory structure and description of this example:
```
├── data # data
    ├── train # training data
        ├── train_data.txt
    ├── test # test data
        ├── test_data.txt
├── run.sh
├── data_preparation.py
├── __init__.py
├── config.yaml # configuration file
├── census_reader.py # data reader
├── model.py # model definition
```
Note: before reading this example, we suggest you first go through the following:
[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents
- [Model introduction](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#模型简介)
- [Data preparation](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#数据准备)
- [Runtime environment](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#运行环境)
- [Quick start](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#快速开始)
- [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#论文复现)
- [Advanced usage](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#FAQ)
## Model introduction
share_bottom is the basic framework for multi-task learning: the bottom-level parameters and network structure are shared across all tasks. Its advantage is that multiple tasks can be learned well with a greatly reduced number of parameters; its drawback is equally clear: because the bottom layers are fully shared, two weakly related tasks can conflict during optimization and hurt the final results. Many later neural multi-task models build on share_bottom; MMOE, for example, mitigates the degradation share_bottom suffers when task correlation is low.
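In short, with a shared bottom network f and a per-task tower h^k, the prediction for task k is (a sketch of the general formulation, not this repo's exact layer names):
```
y_k = h^k(f(x)), \qquad k = 1, \dots, K
```
MMoE replaces the single shared f(x) with a per-task, gated mixture of expert outputs, which is what relaxes the hard sharing described above.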
We implement the share_bottom network in PaddlePaddle and validate it on the open-source [Census-income Data](https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)) dataset. The AUC of the two tasks:
1. income
> max_sb_test_auc_income: 0.94993
>
> mean_sb_test_auc_income: 0.93120
2. marital
> max_sb_test_auc_marital: 0.99384
>
> mean_sb_test_auc_marital: 0.99256
The default configuration uses the demo dataset; to verify accuracy, please refer to [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/share-bottom#论文复现).
Supported features
Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
## Data preparation
Dataset: [Census-income Data](https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD))
After extracting the data, fill in the file paths in create_data.sh and run the script:
```sh
mkdir train_data
mkdir test_data
mkdir data
train_path="data/census-income.data"
test_path="data/census-income.test"
train_data_path="train_data/"
test_data_path="test_data/"
pip install -r requirements.txt
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz
tar -zxvf data/census.tar.gz -C data/
python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \
--train_data_path ${train_data_path}\
--test_data_path ${test_data_path}
```
The generated data is comma-separated:
```
0,0,73,0,0,0,0,1700.09,0,0
```
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
### Single-machine training
CPU environment
Set the device, number of epochs, etc. in config.yaml.
```sh
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
```
### Single-machine inference
CPU environment
Set epochs, device, etc. in config.yaml.
```sh
- name: infer_runner
class: infer
init_model_path: "increment/0"
device: cpu
```
## Reproducing the paper
To reproduce the paper's results on the full dataset, set batch_size=32, thread_num=8 and epoch_num=100 in config.yaml.
Training 100 epochs on a single P100 GPU takes about 4.5 hours; test AUC: best 0.9939, mean 0.9931.
After the changes, set 'workspace' in config.yaml to the directory containing config.yaml and run:
```text
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: pass the absolute path of the local config directly
```
## Advanced usage
## FAQ
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import numpy as np
import paddle.fluid as fluid
from args import *
def fun1(x):
if x == ' 50000+.':
return 1
else:
return 0
def fun2(x):
if x == ' Never married':
return 1
else:
return 0
def data_preparation(train_path, test_path, train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education',
'wage_per_hour', 'hs_college', 'marital_stat', 'major_ind_code',
'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses',
'stock_dividends', 'tax_filer_stat', 'region_prev_res',
'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ', 'instance_weight',
'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same',
'mig_prev_sunbelt', 'num_emp', 'fam_under_18', 'country_father',
'country_mother', 'country_self', 'citizenship', 'own_or_self',
'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k'
]
# Load the dataset in Pandas
train_df = pd.read_csv(
train_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
other_df = pd.read_csv(
test_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
# First group of tasks according to the paper
label_columns = ['income_50k', 'marital_stat']
# One-hot encoding categorical columns
categorical_columns = [
'class_worker', 'det_ind_code', 'det_occ_code', 'education',
'hs_college', 'major_ind_code', 'major_occ_code', 'race',
'hisp_origin', 'sex', 'union_member', 'unemp_reason',
'full_or_part_emp', 'tax_filer_stat', 'region_prev_res',
'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ', 'mig_chg_msa',
'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
'fam_under_18', 'country_father', 'country_mother', 'country_self',
'citizenship', 'vet_question'
]
train_raw_labels = train_df[label_columns]
other_raw_labels = other_df[label_columns]
transformed_train = pd.get_dummies(train_df, columns=categorical_columns)
transformed_other = pd.get_dummies(other_df, columns=categorical_columns)
# Filling the missing column in the other set
transformed_other[
'det_hh_fam_stat_ Grandchild <18 ever marr not in subfamily'] = 0
# get label
transformed_train['income_50k'] = transformed_train['income_50k'].apply(
lambda x: fun1(x))
transformed_train['marital_stat'] = transformed_train[
'marital_stat'].apply(lambda x: fun2(x))
transformed_other['income_50k'] = transformed_other['income_50k'].apply(
lambda x: fun1(x))
transformed_other['marital_stat'] = transformed_other[
'marital_stat'].apply(lambda x: fun2(x))
# Split the other dataset into 1:1 validation to test according to the paper
validation_indices = transformed_other.sample(
frac=0.5, replace=False, random_state=1).index
test_indices = list(set(transformed_other.index) - set(validation_indices))
validation_data = transformed_other.iloc[validation_indices]
test_data = transformed_other.iloc[test_indices]
cols = transformed_train.columns.tolist()
cols.insert(0, cols.pop(cols.index('income_50k')))
cols.insert(0, cols.pop(cols.index('marital_stat')))
transformed_train = transformed_train[cols]
test_data = test_data[cols]
validation_data = validation_data[cols]
print(transformed_train.shape, transformed_other.shape,
validation_data.shape, test_data.shape)
transformed_train.to_csv(train_data_path + 'train_data.csv', index=False)
test_data.to_csv(test_data_path + 'test_data.csv', index=False)
args = data_preparation_args()
data_preparation(args.train_path, args.test_path, args.train_data_path,
args.test_data_path)
mkdir train_data
mkdir test_data
mkdir data
train_path="data/census-income.data"
test_path="data/census-income.test"
train_data_path="train_data/"
test_data_path="test_data/"
pip install -r requirements.txt
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz
tar -zxvf data/census.tar.gz -C data/
python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \
--train_data_path ${train_data_path}\
--test_data_path ${test_data_path}
@@ -15,23 +15,53 @@
 ├── config.yaml # configuration file
 ```
-## Introduction
+Note: before reading this example, we suggest you first go through the following:
+[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
+---
+## Contents
+- [Model introduction](#模型简介)
+- [Data preparation](#数据准备)
+- [Runtime environment](#运行环境)
+- [Quick start](#快速开始)
+- [Reproducing the paper](#论文复现)
+- [Advanced usage](#进阶使用)
+- [FAQ](#FAQ)
+## Model introduction
 [FLEN: Leveraging Field for Scalable CTR Prediction](https://arxiv.org/pdf/1911.04690.pdf) proposes field-wise bi-interaction pooling, which resolves the high time and space complexity of exploiting feature field information at large scale, and introduces DiceFactor, a method that mitigates the gradient-coupling problem. The model has been deployed in Meitu's large-scale recommender system and consistently improves business metrics.
-This project validates the model on the Avazu dataset.
+This project validates the model on the Avazu dataset. The default configuration uses the demo dataset; to verify accuracy, please refer to [Reproducing the paper](#论文复现).
+Supported features
+Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
+Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
-## Data download and preprocessing
-## Environment
-PaddlePaddle 1.7.2
-python3.7
-PaddleRec
-## Single-machine training
+## Data preparation
+## Runtime environment
+PaddlePaddle>=1.7.2
+python 2.7/3.5/3.6/3.7
+PaddleRec >=0.1
+os : windows/linux/macos
+## Quick start
+### Single-machine training
 CPU environment
@@ -60,7 +90,7 @@ runner:
   phases: [phase1]
 ```
-## Single-machine inference
+### Single-machine inference
 CPU environment
@@ -77,54 +107,21 @@ CPU environment
   phases: [phase2]
 ```
-## Run
+### Run
 ```
 python -m paddlerec.run -m paddlerec.models.rank.flen
 ```
-## Model results
-Testing the model on the sample data
-Training:
-```
-I0702 13:38:20.903220  7368 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 2 cards are used, so 2 programs are executed in parallel.
-I0702 13:38:20.925912  7368 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
-I0702 13:38:20.933356  7368 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
-batch: 2, AUC: [0.09090909 0.        ], BATCH_AUC: [0.09090909 0.        ]
-batch: 4, AUC: [0.31578947 0.29411765], BATCH_AUC: [0.31578947 0.29411765]
-batch: 6, AUC: [0.41333333 0.33333333], BATCH_AUC: [0.41333333 0.33333333]
-batch: 8, AUC: [0.4453125  0.44166667], BATCH_AUC: [0.4453125  0.44166667]
-batch: 10, AUC: [0.39473684 0.38888889], BATCH_AUC: [0.44117647 0.41176471]
-batch: 12, AUC: [0.41860465 0.45535714], BATCH_AUC: [0.5078125  0.54545455]
-batch: 14, AUC: [0.43413729 0.42746615], BATCH_AUC: [0.56666667 0.56      ]
-batch: 16, AUC: [0.46433566 0.47460087], BATCH_AUC: [0.53       0.59247649]
-batch: 18, AUC: [0.44009217 0.44642857], BATCH_AUC: [0.46 0.47]
-batch: 20, AUC: [0.42705314 0.43781095], BATCH_AUC: [0.45878136 0.4874552 ]
-batch: 22, AUC: [0.45176471 0.46011281], BATCH_AUC: [0.48046875 0.45878136]
-batch: 24, AUC: [0.48375    0.48910256], BATCH_AUC: [0.56630824 0.59856631]
-epoch 0 done, use time: 0.21532440185546875
-PaddleRec Finish
-```
-Inference
-```
-PaddleRec: Runner single_cpu_infer Begin
-Executor Mode: infer
-processor_register begin
-Running SingleInstance.
-Running SingleNetwork.
-QueueDataset can not support PY3, change to DataLoader
-QueueDataset can not support PY3, change to DataLoader
-Running SingleInferStartup.
-Running SingleInferRunner.
-load persistables from increment_model/0
-batch: 20, AUC: [0.49121353], BATCH_AUC: [0.66176471]
-batch: 40, AUC: [0.51156463], BATCH_AUC: [0.55197133]
-Infer phase2 of 0 done, use time: 0.3941819667816162
-PaddleRec Finish
-```
+## Reproducing the paper
+To reproduce the paper's results on the full dataset, set batch_size=512, thread_num=8 and epoch_num=1 in config.yaml.
+Results on the full dataset will be added later.
+## Advanced usage
+## FAQ
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
data = pd.read_csv('./avazu_sample.txt')
data['day'] = data['hour'].apply(lambda x: str(x)[4:6])
data['hour'] = data['hour'].apply(lambda x: str(x)[6:])
sparse_features = [
'hour',
'C1',
'banner_pos',
'site_id',
'site_domain',
'site_category',
'app_id',
'app_domain',
'app_category',
'device_id',
'device_model',
'device_type',
'device_conn_type', # 'device_ip',
'C14',
'C15',
'C16',
'C17',
'C18',
'C19',
'C20',
'C21',
]
data[sparse_features] = data[sparse_features].fillna('-1', )
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
lbe = LabelEncoder()
data[feat] = lbe.fit_transform(data[feat])
cols = [
'click', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C1',
'device_model', 'device_type', 'device_id', 'app_id', 'app_domain',
'app_category', 'banner_pos', 'site_id', 'site_domain', 'site_category',
'device_conn_type', 'hour'
]
# Compute the maximum value of each feature and use it as that feature's vocabulary size
data = data[cols]
line = ''
vacob_file = open('vacob_file.txt', 'w')
for col in cols[1:]:
max_val = data[col].max()
line += str(max_val) + ','
vacob_file.write(line)
vacob_file.close()
data.to_csv('./train_data/train_data.txt', index=False, header=None)
# wide&deep
Below is a brief directory structure and description of this example:
```
├── data # data
    ├── train # training data
        ├── train_data.txt
    ├── create_data.sh
    ├── data_preparation.py
    ├── get_slot_data.py
    ├── run.sh
├── __init__.py
├── config.yaml # configuration file
├── model.py # model definition
```
Note: before reading this example, we suggest you first go through the following:
[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents
- [Model introduction](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#模型简介)
- [Data preparation](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#数据准备)
- [Runtime environment](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#运行环境)
- [Quick start](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#快速开始)
- [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#论文复现)
- [Advanced usage](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#FAQ)
## Model introduction
[Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792.pdf) is a recommendation framework released by Google in 2016. Wide & Deep jointly trains a shallow (wide) model and a deep model, combining the memorization ability of the shallow part with the generalization ability of the deep part, so that a single model serves both the accuracy and the scalability of a recommender system. It was evaluated on recommendation quality and serving performance:
1. Quality: in an online A/B experiment on Google Play, Wide & Deep improved the app acquisition rate by +3.9% over a highly optimized wide-only model, and also outperformed a deep-only model.
2. Performance: by splitting the batch of apps scored per request into smaller batches and issuing them in parallel with multiple threads, single-request latency dropped from 31 ms to 14 ms.
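For reference, the joint prediction described in the paper combines the two parts roughly as follows (the paper's formulation restated here, not code from this repo), where phi(x) denotes the cross-product transformations of the raw features and a^(l_f) is the final activation of the deep network:
```
P(Y=1 \mid x) = \sigma\left(w_{wide}^{T}\,[x, \phi(x)] + w_{deep}^{T}\, a^{(l_f)} + b\right)
```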
This example implements Wide & Deep in PaddlePaddle and validates it on the open-source Census-income dataset. The average acc and AUC on the test set are:
> mean_acc: 0.76195
>
> mean_auc: 0.90577
To verify accuracy, please refer to [Reproducing the paper](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/wide_deep#论文复现).
Supported features
Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
## Data preparation
Dataset:
[adult.data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
[adult.test](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test)
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
### Single-machine training
CPU environment
Set the device, number of epochs, etc. in config.yaml.
```sh
dataset:
- name: sample_1
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label"
dense_slots: "wide_input:8 deep_input:58"
- name: infer_sample
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label"
dense_slots: "wide_input:8 deep_input:58"
```
### Single-machine inference
CPU environment
Set epochs, device, etc. in config.yaml.
```
- name: infer_runner
class: infer
device: cpu
init_model_path: "increment/0"
```
## Reproducing the paper
To reproduce the paper's results on the full dataset, set batch_size=40, thread_num=8 and epoch_num=40 in config.yaml.
With this configuration, the average acc and AUC on the Census-income test set are:
mean_acc: 0.76195, mean_auc: 0.90577
After the changes, set 'workspace' in config.yaml to the directory containing config.yaml and run:
```
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: pass the absolute path of the local config directly
```
## Advanced usage
## FAQ
@@ -42,30 +42,32 @@ hyper_parameters:
     gnn_propogation_steps: 1

 # select runner by name
-mode: train_runner
+mode: [single_cpu_train, single_cpu_infer]
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
-- name: train_runner
+- name: single_cpu_train
   class: train
   # num of epochs
-  epochs: 2
+  epochs: 5
   # device to run training or infer
   device: cpu
   save_checkpoint_interval: 1 # save model interval of epochs
   save_inference_interval: 1 # save inference
-  save_checkpoint_path: "increment" # save checkpoint path
-  save_inference_path: "inference" # save inference path
+  save_checkpoint_path: "increment_gnn" # save checkpoint path
+  save_inference_path: "inference_gnn" # save inference path
   save_inference_feed_varnames: [] # feed vars of save inference
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   print_interval: 1
-- name: infer_runner
+  phases: [phase1]
+- name: single_cpu_infer
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment/0" # load model path
+  init_model_path: "increment_gnn" # load model path
+  phases: [phase2]

 # runner will run all the phase in each epoch
 phase:
@@ -73,7 +75,7 @@ phase:
   model: "{workspace}/model.py" # user-defined model
   dataset_name: dataset_train # select dataset by name
   thread_num: 1
-# - name: phase2
-#   model: "{workspace}/model.py" # user-defined model
-#   dataset_name: dataset_infer # select dataset by name
-#   thread_num: 1
+- name: phase2
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_infer # select dataset by name
+  thread_num: 1
@@ -57,5 +57,10 @@ def _download_file(url, savepath, print_progress):
     progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True)

-_download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv",
-               "./train-item-views.csv", True)
+if sys.argv[1] == "diginetica":
+    _download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv",
+                   "./train-item-views.csv", True)
+elif sys.argv[1] == "yoochoose":
+    _download_file(
+        "https://paddlerec.bj.bcebos.com/gnn%2Fyoochoose-clicks.dat",
+        "./yoochoose-clicks.dat", True)
@@ -41,39 +41,29 @@ with open(dataset, "r") as f:
     curdate = None
     for data in reader:
         sessid = data['session_id']
-        if curdate and not curid == sessid:
-            date = ''
-            if opt.dataset == 'yoochoose':
-                date = time.mktime(
-                    time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
-            else:
-                date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
-            sess_date[curid] = date
-        curid = sessid
+        date = ''
         if opt.dataset == 'yoochoose':
             item = data['item_id']
+            date = time.mktime(
+                time.strptime(data['timestamp'][:19], '%Y-%m-%dT%H:%M:%S'))
         else:
             item = data['item_id'], int(data['timeframe'])
-        curdate = ''
-        if opt.dataset == 'yoochoose':
-            curdate = data['timestamp']
-        else:
-            curdate = data['eventdate']
+            date = time.mktime(time.strptime(data['eventdate'], '%Y-%m-%d'))
+
+        if sessid not in sess_date:
+            sess_date[sessid] = date
+        elif date > sess_date[sessid]:
+            sess_date[sessid] = date
         if sessid in sess_clicks:
             sess_clicks[sessid] += [item]
         else:
             sess_clicks[sessid] = [item]
         ctr += 1
-    date = ''
-    if opt.dataset == 'yoochoose':
-        date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
-    else:
-        date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
+    if opt.dataset != 'yoochoose':
         for i in list(sess_clicks):
             sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1))
             sess_clicks[i] = [c[0] for c in sorted_clicks]
-    sess_date[curid] = date
 print("-- Reading data @ %ss" % datetime.datetime.now())

 # Filter out length 1 sessions
@@ -160,7 +150,7 @@ def obtian_tra():
         train_dates += [date]
         train_seqs += [outseq]
     print(item_ctr)  # 43098, 37484
-    with open("./diginetica/config.txt", "w") as fout:
+    with open("./config.txt", "w") as fout:
         fout.write(str(item_ctr) + "\n")
     return train_ids, train_dates, train_seqs
...
@@ -15,21 +15,31 @@
 # limitations under the License.

 set -e
-echo "begin to download data"
-cd data && python download.py
-mkdir diginetica
-python preprocess.py --dataset diginetica
+dataset=$1
+src=$1
+
+if [[ $src == "yoochoose1_4" || $src == "yoochoose1_64" ]];then
+    src="yoochoose"
+elif [[ $src == "diginetica" ]];then
+    src="diginetica"
+else
+    echo "Usage: sh data_prepare.sh [diginetica|yoochoose1_4|yoochoose1_64]"
+    exit 1
+fi
+
+echo "begin to download data"
+cd data && python download.py $src
+mkdir $dataset
+python preprocess.py --dataset $src

 echo "begin to convert data (binary -> txt)"
-python convert_data.py --data_dir diginetica
+python convert_data.py --data_dir $dataset

-cat diginetica/train.txt | wc -l >> diginetica/config.txt
+cat ${dataset}/train.txt | wc -l >> config.txt

 rm -rf train && mkdir train
-mv diginetica/train.txt train
+mv ${dataset}/train.txt train

 rm -rf test && mkdir test
-mv diginetica/test.txt test
-
-mv diginetica/config.txt ./config.txt
+mv ${dataset}/test.txt test
@@ -20,6 +20,7 @@ import paddle.fluid.layers as layers

 from paddlerec.core.utils import envs
 from paddlerec.core.model import ModelBase
+from paddlerec.core.metrics import RecallK


 class Model(ModelBase):
@@ -235,16 +236,16 @@ class Model(ModelBase):
         softmax = layers.softmax_with_cross_entropy(
             logits=logits, label=inputs[6])  # [batch_size, 1]
         self.loss = layers.reduce_mean(softmax)  # [1]
-        self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)
+        acc = RecallK(input=logits, label=inputs[6], k=20)

         self._cost = self.loss
         if is_infer:
-            self._infer_results['acc'] = self.acc
-            self._infer_results['loss'] = self.loss
+            self._infer_results['P@20'] = acc
+            self._infer_results['LOSS'] = self.loss
             return

         self._metrics["LOSS"] = self.loss
-        self._metrics["train_acc"] = self.acc
+        self._metrics["Train_P@20"] = acc

     def optimizer(self):
         step_per_epoch = self.corpus_size // self.train_batch_size
...
# GNN
Below is a brief directory structure and description of this example:
```
├── data # sample data
    ├── train
        ├── train.txt
    ├── test
        ├── test.txt
    ├── download.py
    ├── convert_data.py
    ├── preprocess.py
├── __init__.py
├── README.md # documentation
├── model.py # model definition
├── config.yaml # configuration file
├── data_prepare.sh # one-click data preparation script
├── reader.py # training data reader
├── evaluate_reader.py # inference data reader
```
Note: before reading this example, we suggest you first go through the following:
[PaddleRec tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
---
## Contents
- [Model introduction](#模型简介)
- [Data preparation](#数据准备)
- [Runtime environment](#运行环境)
- [Quick start](#快速开始)
- [Reproducing the paper](#论文复现)
- [Advanced usage](#进阶使用)
- [FAQ](#FAQ)
## Model introduction
The SR-GNN model is described in the paper [Session-based Recommendation with Graph Neural Networks](https://arxiv.org/abs/1811.00855).
It addresses session-based recommendation in roughly four steps:
1. Model every session sequence as a directed graph (see the small sketch below).
2. Use a GNN to learn a latent vector for each node (item).
3. Obtain each session's embedding through an attention architecture.
4. Predict over the full item vocabulary with a softmax layer.
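A minimal NumPy sketch of step 1, building the normalized outgoing/incoming adjacency matrices for a single session (illustrative only; the actual graph construction lives in this example's reader code):
```
import numpy as np

session = [10, 11, 12, 12, 13, 14]            # one session: a sequence of item ids
nodes = sorted(set(session))                   # unique items become the graph nodes
idx = {item: i for i, item in enumerate(nodes)}

n = len(nodes)
adj = np.zeros((n, n))
for a, b in zip(session, session[1:]):         # add an edge for every consecutive click
    adj[idx[a], idx[b]] += 1

# Normalize by out-degree / in-degree to obtain the two matrices the GNN consumes.
out_deg = adj.sum(axis=1, keepdims=True)
in_deg = adj.sum(axis=0, keepdims=True)
A_out = np.divide(adj, out_deg, out=np.zeros_like(adj), where=out_deg != 0)
A_in = np.divide(adj.T, in_deg.T, out=np.zeros_like(adj), where=in_deg.T != 0)
print(nodes)
print(A_out)
```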
In this example we reproduce the paper's result: P@20 reaches 50.7 on the DIGINETICA dataset.
You may also refer to the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124382).
The default configuration uses the demo dataset; to verify accuracy, please refer to [Reproducing the paper](#论文复现).
Supported features
Training: single-machine CPU, single GPU, multi-GPU, local simulated parameter server, incremental training; see [launch training](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md) for configuration.
Inference: single-machine CPU, single GPU; see [PaddleRec offline inference](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md) for configuration.
## Data processing
Data processing in this example consists of three steps:
- Step 1: download the raw dataset. Two open-source datasets are provided, DIGINETICA and Yoochoose; either one can be used to train the model. The download command and the raw data formats are shown below. With the diginetica option the command produces the raw file train-item-views.csv under the data directory; with the yoochoose option it produces yoochoose-clicks.dat under the data directory.
```
cd data && python download.py diginetica # or yoochoose
```
> The [Yoochoose](https://2015.recsyschallenge.com/challenge.html) dataset comes from RecSys Challenge 2015. The raw data contains the following fields:
1. Session ID – the id of the session. In one session there are one or many clicks.
2. Timestamp – the time when the click occurred.
3. Item ID – the unique identifier of the item.
4. Category – the category of the item.
> The [DIGINETICA](https://competitions.codalab.org/competitions/11161#learn_the_details-data2) dataset comes from the CIKM Cup 2016 _Personalized E-Commerce Search Challenge_. The raw data contains the following fields:
1. sessionId - the id of the session. In one session there are one or many clicks.
2. userId - the id of the user, with anonymized user ids.
3. itemId - the unique identifier of the item.
4. timeframe - time since the first query in a session, in milliseconds.
5. eventdate - calendar date.
- Step 2: preprocess the data.
  1. Merge the raw records by session_id to obtain each session's date and its ordered click list.
  2. Filter out sessions of length 1 and items clicked fewer than 5 times.
  3. Split training and test sets: data within the most recent seven days of the dataset is used as the training set, and earlier data as the test set.
```
cd data && python preprocess.py --dataset diginetica # or yoochoose
```
- Step 3: organize the data. Put the training files under data/train and the test files under data/test.
```
cat data/diginetica/train.txt | wc -l >> data/config.txt # or yoochoose1_4 or yoochoose1_64
rm -rf data/train/*
rm -rf data/test/*
mv data/diginetica/train.txt data/train
mv data/diginetica/test.txt data/test
```
After data processing, the training data is in data/train and the test data in data/test, in the following format:
```
#session\tlabel
10,11,12,12,13,14\t15
```
data/config.txt stores dataset statistics: the first line is the total number of items in the training set (used to set the model's vocabulary size), and the second line is the size of the training set.
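A minimal sketch of reading those two values back for the config.yaml changes mentioned in [Reproducing the paper](#论文复现) (a hypothetical helper, assuming the two-line layout described above):
```
with open("data/config.txt") as f:
    values = f.read().split()
sparse_feature_number = int(values[0])  # number of distinct items -> vocabulary size
corpus_size = int(values[1])            # number of training samples
print(sparse_feature_number, corpus_size)
```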
For convenience, we also provide a one-click data preparation script:
```
sh data_prepare.sh diginetica # or yoochoose1_4 or yoochoose1_64
```
## Runtime environment
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## Quick start
### Single-machine training
CPU environment
Set the device, number of epochs, etc. in config.yaml.
```
# select runner by name
mode: [single_cpu_train, single_cpu_infer]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: single_cpu_train
class: train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment_gnn" # save checkpoint path
save_inference_path: "inference_gnn" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 1
phases: [phase1]
```
### Single-machine inference
CPU environment
Set epochs, device, etc. in config.yaml.
```
- name: single_cpu_infer
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment_gnn" # load model path
phases: [phase2]
```
### Run
```
python -m paddlerec.run -m paddlerec.models.recall.gnn
```
### Results
Training output on the sample data:
```
Running SingleStartup.
Running SingleRunner.
batch: 1, LOSS: [10.67443], InsCnt: [200.], RecallCnt: [0.], Acc(Recall@20): [0.]
batch: 2, LOSS: [10.672471], InsCnt: [300.], RecallCnt: [0.], Acc(Recall@20): [0.]
batch: 3, LOSS: [10.672463], InsCnt: [400.], RecallCnt: [1.], Acc(Recall@20): [0.0025]
batch: 4, LOSS: [10.670724], InsCnt: [500.], RecallCnt: [2.], Acc(Recall@20): [0.004]
batch: 5, LOSS: [10.66949], InsCnt: [600.], RecallCnt: [2.], Acc(Recall@20): [0.00333333]
batch: 6, LOSS: [10.670102], InsCnt: [700.], RecallCnt: [2.], Acc(Recall@20): [0.00285714]
batch: 7, LOSS: [10.671348], InsCnt: [800.], RecallCnt: [2.], Acc(Recall@20): [0.0025]
...
epoch 0 done, use time: 2926.6897077560425, global metrics: LOSS=[6.0788856], InsCnt=719400.0 RecallCnt=224033.0 Acc(Recall@20)=0.3114164581595774
...
epoch 4 done, use time: 3083.101449728012, global metrics: LOSS=[4.249889], InsCnt=3597000.0 RecallCnt=2070666.0 Acc(Recall@20)=0.5756647206005004
```
Inference output on the sample data:
```
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment_gnn/2
batch: 1, InsCnt: [200.], RecallCnt: [96.], Acc(Recall@20): [0.48], LOSS: [5.7198644]
batch: 2, InsCnt: [300.], RecallCnt: [153.], Acc(Recall@20): [0.51], LOSS: [5.4096317]
batch: 3, InsCnt: [400.], RecallCnt: [210.], Acc(Recall@20): [0.525], LOSS: [5.300991]
batch: 4, InsCnt: [500.], RecallCnt: [258.], Acc(Recall@20): [0.516], LOSS: [5.6269655]
batch: 5, InsCnt: [600.], RecallCnt: [311.], Acc(Recall@20): [0.5183333], LOSS: [5.39276]
batch: 6, InsCnt: [700.], RecallCnt: [352.], Acc(Recall@20): [0.50285715], LOSS: [5.633842]
batch: 7, InsCnt: [800.], RecallCnt: [406.], Acc(Recall@20): [0.5075], LOSS: [5.342844]
batch: 8, InsCnt: [900.], RecallCnt: [465.], Acc(Recall@20): [0.51666665], LOSS: [4.918761]
...
Infer phase2 of epoch 0 done, use time: 549.1640813350677, global metrics: InsCnt=60800.0 RecallCnt=31083.0 Acc(Recall@20)=0.511233552631579, LOSS=[5.8957024]
```
## Reproducing the paper
To reproduce the paper's results on the full dataset, modify the following hyperparameters in config.yaml:
- batch_size: set the batch_size of the dataset_train dataset to 100.
- epochs: set the runner's epochs to 5.
- sparse_feature_number: depends on the training dataset; use 43098 for diginetica and 37484 for yoochoose (the first line of data/config.txt produced by data processing).
- corpus_size: depends on the training dataset; use 719470 for diginetica and 5917745 for yoochoose (the second line of data/config.txt produced by data processing).
Training 5 epochs on CPU gives a test Recall@20 of 0.51367.
After the changes, set 'workspace' in config.yaml to the directory containing config.yaml and run:
```
python -m paddlerec.run -m /home/your/dir/config.yaml # debug mode: pass the absolute path of the local config directly
```
## Advanced usage
## FAQ