From a2ce4de7a93c4ed5fbb2e4cd76bb4e1f498e4901 Mon Sep 17 00:00:00 2001 From: yinhaofeng <1841837261@qq.com> Date: Tue, 22 Sep 2020 14:58:55 +0000 Subject: [PATCH] deepfm add readme --- .../rank/deepfm/data/download_preprocess.py | 2 +- models/rank/deepfm/data/get_slot_data.py | 3 +- models/rank/deepfm/data/run.sh | 2 +- models/rank/deepfm/readme.md | 317 ++++++++++++++++++ 4 files changed, 320 insertions(+), 4 deletions(-) create mode 100644 models/rank/deepfm/readme.md diff --git a/models/rank/deepfm/data/download_preprocess.py b/models/rank/deepfm/data/download_preprocess.py index e163cba2..9b46ff8a 100755 --- a/models/rank/deepfm/data/download_preprocess.py +++ b/models/rank/deepfm/data/download_preprocess.py @@ -28,7 +28,7 @@ if __name__ == '__main__': print("download and extract starting...") download_file_and_uncompress(url) - download_file(url2, "./sample_data/feat_dict_10.pkl2", True) + download_file(url2, "./deepfm%2Ffeat_dict_10.pkl2", True) print("download and extract finished") print("preprocessing...") diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index 5ee0fceb..1d4c5c47 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -79,8 +79,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) - print(s.strip()) - yield None + print(s.strip()) # add print for data preprocessing return data_iter diff --git a/models/rank/deepfm/data/run.sh b/models/rank/deepfm/data/run.sh index c2bc4ae8..a3255cb5 100644 --- a/models/rank/deepfm/data/run.sh +++ b/models/rank/deepfm/data/run.sh @@ -1,5 +1,5 @@ python download_preprocess.py - +mv ./deepfm%2Ffeat_dict_10.pkl2 sample_data/feat_dict_10.pkl2 mkdir slot_train_data for i in `ls ./train_data` do diff --git a/models/rank/deepfm/readme.md b/models/rank/deepfm/readme.md new file mode 100644 index 00000000..f832811c --- /dev/null +++ b/models/rank/deepfm/readme.md @@ -0,0 +1,317 @@ +# 
基于DeepFM模型的点击率预估模型 + +## 介绍 +`CTR(Click Through Rate)`,即点击率,是“推荐系统/计算广告”等领域的重要指标,对其进行预估是商品推送/广告投放等决策的基础。简单来说,CTR预估对每次广告的点击情况做出预测,预测用户是点击还是不点击。CTR预估模型综合考虑各种因素、特征,在大量历史数据上训练,最终对商业决策提供帮助。本模型实现了下述论文中的DeepFM模型: + +```text +@inproceedings{guo2017deepfm, + title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, + author={Huifeng Guo and Ruiming Tang and Yunming Ye and Zhenguo Li and Xiuqiang He}, + booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, + pages={1725--1731}, + year={2017} +} +``` +## 数据准备 +### 数据来源 +训练及测试数据集选用[Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。 +每一行数据格式如下所示: +```bash +