diff --git a/models/rank/deepfm/data/download_preprocess.py b/models/rank/deepfm/data/download_preprocess.py index e163cba23c3a9bfdf3a1e2ef08369cd704c8f260..9b46ff8ab50c6cc7a7331088a7062ba8b1a54931 100755 --- a/models/rank/deepfm/data/download_preprocess.py +++ b/models/rank/deepfm/data/download_preprocess.py @@ -28,7 +28,7 @@ if __name__ == '__main__': print("download and extract starting...") download_file_and_uncompress(url) - download_file(url2, "./sample_data/feat_dict_10.pkl2", True) + download_file(url2, "./deepfm%2Ffeat_dict_10.pkl2", True) print("download and extract finished") print("preprocessing...") diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index 5ee0fcebc1f14891c81ce7c228e699f6c533d2a1..1d4c5c47d229cccd56b2e0ef5c4a1a2bb7b20ff7 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -79,8 +79,7 @@ class Reader(dg.MultiSlotDataGenerator): v = i[1] for j in v: s += " " + k + ":" + str(j) - print(s.strip()) - yield None + print(s.strip()) # add print for data preprocessing return data_iter diff --git a/models/rank/deepfm/data/run.sh b/models/rank/deepfm/data/run.sh index c2bc4ae8ce1d7ad7c89ebd48f993ab920fba0ba2..a3255cb5976f9d4380048166b3e1588110361bf1 100644 --- a/models/rank/deepfm/data/run.sh +++ b/models/rank/deepfm/data/run.sh @@ -1,5 +1,5 @@ python download_preprocess.py - +mv ./deepfm%2Ffeat_dict_10.pkl2 sample_data/feat_dict_10.pkl2 mkdir slot_train_data for i in `ls ./train_data` do diff --git a/models/rank/deepfm/readme.md b/models/rank/deepfm/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..f832811ce8159d1dccb3f0d689d8125600114911 --- /dev/null +++ b/models/rank/deepfm/readme.md @@ -0,0 +1,317 @@ +# 基于deepFM模型的点击率预估模型 + +## 介绍 +`CTR(Click Through Rate)`,即点击率,是“推荐系统/计算广告”等领域的重要指标,对其进行预估是商品推送/广告投放等决策的基础。简单来说,CTR预估对每次广告的点击情况做出预测,预测用户是点击还是不点击。CTR预估模型综合考虑各种因素、特征,在大量历史数据上训练,最终对商业决策提供帮助。本模型实现了下述论文中的deepFM模型: + +```text 
+@inproceedings{guo2017deepfm, + title={DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, + author={Huifeng Guo and Ruiming Tang and Yunming Ye and Zhenguo Li and Xiuqiang He}, + booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, + pages={1725--1731}, + year={2017} +} +``` +## 数据准备 +### 数据来源 +训练及测试数据集选用[Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。 +每一行数据格式如下所示: +```bash +