From a3478a169e7a021ea3aa12a82cfa8fe5205466d7 Mon Sep 17 00:00:00 2001 From: yinhaofeng <1841837261@qq.com> Date: Fri, 18 Sep 2020 15:20:38 +0000 Subject: [PATCH] logistic_regression --- .../data/download_preprocess.py | 2 +- models/rank/logistic_regression/data/run.sh | 2 +- models/rank/logistic_regression/readme.md | 274 ++++++++++++++++++ 3 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 models/rank/logistic_regression/readme.md diff --git a/models/rank/logistic_regression/data/download_preprocess.py b/models/rank/logistic_regression/data/download_preprocess.py index 7a504b4f..9b46ff8a 100644 --- a/models/rank/logistic_regression/data/download_preprocess.py +++ b/models/rank/logistic_regression/data/download_preprocess.py @@ -28,7 +28,7 @@ if __name__ == '__main__': print("download and extract starting...") download_file_and_uncompress(url) - download_file(url2, "./aid_data/feat_dict_10.pkl2", True) + download_file(url2, "./deepfm%2Ffeat_dict_10.pkl2", True) print("download and extract finished") print("preprocessing...") diff --git a/models/rank/logistic_regression/data/run.sh b/models/rank/logistic_regression/data/run.sh index c2bc4ae8..a3255cb5 100644 --- a/models/rank/logistic_regression/data/run.sh +++ b/models/rank/logistic_regression/data/run.sh @@ -1,5 +1,5 @@ python download_preprocess.py - +mv ./deepfm%2Ffeat_dict_10.pkl2 sample_data/feat_dict_10.pkl2 mkdir slot_train_data for i in `ls ./train_data` do diff --git a/models/rank/logistic_regression/readme.md b/models/rank/logistic_regression/readme.md new file mode 100644 index 00000000..cd6ee716 --- /dev/null +++ b/models/rank/logistic_regression/readme.md @@ -0,0 +1,274 @@ +# 基于logistic_regression模型的点击率预估模型 + +## 介绍 +`CTR(Click Through Rate)`,即点击率,是“推荐系统/计算广告”等领域的重要指标,对其进行预估是商品推送/广告投放等决策的基础。简单来说,CTR预估对每次广告的点击情况做出预测,预测用户是点击还是不点击。CTR预估模型综合考虑各种因素、特征,在大量历史数据上训练,最终对商业决策提供帮助。本模型实现了下述论文中的logistic_regression模型: + +```text +@inproceedings{guo2017deepfm, + title={DeepFM: A 
Factorization-Machine based Neural Network for CTR Prediction}, + author={Huifeng Guo and Ruiming Tang and Yunming Ye and Zhenguo Li and Xiuqiang He}, + booktitle={the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI)}, + pages={1725--1731}, + year={2017} +} +``` + +## 数据准备 +### 数据来源 +训练及测试数据集选用[Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)所用的Criteo数据集。该数据集包括两部分:训练集和测试集。训练集包含一段时间内Criteo的部分流量,测试集则对应训练数据后一天的广告点击流量。 +每一行数据格式如下所示: +```bash +