diff --git a/youtube_recall/README.cn.md b/youtube_recall/README.cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f20416bb9bf0d202bd38dd7a15b5ea447b0c472
--- /dev/null
+++ b/youtube_recall/README.cn.md
@@ -0,0 +1,396 @@
+# YouTube DNN Recommendation Model
+
+The files in this example directory and their descriptions:
+
+```
+├── README.md            # documentation
+├── README.cn.md         # documentation (Chinese edition)
+├── data                 # sample data
+│   └── data.tar         # sample data archive
+├── data_processor.py    # data pre-processing script
+├── infer.py             # prediction script
+├── infer_user.py        # script for user personalized recommendation
+├── item_vector.py       # script for extracting video vectors
+├── network_conf.py      # model network configuration
+├── reader.py            # data reader
+├── train.py             # training script
+├── user_vector.py       # script for extracting user vectors
+└── utils.py             # utilities
+```
+
+## Background\[[1](#references)\]
+YouTube is one of the largest video websites in the world. Its recommendation system helps more than one billion users discover personalized content from a massive video corpus. The system faces three main challenges:
+- Scale: many recommendation algorithms proven to work well on small problems cannot cope with YouTube's huge user base and content corpus, so highly specialized distributed learning algorithms and efficient online serving are required.
+- Freshness: YouTube's corpus is updated extremely frequently, with a large volume of videos uploaded every second. The system should promptly track newly uploaded videos and users' real-time behavior, and the model should balance recommending new and old videos well.
+- Noise: noise comes from two sources. First, users' historical behavior is sparse, subject to various unobservable external factors, and user satisfaction is ambiguous. Second, the content data itself is unstructured. The algorithms therefore need to be robust.
+
+The figure below shows the overall architecture of the recommendation system:
+<p align="center">
+<img src="images/recommendation_system.png"/><br/>
+Figure 1. Recommendation system architecture (from paper [1])
+</p>
+
+The recommendation system consists of two parts: candidate generation (recall) and ranking.
+- Recall model: it takes the user's historical behavior as input and retrieves a small set (on the order of hundreds) of videos highly relevant to the user from the large corpus. A user is represented by the videos he or she has clicked, the keywords he or she has searched, and demographic features.
+- Ranking model: it computes ranking scores with finer-grained features and sorts the candidate videos produced by the recall stage.
+
+This document describes the principle and usage of the recall model in detail.
+
+## Recall Model Overview
+The recommendation problem can be modeled as an "extreme multiclass classification" problem: at time ![](https://www.zhihu.com/equation?tex=t), for user ![](https://www.zhihu.com/equation?tex=U) (with known context ![](https://www.zhihu.com/equation?tex=C)), predict the class of the watched video ![](https://www.zhihu.com/equation?tex=i) in the corpus ![](https://www.zhihu.com/equation?tex=V),
+
+![](https://www.zhihu.com/equation?tex=%24P(%5Comega_t%3Di%7CU%2CC)%3D%5Cfrac%7Be%5E%7B%5Cmathbf%7Bv_i%7D%5Cmathbf%7Bu%7D%7D%7D%7B%5Csum_%7Bj%5Cin%20V%7D%5E%7B%20%7De%5E%7B%5Cmathbf%7Bv_j%7D%5Cmathbf%7Bu%7D%7D%7D)
+
+where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) is a high-dimensional vector representation of the <user, context> pair, and ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv_j%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) is a high-dimensional vector representation of video ![](https://www.zhihu.com/equation?tex=j). The goal of the DNN model is to learn the user's high-dimensional representation from the user and context information, and feed it into a softmax classifier to predict the watch probability of every video (class) in the corpus.
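+
+To make the formula concrete, here is a minimal NumPy sketch (illustrative only, with hypothetical small dimensions; it is not part of this example's code) that computes the watch probabilities from a user vector and a matrix of video vectors:
+
+```python
+import numpy as np
+
+# hypothetical sizes: a 4-dim user embedding and a corpus of 6 videos
+np.random.seed(0)
+u = np.random.randn(4)        # user vector u
+V = np.random.randn(6, 4)     # one row per video vector v_j
+
+# P(w_t = i | U, C) = exp(v_i . u) / sum_j exp(v_j . u)
+logits = V.dot(u)
+logits -= logits.max()        # subtract the max for numerical stability
+probs = np.exp(logits) / np.exp(logits).sum()
+
+print(probs.sum())            # 1.0: a distribution over the corpus
+print(probs.argsort()[::-1])  # video ids ranked by watch probability
+```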
+
+The figure below shows the network structure of the recall model:
+<p align="center">
+<img src="images/model_network.png"/><br/>
+Figure 2. Recall model network structure (from paper [1])
+</p>
+
+- Input layer: the user's watch sequence, search sequence, demographic features, and other contextual information.
+- Embedding layer: the user's watched-video sequence is passed through an embedding layer and then averaged over time; the search sequence is handled in the same way.
+- Hidden layers: three hidden layers with ReLU activations; the output of the last hidden layer is the high-dimensional vector representation ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D).
+- Output layer: a softmax layer that outputs the watch probability of every video (class) in the corpus. For online prediction, the internal parameters of the trained softmax layer are extracted as the high-dimensional vector representations ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) of the videos, and techniques such as Locality Sensitive Hashing can be used to query the N videos most relevant to ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D).
+
+## Data Pre-processing
+This example simulates users' video click logs as sample data, in the following format:
+```
+user-id \t province \t city \t clicked-video-info-sequence \t phone-model
+the clicked-video-info-sequence is formatted as video-info1;video-info2;...;video-infoK
+each video-info is formatted as video-id:category:tag1_tag2_tag3_...tagM
+For example:
+USER_ID_15 Shanghai Shanghai VIDEO_42:CATEGORY_9:TAG115;VIDEO_43:CATEGORY_9:TAG116_TAG115;VIDEO_44:CATEGORY_2:TAG117_TAG71 GO T5
+```
+Run the following commands in the `youtube_recall` directory (the same below) to extract the sample data:
+```
+cd data
+tar -zxvf data.tar
+```
+
+Then the script `data_processor.py` pre-processes the training data. Refer to the following usage:
+```
+usage: data_processor.py [-h] --train_set_path TRAIN_SET_PATH --output_dir
+                         OUTPUT_DIR [--feat_appear_limit FEAT_APPEAR_LIMIT]
+
+PaddlePaddle Youtube Recall Model Example
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --train_set_path TRAIN_SET_PATH
+                        path of the train set
+  --output_dir OUTPUT_DIR
+                        directory to output
+  --feat_appear_limit FEAT_APPEAR_LIMIT
+                        the minimum number of feature values appears (default:
+                        20)
+```
+This script does the following:
+- Filters low-frequency features (those appearing fewer than `feat_appear_limit` times in the samples), following the feature handling in \[[2](#references)\].
+- Encodes the features and generates the dictionary `feature_dict.pkl`.
+- Counts the occurrence probability of each video and saves it to `item_freq.pkl`, which is provided to the NCE layer.
+
+For example, run the following commands to complete the pre-processing:
+```shell
+mkdir output
+python data_processor.py --train_set_path=./data/train.txt \
+                         --output_dir=./output \
+                         --feat_appear_limit=20
+```
+
+## Model Implementation
+The implementation of each part of the network is described below; the code is in `./network_conf.py`.
+
+### Input layer
+```python
+def _build_input_layer(self):
+    """
+    build input layer
+    """
+    self._history_clicked_items = paddle.layer.data(
+        name="history_clicked_items",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_items'])))
+    self._history_clicked_categories = paddle.layer.data(
+        name="history_clicked_categories",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_categories'])))
+    self._history_clicked_tags = paddle.layer.data(
+        name="history_clicked_tags",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_tags'])))
+    self._user_id = paddle.layer.data(
+        name="user_id",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['user_id'])))
+    self._province = paddle.layer.data(
+        name="province",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['province'])))
+    self._city = paddle.layer.data(
+        name="city",
+        type=paddle.data_type.integer_value(len(self._feature_dict['city'])))
+    self._phone = paddle.layer.data(
+        name="phone",
+        type=paddle.data_type.integer_value(len(self._feature_dict['phone'])))
+    self._target_item = paddle.layer.data(
+        name="target_item",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['history_clicked_items'])))
+```
+
+### Embedding layer
+Each input feature is mapped to a fixed-size dense vector through an embedding layer.
+```python
+def _create_emb_attr(self, name):
+    """
+    create embedding parameter
+    """
+    return paddle.attr.Param(
+        name=name, initial_std=0.001, learning_rate=1, l2_rate=0,
+        sparse_update=False)
+
+def _build_embedding_layer(self):
+    """
+    build embedding layer
+    """
+    self._user_id_emb = paddle.layer.embedding(
+        input=self._user_id,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_user_id'))
+    self._province_emb = paddle.layer.embedding(
+        input=self._province,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_province'))
+    self._city_emb = paddle.layer.embedding(
+        input=self._city,
+        size=16,
+        param_attr=self._create_emb_attr('_proj_city'))
+    self._phone_emb = paddle.layer.embedding(
+        input=self._phone,
+        size=16,
+        param_attr=self._create_emb_attr('_proj_phone'))
+    self._history_clicked_items_emb = paddle.layer.embedding(
+        input=self._history_clicked_items,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+    self._history_clicked_categories_emb = paddle.layer.embedding(
+        input=self._history_clicked_categories,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_history_clicked_categories'))
+    self._history_clicked_tags_emb = paddle.layer.embedding(
+        input=self._history_clicked_tags,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+```
+
+### Hidden layers
+This example makes the following changes to the model in the \[[original paper](#references)\] (Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.):
+- After embedding, the sequence of videos the user clicked is no longer averaged; it is fed into an LSTM instead, so that the order of the clicks enters the model, and max pooling over time then yields a fixed-length vector. This lets the model learn hidden information related to the click order.
+- Considering the data scale and training performance, only two ReLU layers are used, which also works quite well.
+
+```python
+self._rnn_cell = paddle.networks.simple_lstm(
+    input=self._history_clicked_items_emb, size=64)
+self._lstm_last = paddle.layer.pooling(
+    input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+self._avg_emb_cats = paddle.layer.pooling(
+    input=self._history_clicked_categories_emb,
+    pooling_type=paddle.pooling.Avg())
+self._avg_emb_tags = paddle.layer.pooling(
+    input=self._history_clicked_tags_emb,
+    pooling_type=paddle.pooling.Avg())
+self._fc_0 = paddle.layer.fc(
+    name="Relu1",
+    input=[
+        self._lstm_last, self._user_id_emb, self._province_emb,
+        self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
+        self._phone_emb
+    ],
+    size=self._dnn_layer_dims[0],
+    act=paddle.activation.Relu())
+
+self._fc_1 = paddle.layer.fc(
+    name="Relu2",
+    input=self._fc_0,
+    size=self._dnn_layer_dims[1],
+    act=paddle.activation.Relu())
+```
+
+### Output layer
+To speed up training, noise-contrastive estimation (NCE)\[[3](#references)\] is used. The `item_freq.pkl` produced in [Data Pre-processing](#data-pre-processing), i.e. the distribution of the negative samples, is passed to the NCE layer as a parameter.
+```python
+return paddle.layer.nce(
+    input=self._fc_1,
+    label=self._target_item,
+    num_classes=len(self._feature_dict['history_clicked_items']),
+    param_attr=paddle.attr.Param(name="nce_w"),
+    bias_attr=paddle.attr.Param(name="nce_b"),
+    act=paddle.activation.Sigmoid(),
+    num_neg_samples=5,
+    neg_distribution=self._item_freq)
+```
+
+## Training
+First, prepare `reader.py`, which converts the features in the raw input into encoded feature ids. From one line of training data it produces several training samples for the trainer according to `window_size`, for example:
+```
+window_size=2
+raw data:
+user-id \t province \t city \t video-info1;video-info2;...;video-infoK \t phone-model
+generated training samples:
+user-id, province, city, [, clicked-video1], [, category1], [, tags1], phone-model, clicked-video2
+user-id, province, city, [clicked-video1, clicked-video2], [category1, category2], [tags1, tags2], phone-model, clicked-video3
+user-id, province, city, [clicked-video2, clicked-video3], [category2, category3], [tags2, tags3], phone-model, clicked-video4
+......
+```
+The relevant code is as follows:
+```python
+for i in range(1, len(history_clicked_items_all)):
+    start = max(0, i - self._window_size)
+    history_clicked_items = history_clicked_items_all[start:i]
+    history_clicked_categories = history_clicked_categories_all[start:i]
+    history_clicked_tags_str = history_clicked_tags_all[start:i]
+    history_clicked_tags = []
+    for tags_a in history_clicked_tags_str:
+        for tag in tags_a.split("_"):
+            history_clicked_tags.append(int(tag))
+    target_item = history_clicked_items_all[i]
+    yield user_id, province, city, \
+        history_clicked_items, history_clicked_categories, \
+        history_clicked_tags, phone, target_item
+```
+```python
+reader = Reader(feature_dict, args.window_size)
+trainer.train(
+    paddle.batch(
+        paddle.reader.shuffle(
+            lambda: reader.train(args.train_set_path),
+            buf_size=7000), args.batch_size),
+    num_passes=args.num_passes,
+    feeding=feeding,
+    event_handler=event_handler)
+```
+Then training can start; run the following commands:
+```shell
+mkdir output/model
+python train.py --train_set_path='./data/train.txt' \
+    --test_set_path='./data/test.txt' \
+    --model_output_dir='./output/model/' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --item_freq='./output/item_freq.pkl'
+```
+
+## Offline Prediction
+Given a user's features as input, output the top-N videos the user is most likely to watch; run:
+```shell
+python infer.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+```
+
+## Online Prediction
+For online prediction, an approximate nearest neighbor (ANN) algorithm is used to directly query the top-N most relevant video vectors with the user vector, and the corresponding videos are recommended to the user. The following describes how to obtain the user vector and the video vectors.
+
+### User Vector
+The output of the last ReLU layer, with a constant term 1 prepended, is used as the user vector. Here the last ReLU layer has 31 dimensions, so the concatenated user vector has 32 dimensions:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%3D%5B1%2Cu_1%2Cu_2%2C...%2Cu_%7B31%7D%5D)
+
+### Video Vector
+The video vectors are extracted from the parameters of the trained softmax layer. Suppose there are M distinct videos; the softmax layer outputs the click probability of each of these M videos:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D%3D%5Bs_1%2Cs_2%2C...%2Cs_%7BM%7D%5D)
+
+From the user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) output by the last ReLU layer to the M video probabilities ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D), the mapping is a multiplication by a ![](https://www.zhihu.com/equation?tex=32%5Ctimes%20M) matrix formed by the softmax parameters w and b. Each column of this matrix is a 32-dimensional video vector, in dictionary order, one to one:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Ccdot%20%5Cbegin%7Bbmatrix%7D%0A%20b_1%20%20%26%20b_2%20%26%20%20%5Ccdots%20%26%20b_M%20%5C%5C%20%0A%20w_%7B11%7D%20%26%20w_%7B21%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM1%7D%20%5C%5C%20%0A%20w_%7B12%7D%20%26%20w_%7B22%7D%20%26%20%20%20%5Ccdots%20%26%20w_%7BM2%7D%20%20%5C%5C%20%0A%5Cvdots%20%26%20%5Cvdots%20%26%20%20%5Cvdots%20%26%20%5Cvdots%20%5C%5C%20%0Aw_%7B131%7D%20%26%20%20w_%7B231%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM31%7D%20%20%0A%5Cend%7Bbmatrix%7D_%7B32%5Ctimes%20M%7D%20%3D%20%5Cmathbf%7Bu%7D%20%5Ccdot%20%20%5Cbegin%7Bbmatrix%7D%20%0A%5Cmathbf%7Bv_1%7D%2C%20%5Cmathbf%7Bv_2%7D%2C%20%5Ccdots%2C%20%5Cmathbf%7Bv_M%7D%20%0A%5Cend%7Bbmatrix%7D_%7B1%5Ctimes%20M%7D%3D%5Cmathbf%7Bo%7D)
+
+### SIMPLE-LSH Conversion
+Many ANN algorithms only support cosine distance, while the model ranks by inner product, and the two can give noticeably different results. The solution here is to apply the SIMPLE-LSH conversion\[[4](#references)\] to the user and video vectors obtained above, so that ranking by inner product becomes equivalent to ranking by cosine.
+
+Specifically:
+- For a video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) with ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m), the converted vector is ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D) with ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D).
+
+- For a user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), the converted vector is ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D) with ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D), where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D) is ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) normalized to unit length.
+
+Online, for a user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) that recalls ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner product, applying the conversion ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) does not change the order given by inner-product ranking. And since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), so cosine-based ANN recall can be used directly with an equivalent result.
+
+To retain precision online, the division by ![](https://www.zhihu.com/equation?tex=m) can be dropped, i.e. ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D); the ranking is still equivalent.
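+
+As a quick check of this equivalence, the following NumPy sketch (hypothetical random vectors, not part of this example's code) converts a handful of video vectors and one user vector, then verifies that cosine ranking over the converted vectors matches inner-product ranking over the originals:
+
+```python
+import numpy as np
+
+np.random.seed(0)
+u = np.random.randn(4)             # user vector
+V = np.random.randn(6, 4)          # 6 video vectors, one per row
+
+# SIMPLE-LSH: append sqrt(m^2 - ||v||^2) to every video vector,
+# and append 0 to the length-normalized user vector
+norms2 = (V ** 2).sum(axis=1)      # squared norms of the video vectors
+m2 = norms2.max()                  # m^2, with m the largest norm
+V_t = np.hstack([V, np.sqrt(m2 - norms2)[:, None]])
+u_t = np.append(u / np.linalg.norm(u), 0.0)
+
+inner_rank = np.argsort(-V.dot(u)) # ranking by the original inner product
+cosine = V_t.dot(u_t) / np.linalg.norm(V_t, axis=1)
+print(np.array_equal(inner_rank, np.argsort(-cosine)))  # True: same order
+```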
+
+### Implementation
+Use `user_vector.py` to obtain the user vector: the user features are fed through the network for prediction, `probs[1]` stores the output of the last ReLU layer, a constant 1 is prepended, and the SIMPLE-LSH conversion is applied (a 0 is appended and the vector is normalized):
+```python
+probs = inferer.infer(
+    input=test_batch,
+    feeding=feeding,
+    field=["value"],
+    flatten_result=False)
+for i, res in enumerate(zip(probs[1])):
+    # do simple lsh conversion
+    user_vector = [1.000]
+    for i in res[0]:
+        user_vector.append(i)
+    user_vector.append(0.000)
+    norm = np.linalg.norm(user_vector)
+    user_vector_norm = [str(_ / norm) for _ in user_vector]
+    print ",".join(user_vector_norm)
+```
+
+Use `item_vector.py` to obtain the video vectors. Load the model, extract the parameters `nce_w` and `nce_b`, and assemble the M video vectors: the first dimension of the i-th video vector is `nce_b[0][i]`, followed by `nce_w[i]`. Then apply the SIMPLE-LSH conversion: find the maximum norm among all vectors and process each vector according to ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D).
+```python
+# load the trained model.
+with gzip.open(args.model_path) as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+nce_w = parameters.get("nce_w")
+nce_b = parameters.get("nce_b")
+item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+
+
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+```
+
+Run the scripts with the following commands:
+```shell
+python user_vector.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+python item_vector.py --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl'
+```
+## Offline Mining
+Because real-time recall consumes a lot of machine resources, the data can also be mined offline and the online recall can serve the mined results. Data such as the hottest videos, user personalized recommendations, and related videos can be produced this way. The example below produces user personalized data:
+```
+python infer_user.py --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl'
+```
+
+## References
+1. Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
+2. https://code.google.com/archive/p/word2vec/
+3. http://paddlepaddle.org/docs/develop/models/nce_cost/README.html
+4. Neyshabur, Behnam, and Nathan Srebro. "On symmetric and asymmetric LSHs for inner product search." arXiv preprint arXiv:1410.5518 (2014).
diff --git a/youtube_recall/README.md b/youtube_recall/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b67bd33660b72a11444db1880f426102ac5c76d3
--- /dev/null
+++ b/youtube_recall/README.md
@@ -0,0 +1,382 @@
+# Deep Neural Networks for YouTube Recommendations
+
+## Introduction\[[1](#References)\]
+YouTube is the world's largest platform for creating, sharing and discovering video content. YouTube recommendations are responsible for helping more than a billion users discover personalized content from an ever-growing corpus of videos.
+- Scale: Many existing recommendation algorithms proven to work well on small problems fail to operate at YouTube's massive scale. Highly specialized distributed learning algorithms and efficient serving systems are essential.
+- Freshness: YouTube has a very dynamic corpus, with many hours of video uploaded per second. The recommendation system should model newly uploaded content as well as the latest actions taken by the user.
+- Noise: Historical user behavior on YouTube is inherently difficult to predict due to sparsity and a variety of unobservable external factors. Furthermore, only noisy implicit feedback signals are observed instead of the ground truth of user satisfaction, and the metadata associated with content is poorly structured. These traits force the algorithms to be robust.
+
+The overall structure of the recommendation system is illustrated in Figure 1.
+<p align="center">
+<img src="images/recommendation_system.png"/><br/>
+Figure 1. Recommendation system architecture[1]
+</p>
+
+The system is comprised of two neural networks: one for candidate generation and one for ranking.
+- The candidate generation network: it takes events from the user's YouTube activity history as input and retrieves a small subset (hundreds) of videos, highly relevant to the user, from a large corpus. The similarity between users is expressed in terms of coarse features such as IDs of video watches, search query tokens and demographics.
+- The ranking network: it assigns a score to each video according to a desired objective function, using a rich set of features describing the video and the user.
+
+This document describes the principle and use of the candidate generation network in detail.
+
+## Candidate Generation
+Here, candidate generation is modeled as extreme multiclass classification, where the prediction problem becomes accurately classifying a specific video watch ![](https://www.zhihu.com/equation?tex=%5Comega_t) at time ![](https://www.zhihu.com/equation?tex=t) among millions of videos ![](https://www.zhihu.com/equation?tex=i) (classes) from a corpus ![](https://www.zhihu.com/equation?tex=V), based on user ![](https://www.zhihu.com/equation?tex=U) and context ![](https://www.zhihu.com/equation?tex=C),
+
+![](https://www.zhihu.com/equation?tex=%24P(%5Comega_t%3Di%7CU%2CC)%3D%5Cfrac%7Be%5E%7B%5Cmathbf%7Bv_i%7D%5Cmathbf%7Bu%7D%7D%7D%7B%5Csum_%7Bj%5Cin%20V%7D%5E%7B%20%7De%5E%7B%5Cmathbf%7Bv_j%7D%5Cmathbf%7Bu%7D%7D%7D)
+
+where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) represents a high-dimensional "embedding" of the (user, context) pair and ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv_j%7D%5Cin%20%5Cmathbb%7BR%7D%5EN) represents the embedding of each candidate video. The task of the deep neural network is to learn user embeddings ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), as a function of the user's history and context, that are useful for discriminating among videos with a softmax classifier.
+
+Figure 2 shows the general network architecture of the candidate generation model:
+<p align="center">
+<img src="images/model_network.png"/><br/>
+Figure 2. Candidate generation model architecture[1]
+</p>
+
+- Input layer: A user's watch history is represented by a variable-length sequence of sparse video IDs, and search history is similarly represented by a variable-length sequence of search tokens.
+- Embedding layer: Each of the input features is mapped to a fixed-size dense vector representation via embeddings, which are then simply averaged. The embeddings are learned jointly with all other model parameters through normal gradient descent back-propagation updates.
+- Hidden layer: Features are concatenated into a wide first layer, followed by several layers of fully connected Rectified Linear Units (ReLU). The output of the last ReLU layer is the previously mentioned high-dimensional "embedding" of the user ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), the so-called user vector.
+- Output layer: A softmax classifier discriminates among millions of classes (videos). To speed up the training process, a technique is applied that samples negative classes from a background distribution with importance weighting. The previously mentioned high-dimensional "embedding" of a candidate video ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) is obtained from the weight and bias of the softmax layer. At serving time, the most likely N classes (videos) are computed for presentation to the user. To score millions of items under a strict serving latency, the scoring problem reduces to a nearest neighbor search in the dot product space, relying on Locality Sensitive Hashing.
+
+## Data Pre-processing
+In this example, users' click logs are simulated as sample data, in the following format:
+```
+user-id \t province \t city \t history-clicked-video-info-sequence \t phone
+
+history-clicked-video-info-sequence is formatted as
+video-info1;video-info2;...;video-infoK
+
+video-info is formatted as
+video-id:category:tag1_tag2_tag3_...tagM
+
+For example:
+USER_ID_15 Shanghai Shanghai VIDEO_42:CATEGORY_9:TAG115;VIDEO_43:CATEGORY_9:TAG116_TAG115;VIDEO_44:CATEGORY_2:TAG117_TAG71 GO T5
+```
+Run these commands in the `youtube_recall` directory (the same below) to prepare the sample data:
+```
+cd data
+tar -zxvf data.tar
+```
+
+Then, run `data_processor.py` for data pre-processing. Refer to the following usage:
+```
+usage: data_processor.py [-h] --train_set_path TRAIN_SET_PATH --output_dir
+                         OUTPUT_DIR [--feat_appear_limit FEAT_APPEAR_LIMIT]
+
+PaddlePaddle Deep Candidate Generation Example
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --train_set_path TRAIN_SET_PATH
+                        path of the train set
+  --output_dir OUTPUT_DIR
+                        directory to output
+  --feat_appear_limit FEAT_APPEAR_LIMIT
+                        the minimum number of feature values appears (default:
+                        20)
+```
+The function of this script is as follows:
+- Filter low-frequency features\[[2](#References)\], which appear less than `feat_appear_limit` times.
+- Encode features, and generate the dictionary `feature_dict.pkl`.
+- Count the occurrence probability of each video and write it into `item_freq.pkl`, to be provided to the NCE layer.
+
+For example, run the following commands to accomplish data pre-processing:
+```
+mkdir output
+python data_processor.py --train_set_path=./data/train.txt \
+                         --output_dir=./output \
+                         --feat_appear_limit=20
+```
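+
+To make the input format concrete, here is a minimal sketch (not part of `data_processor.py`; the sample line follows the format described above) that parses one log line in plain Python:
+
+```python
+# -*- coding: utf-8 -*-
+line = ("USER_ID_15\tShanghai\tShanghai\t"
+        "VIDEO_42:CATEGORY_9:TAG115;VIDEO_43:CATEGORY_9:TAG116_TAG115\tGO T5")
+
+user_id, province, city, video_seq, phone = line.strip('\n').split('\t')
+for video_info in video_seq.split(';'):
+    video_id, category, tags = video_info.split(':')
+    print("%s %s %s" % (video_id, category, tags.split('_')))
+# VIDEO_42 CATEGORY_9 ['TAG115']
+# VIDEO_43 CATEGORY_9 ['TAG116', 'TAG115']
+```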
+
+## Model Implementation
+The details of the model implementation are described below. The code is in `./network_conf.py`.
+
+### Input layer
+```python
+def _build_input_layer(self):
+    """
+    build input layer
+    """
+    self._history_clicked_items = paddle.layer.data(
+        name="history_clicked_items",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_items'])))
+    self._history_clicked_categories = paddle.layer.data(
+        name="history_clicked_categories",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_categories'])))
+    self._history_clicked_tags = paddle.layer.data(
+        name="history_clicked_tags",
+        type=paddle.data_type.integer_value_sequence(
+            len(self._feature_dict['history_clicked_tags'])))
+    self._user_id = paddle.layer.data(
+        name="user_id",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['user_id'])))
+    self._province = paddle.layer.data(
+        name="province",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['province'])))
+    self._city = paddle.layer.data(
+        name="city",
+        type=paddle.data_type.integer_value(len(self._feature_dict['city'])))
+    self._phone = paddle.layer.data(
+        name="phone",
+        type=paddle.data_type.integer_value(len(self._feature_dict['phone'])))
+    self._target_item = paddle.layer.data(
+        name="target_item",
+        type=paddle.data_type.integer_value(
+            len(self._feature_dict['history_clicked_items'])))
+```
+
+### Embedding layer
+Each of the input features is mapped to a fixed-size dense vector representation.
+```python
+def _create_emb_attr(self, name):
+    """
+    create embedding parameter
+    """
+    return paddle.attr.Param(
+        name=name, initial_std=0.001, learning_rate=1, l2_rate=0,
+        sparse_update=False)
+
+def _build_embedding_layer(self):
+    """
+    build embedding layer
+    """
+    self._user_id_emb = paddle.layer.embedding(
+        input=self._user_id,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_user_id'))
+    self._province_emb = paddle.layer.embedding(
+        input=self._province,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_province'))
+    self._city_emb = paddle.layer.embedding(
+        input=self._city,
+        size=16,
+        param_attr=self._create_emb_attr('_proj_city'))
+    self._phone_emb = paddle.layer.embedding(
+        input=self._phone,
+        size=16,
+        param_attr=self._create_emb_attr('_proj_phone'))
+    self._history_clicked_items_emb = paddle.layer.embedding(
+        input=self._history_clicked_items,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+    self._history_clicked_categories_emb = paddle.layer.embedding(
+        input=self._history_clicked_categories,
+        size=8,
+        param_attr=self._create_emb_attr('_proj_history_clicked_categories'))
+    self._history_clicked_tags_emb = paddle.layer.embedding(
+        input=self._history_clicked_tags,
+        size=64,
+        param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+```
+
+### Hidden layer
+This example improves on the original network in \[[1](#References)\] (Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.):
+- The embeddings of video watches are not simply averaged; instead, they are connected to an LSTM layer with max temporal pooling, so that deep sequential information related to user interests can be learned.
+- Considering the data scale and training efficiency, only two ReLU layers are applied, which still leads to good performance.
+
+```python
+self._rnn_cell = paddle.networks.simple_lstm(
+    input=self._history_clicked_items_emb, size=64)
+self._lstm_last = paddle.layer.pooling(
+    input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+self._avg_emb_cats = paddle.layer.pooling(
+    input=self._history_clicked_categories_emb,
+    pooling_type=paddle.pooling.Avg())
+self._avg_emb_tags = paddle.layer.pooling(
+    input=self._history_clicked_tags_emb,
+    pooling_type=paddle.pooling.Avg())
+self._fc_0 = paddle.layer.fc(
+    name="Relu1",
+    input=[
+        self._lstm_last, self._user_id_emb, self._province_emb,
+        self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
+        self._phone_emb
+    ],
+    size=self._dnn_layer_dims[0],
+    act=paddle.activation.Relu())
+
+self._fc_1 = paddle.layer.fc(
+    name="Relu2",
+    input=self._fc_0,
+    size=self._dnn_layer_dims[1],
+    act=paddle.activation.Relu())
+```
+
+### Output layer
+To speed up the training process, noise-contrastive estimation (NCE)\[[3](#References)\] is applied to sample negative classes from a background distribution with importance weighting. The `item_freq.pkl` produced in [data pre-processing](#data-pre-processing) is used as the `neg_distribution`.
+```python
+return paddle.layer.nce(
+    input=self._fc_1,
+    label=self._target_item,
+    num_classes=len(self._feature_dict['history_clicked_items']),
+    param_attr=paddle.attr.Param(name="nce_w"),
+    bias_attr=paddle.attr.Param(name="nce_b"),
+    num_neg_samples=5,
+    neg_distribution=self._item_freq)
+```
+
+## Train
+First of all, prepare `reader.py`, which converts raw features into encoded feature ids. One piece of training data generates several data instances according to `window_size`, which are then fed into the trainer:
+```
+window_size=2
+train data:
+user-id \t province \t city \t video-info1;video-info2;...;video-infoK \t phone
+
+several data instances:
+user-id,province,city,[,video-id1],[,category1],[,tags1],phone,video-id2
+user-id,province,city,[video-id1,video-id2],[category1,category2],[tags1,tags2],phone,video-id3
+user-id,province,city,[video-id2,video-id3],[category2,category3],[tags2,tags3],phone,video-id4
+......
+```
+The relevant code is as follows:
+```python
+for i in range(1, len(history_clicked_items_all)):
+    start = max(0, i - self._window_size)
+    history_clicked_items = history_clicked_items_all[start:i]
+    history_clicked_categories = history_clicked_categories_all[start:i]
+    history_clicked_tags_str = history_clicked_tags_all[start:i]
+    history_clicked_tags = []
+    for tags_a in history_clicked_tags_str:
+        for tag in tags_a.split("_"):
+            history_clicked_tags.append(int(tag))
+    target_item = history_clicked_items_all[i]
+    yield user_id, province, city, \
+        history_clicked_items, history_clicked_categories, \
+        history_clicked_tags, phone, target_item
+```
+```python
+reader = Reader(feature_dict, args.window_size)
+trainer.train(
+    paddle.batch(
+        paddle.reader.shuffle(
+            lambda: reader.train(args.train_set_path),
+            buf_size=7000), args.batch_size),
+    num_passes=args.num_passes,
+    feeding=feeding,
+    event_handler=event_handler)
+```
+Then start training:
+```shell
+mkdir output/model
+python train.py --train_set_path='./data/train.txt' \
+    --test_set_path='./data/test.txt' \
+    --model_output_dir='./output/model/' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --item_freq='./output/item_freq.pkl'
+```
+
+## Offline prediction
+Input user-related features, and get the N videos the user is most likely to watch:
+```shell
+python infer.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+```
+
+## Online prediction
+For online prediction, approximate nearest neighbor (ANN) search is adopted to directly recall the top-N most likely watched videos with the user vector. The following shows how to obtain the user vector and the video vectors.
+
+### User Vector
+The user vector is the output of the last ReLU layer with a constant term 1 prepended. Here the dimension of the last ReLU layer is 31, so the dimension of the user vector is 32:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%3D%5B1%2Cu_1%2Cu_2%2C...%2Cu_%7B31%7D%5D)
+
+### Video Vector
+Video vectors are extracted from the parameters of the softmax layer. If there are M different videos, the output of the softmax layer is the click probability of each of these M videos:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D%3D%5Bs_1%2Cs_2%2C...%2Cs_%7BM%7D%5D)
+
+To get ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bo%7D) from the user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), the vector is multiplied by a ![](https://www.zhihu.com/equation?tex=32%5Ctimes%20M) matrix consisting of the softmax parameters w and b. Each column of this matrix is a 32-dimensional video vector, in dictionary order, one by one:
+
+![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Ccdot%20%5Cbegin%7Bbmatrix%7D%0A%20b_1%20%20%26%20b_2%20%26%20%20%5Ccdots%20%26%20b_M%20%5C%5C%20%0A%20w_%7B11%7D%20%26%20w_%7B21%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM1%7D%20%5C%5C%20%0A%20w_%7B12%7D%20%26%20w_%7B22%7D%20%26%20%20%20%5Ccdots%20%26%20w_%7BM2%7D%20%20%5C%5C%20%0A%5Cvdots%20%26%20%5Cvdots%20%26%20%20%5Cvdots%20%26%20%5Cvdots%20%5C%5C%20%0Aw_%7B131%7D%20%26%20%20w_%7B231%7D%20%26%20%20%5Ccdots%20%20%26%20w_%7BM31%7D%20%20%0A%5Cend%7Bbmatrix%7D_%7B32%5Ctimes%20M%7D%20%3D%20%5Cmathbf%7Bu%7D%20%5Ccdot%20%20%5Cbegin%7Bbmatrix%7D%20%0A%5Cmathbf%7Bv_1%7D%2C%20%5Cmathbf%7Bv_2%7D%2C%20%5Ccdots%2C%20%5Cmathbf%7Bv_M%7D%20%0A%5Cend%7Bbmatrix%7D_%7B1%5Ctimes%20M%7D%3D%5Cmathbf%7Bo%7D)
+
+### SIMPLE-LSH conversion
+
+However, most ANN systems currently only support sorting by cosine similarity rather than by inner product, which leads to a big difference in effect.
+
+To solve this, the user and video vectors are silently modified by a SIMPLE-LSH conversion\[[4](#References)\], so that sorting by inner product is equivalent to sorting by cosine after the conversion.
+
+Details are as follows:
+- For a video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m). The modified video vector is ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D).
+
+- For a user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), the modified user vector is ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D), where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D) is the normalized ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D).
+
+When predicting online, an incoming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D) should recall ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner-product ranking. After the conversion ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), the order given by inner-product ranking is unchanged. Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which makes cosine-only ANN systems work with equivalent results.
+
+In order to retain precision, using ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) (without dividing by ![](https://www.zhihu.com/equation?tex=m)) is also equivalent.
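+
+As a sanity check of the recall step, the brute-force sketch below (hypothetical random vectors; a production system would answer the query with an ANN index instead) ranks converted video vectors for one user by cosine similarity and takes the top N:
+
+```python
+import numpy as np
+
+np.random.seed(1)
+N = 3                                        # recommend the top-3 videos
+u = np.random.randn(32)                      # hypothetical 32-dim user vector
+u_t = np.append(u / np.linalg.norm(u), 0.0)  # converted user vector: normalize, append 0
+V_t = np.random.randn(1000, 33)              # 1000 hypothetical converted video vectors
+
+# cosine similarity between the user and every video
+scores = V_t.dot(u_t) / (np.linalg.norm(V_t, axis=1) * np.linalg.norm(u_t))
+top_n = scores.argsort()[::-1][:N]           # indices of the N most similar videos
+print(top_n)
+print(scores[top_n])
+```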
+
+### Implementation
+Run `user_vector.py` to generate the user vector. The features are fed into the network for inference; the output of the last ReLU layer is stored in `probs[1]`. By prepending a constant term 1 and applying the SIMPLE-LSH conversion (appending a 0 and normalizing), the user vector is generated:
+```python
+probs = inferer.infer(
+    input=test_batch,
+    feeding=feeding,
+    field=["value"],
+    flatten_result=False)
+for i, res in enumerate(zip(probs[1])):
+    # do simple lsh conversion
+    user_vector = [1.000]
+    for i in res[0]:
+        user_vector.append(i)
+    user_vector.append(0.000)
+    norm = np.linalg.norm(user_vector)
+    user_vector_norm = [str(_ / norm) for _ in user_vector]
+    print ",".join(user_vector_norm)
+```
+
+Run `item_vector.py` to generate the video vectors. First load the model and extract the parameters `nce_w` and `nce_b`. Then generate the i-th video vector by putting `nce_b[0][i]` in the first dimension and `nce_w[i]` in the following dimensions. Finally apply the SIMPLE-LSH conversion: find the maximum norm and process each vector according to ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D).
+
+```python
+# load the trained model.
+with gzip.open(args.model_path) as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+nce_w = parameters.get("nce_w")
+nce_b = parameters.get("nce_b")
+item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+
+
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+```
+
+Use `user_vector.py` and `item_vector.py` to calculate the user and item vectors. For example, run the following commands:
+```shell
+python user_vector.py --infer_set_path='./data/infer.txt' \
+    --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl' \
+    --batch_size=50
+python item_vector.py --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl'
+```
+
+## Offline data mining
+Since online prediction inevitably consumes a large amount of machine resources, an alternative is to mine data offline (e.g. hottest videos, user personalized recommendations, item-based recommendations) and let the online system access the mined results directly. The following shows an example of generating user personalized recommendations:
+```
+python infer_user.py --model_path='./output/model/model_pass_00000.tar.gz' \
+    --feature_dict='./output/feature_dict.pkl'
+```
+
+## References
+1. Covington, Paul, Jay Adams, and Emre Sargin. "Deep neural networks for youtube recommendations." Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.
+2. https://code.google.com/archive/p/word2vec/
+3. http://paddlepaddle.org/docs/develop/models/nce_cost/README.html
+4. Neyshabur, Behnam, and Nathan Srebro. "On symmetric and asymmetric LSHs for inner product search." arXiv preprint arXiv:1410.5518 (2014).
diff --git a/youtube_recall/data/data.tar b/youtube_recall/data/data.tar
new file mode 100644
index 0000000000000000000000000000000000000000..3191924203d3a859df990392effc53f1bc74887c
Binary files /dev/null and b/youtube_recall/data/data.tar differ
diff --git a/youtube_recall/data_processor.py b/youtube_recall/data_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..f52675e54b1fb2a997b5bf099b5a137887fdbf27
--- /dev/null
+++ b/youtube_recall/data_processor.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+import argparse
+import os
+import cPickle
+
+from utils import logger
+"""
+This script will output 2 files:
+1. feature_dict.pkl
+2. item_freq.pkl
+"""
+
+
+class FeatureGenerator(object):
+    """
+    Encode feature values with low-frequency filtering.
+ """ + + def __init__(self, feat_appear_limit=20): + """ + @feat_appear_limit: int + """ + self._dic = None # feature value --> id + self._count = None # numbers of appearances of feature values + self._feat_appear_limit = feat_appear_limit + + def add_feat_val(self, feat_val): + """ + Add feature values and count numbers of its appearance. + """ + if self._count is None: + self._count = {'': 0} + if feat_val == "NULL": + feat_val = '' + if feat_val not in self._count: + self._count[feat_val] = 1 + else: + self._count[feat_val] += 1 + self._count[''] += 1 + + def _filter_feat(self): + """ + Filter low-frequency feature values. + """ + self._items = filter(lambda x: x[1] > self._feat_appear_limit, + self._count.items()) + self._items.sort(key=lambda x: x[1], reverse=True) + + def _build_dict(self): + """ + Build feature values --> ids dict. + """ + self._dic = {} + self._filter_feat() + for i in xrange(len(self._items)): + self._dic[self._items[i][0]] = i + self.dim = len(self._dic) + + def get_feat_id(self, feat_val): + """ + Get id of feature value after encoding. + """ + # build dict + if self._dic is None: + self._build_dict() + + # find id + if feat_val in self._dic: + return self._dic[feat_val] + else: + return self._dic[''] + + def get_dim(self): + """ + Get dim. + """ + # build dict + if self._dic is None: + self._build_dict() + return len(self._dic) + + def get_dict(self): + """ + Get dict. + """ + # build dict + if self._dic is None: + self._build_dict() + return self._dic + + def get_total_count(self): + """ + Compute total num of count. + """ + total_count = 0 + for i in xrange(len(self._items)): + feat_val = self._items[i][0] + c = self._items[i][1] + total_count += c + return total_count + + def count_iterator(self): + """ + Iterate feature values and its num of appearance. + """ + for i in xrange(len(self._items)): + yield self._items[i][0], self._items[i][1] + + def __repr__(self): + """ + """ + return '' % self._dim + + +def scan_build_dict(data_path, features_dict): + """ + Scan the raw data and add all feature values. 
+ """ + logger.info('scan data set') + + with open(data_path, 'r') as f: + for (line_id, line) in enumerate(f): + fields = line.strip('\n').split('\t') + user_id = fields[0] + province = fields[1] + features_dict['province'].add_feat_val(province) + city = fields[2] + features_dict['city'].add_feat_val(city) + item_infos = fields[3] + phone = fields[4] + features_dict['phone'].add_feat_val(phone) + for item_info in item_infos.split(";"): + item_info_array = item_info.split(":") + item = item_info_array[0] + features_dict['history_clicked_items'].add_feat_val(item) + features_dict['user_id'].add_feat_val(user_id) + category = item_info_array[1] + features_dict['history_clicked_categories'].add_feat_val( + category) + tags = item_info_array[2] + for tag in tags.split("_"): + features_dict['history_clicked_tags'].add_feat_val(tag) + + +def parse_args(): + """ + parse arguments + """ + parser = argparse.ArgumentParser( + description="PaddlePaddle Youtube Recall Model Example") + parser.add_argument( + '--train_set_path', + type=str, + required=True, + help="path of the train set") + parser.add_argument( + '--output_dir', type=str, required=True, help="directory to output") + parser.add_argument( + '--feat_appear_limit', + type=int, + default=20, + help="the minimum number of feature values appears (default: 20)") + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + + # check argument + assert os.path.exists( + args.train_set_path), 'The train set path does not exist.' + + # features used + features = [ + 'user_id', 'province', 'city', 'phone', 'history_clicked_items', + 'history_clicked_tags', 'history_clicked_categories' + ] + + # init feature generators + features_dict = {} + for feature in features: + features_dict[feature] = FeatureGenerator( + feat_appear_limit=args.feat_appear_limit) + + # scan data for building dict + scan_build_dict(args.train_set_path, features_dict) + + # generate feature_dict.pkl + feature_encoding_dict = {} + for feature in features: + d = features_dict[feature].get_dict() + feature_encoding_dict[feature] = d + logger.info('Feature:%s, dimension is %d' % (feature, len(d))) + output_dict_path = os.path.join(args.output_dir, 'feature_dict.pkl') + with open(output_dict_path, "w") as f: + cPickle.dump(feature_encoding_dict, f, -1) + + # generate item_freq.pkl + item_freq_list = [] + g = features_dict['history_clicked_items'] + total_count = g.get_total_count() + for feat_val, feat_count in g.count_iterator(): + item_freq_list.append(float(feat_count) / total_count) + logger.info('item_freq, dimension is %d' % (len(item_freq_list))) + output_item_freq_path = os.path.join(args.output_dir, 'item_freq.pkl') + with open(output_item_freq_path, "w") as f: + cPickle.dump(item_freq_list, f, -1) + + logger.info('Complete!') diff --git a/youtube_recall/images/model_network.png b/youtube_recall/images/model_network.png new file mode 100644 index 0000000000000000000000000000000000000000..ab2ce43d030cb8406b232ef5ff7a3fa1361e22f4 Binary files /dev/null and b/youtube_recall/images/model_network.png differ diff --git a/youtube_recall/images/recommendation_system.png b/youtube_recall/images/recommendation_system.png new file mode 100644 index 0000000000000000000000000000000000000000..dcb9634e130eebb3e0b06aedf5520d9492df591f Binary files /dev/null and b/youtube_recall/images/recommendation_system.png differ diff --git a/youtube_recall/infer.py b/youtube_recall/infer.py new file mode 100644 index 
index 0000000000000000000000000000000000000000..1bfde71d79afb28f75817220bba90cfde8bb6571
--- /dev/null
+++ b/youtube_recall/infer.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+
+
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--infer_set_path',
+        type=str,
+        required=True,
+        help="path of the infer set")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+
+
+def infer():
+    """
+    infer
+    """
+    args = parse_args()
+
+    # check argument
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    # build model
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+
+    reader = Reader(feature_dict)
+    test_batch = []
+    for idx, item in enumerate(reader.infer(args.infer_set_path)):
+        test_batch.append(item)
+        if len(test_batch) == args.batch_size:
+            infer_a_batch(inferer, test_batch, nid_to_word)
+            test_batch = []
+    if len(test_batch):
+        infer_a_batch(inferer, test_batch, nid_to_word)
+
+
+def infer_a_batch(inferer, test_batch, nid_to_word):
+    """
+    input a batch of data and infer
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
+        softmax_output = res[1]
+        sort_nid = res[1].argsort()
+        # print the top 30 recommended items
+        ret = ""
+        for j in range(1, 31):
+            item_id = sort_nid[-1 * j]
+            item_id_to_word = nid_to_word[item_id]
+            ret += "%s:%.6f," \
+                % (item_id_to_word, softmax_output[item_id])
+
+        print ret.rstrip(",")
+
+
+if __name__ == "__main__":
+    infer()
diff --git a/youtube_recall/infer_user.py b/youtube_recall/infer_user.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa789964f21d0a7cca840dc3f89ae55c017bf9af
--- /dev/null
+++ b/youtube_recall/infer_user.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+
+
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    return parser.parse_args()
+
+
+def infer_user():
+    """
+    infer_user
+    """
+    args = parse_args()
+
+    # check argument
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+
+    # load the trained model and zero out the context projections,
+    # so that the prediction depends on the user id only.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    parameters.set('_proj_province', \
+        np.zeros(shape=parameters.get('_proj_province').shape))
+    parameters.set('_proj_city', \
+        np.zeros(shape=parameters.get('_proj_city').shape))
+    parameters.set('_proj_phone', \
+        np.zeros(shape=parameters.get('_proj_phone').shape))
+    parameters.set('_proj_history_clicked_items', \
+        np.zeros(shape=parameters.get('_proj_history_clicked_items').shape))
+    parameters.set('_proj_history_clicked_categories', \
+        np.zeros(shape=parameters.get('_proj_history_clicked_categories').shape))
+    parameters.set('_proj_history_clicked_tags', \
+        np.zeros(shape=parameters.get('_proj_history_clicked_tags').shape))
+
+    # build model
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+
+    reader = Reader(feature_dict)
+    test_batch = []
+    for idx, item in enumerate(
+            reader.infer_user(['USER_ID_0', 'USER_ID_981', 'USER_ID_310806'])):
+        test_batch.append(item)
+    # infer once after the whole batch has been collected
+    infer_a_batch(inferer, test_batch, nid_to_word)
+
+
+def infer_a_batch(inferer, test_batch, nid_to_word):
+    """
+    input a batch of data and infer
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
+        softmax_output = res[1]
+        sort_nid = res[1].argsort()
+
+        # print the top 30 recommended items
+        ret = ""
+        for j in range(1, 31):
+            item_id = sort_nid[-1 * j]
+            item_id_to_word = nid_to_word[item_id]
+            ret += "%s:%.6f," \
+                % (item_id_to_word, softmax_output[item_id])
+        print ret.rstrip(",")
+
+
+if __name__ == "__main__":
+    infer_user()
diff --git a/youtube_recall/item_vector.py b/youtube_recall/item_vector.py
new file mode 100644
index 0000000000000000000000000000000000000000..7804b64f924818286e4c8ba72b0e8ba7e795aa32
--- /dev/null
+++ b/youtube_recall/item_vector.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+import math
+
+
+def parse_args():
+    """
+    parse arguments
+    :return:
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    return parser.parse_args()
+
+
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    get item vectors from softmax parameter
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    vector = []
+    total_items_num = nce_w.shape[0]
+    if total_items_num != nce_b.shape[1]:
+        return None
+    dim_vector = nce_w.shape[1] + 1
+    for i in range(0, total_items_num):
+        vector.append([])
+        vector[i].append(nce_b[0][i])
+        for j in range(1, dim_vector):
+            vector[i].append(nce_w[i][j - 1])
+    return vector
+
+
+def convt_simple_lsh(vector):
+    """
+    do simple lsh conversion
+    """
+    max_norm = 0
+    num_of_vec = len(vector)
+    for i in range(0, num_of_vec):
+        norm = np.linalg.norm(vector[i])
+        if norm > max_norm:
+            max_norm = norm
+    for i in range(0, num_of_vec):
+        vector[i].append(
+            math.sqrt(
+                math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
+    return vector
+
+
+def item_vector():
+    """
+    get item vectors
+    """
+    args = parse_args()
+
+    # check argument
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    nid_dict = feature_dict['history_clicked_items']
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+
+    nce_w = parameters.get("nce_w")
+    nce_b = parameters.get("nce_b")
+    item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
+    for i in range(0, len(item_vector)):
+        itemid = nid_to_word[i]
+        print itemid + "\t" + ",".join(map(str, item_vector[i]))
+
+
+if __name__ == "__main__":
+    item_vector()
diff --git a/youtube_recall/network_conf.py b/youtube_recall/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..2577467131ec8aa9bd234ad70333f6e8870900d7
--- /dev/null
+++ b/youtube_recall/network_conf.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import paddle.v2 as paddle
+import cPickle
+
+
+class DNNmodel(object):
+    """
+    Deep Neural Networks for YouTube candidate generation
+    """
+
+    def __init__(self,
+                 dnn_layer_dims=None,
+                 feature_dict=None,
+                 item_freq=None,
+                 is_infer=False):
+        """
+        initialize model
+        @dnn_layer_dims: dimension of each hidden layer
+        @feature_dict: dictionary of encoded feature
+        @item_freq: dictionary of feature values and its frequency
+        @is_infer: if infer mode
+        """
+        self._dnn_layer_dims = dnn_layer_dims
+        self._feature_dict = feature_dict
+        self._item_freq = item_freq
+
+        self._is_infer = is_infer
+
+        # build model
+        self._build_input_layer()
+        self._build_embedding_layer()
+        self.model_cost = self._build_dnn_model()
+
+    def _build_input_layer(self):
+        """
+        build input layer
+        """
+        self._history_clicked_items = paddle.layer.data(
+            name="history_clicked_items",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_items'])))
+        self._history_clicked_categories = paddle.layer.data(
+            name="history_clicked_categories",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_categories'])))
+        self._history_clicked_tags = paddle.layer.data(
+            name="history_clicked_tags",
+            type=paddle.data_type.integer_value_sequence(
+                len(self._feature_dict['history_clicked_tags'])))
+        self._user_id = paddle.layer.data(
+            name="user_id",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['user_id'])))
+        self._province = paddle.layer.data(
+            name="province",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['province'])))
+        self._city = paddle.layer.data(
+            name="city",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['city'])))
+        self._phone = paddle.layer.data(
+            name="phone",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['phone'])))
+        self._target_item = paddle.layer.data(
+            name="target_item",
+            type=paddle.data_type.integer_value(
+                len(self._feature_dict['history_clicked_items'])))
+
+    def _create_emb_attr(self, name):
+        """
+        create embedding parameter
+        """
+        return paddle.attr.Param(
+            name=name,
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=False)
+
+    def _build_embedding_layer(self):
+        """
+        build embedding layer
+        """
+        self._user_id_emb = paddle.layer.embedding(
+            input=self._user_id,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_user_id'))
+        self._province_emb = paddle.layer.embedding(
+            input=self._province,
+            size=8,
+            param_attr=self._create_emb_attr('_proj_province'))
+        self._city_emb = paddle.layer.embedding(
+            input=self._city,
+            size=16,
+            param_attr=self._create_emb_attr('_proj_city'))
+        self._phone_emb = paddle.layer.embedding(
+            input=self._phone,
+            size=16,
+            param_attr=self._create_emb_attr('_proj_phone'))
+        self._history_clicked_items_emb = paddle.layer.embedding(
+            input=self._history_clicked_items,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_history_clicked_items'))
+        self._history_clicked_categories_emb = paddle.layer.embedding(
+            input=self._history_clicked_categories,
+            size=8,
+            param_attr=self._create_emb_attr(
+                '_proj_history_clicked_categories'))
+        self._history_clicked_tags_emb = paddle.layer.embedding(
+            input=self._history_clicked_tags,
+            size=64,
+            param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
+
+    def _build_dnn_model(self):
+        """
+        build dnn model
+        """
+        self._rnn_cell = paddle.networks.simple_lstm(
+            input=self._history_clicked_items_emb, size=64)
+        self._lstm_last = paddle.layer.pooling(
+            input=self._rnn_cell, pooling_type=paddle.pooling.Max())
+        self._avg_emb_cats = paddle.layer.pooling(
+            input=self._history_clicked_categories_emb,
+            pooling_type=paddle.pooling.Avg())
+        self._avg_emb_tags = paddle.layer.pooling(
+            input=self._history_clicked_tags_emb,
+            pooling_type=paddle.pooling.Avg())
+        self._fc_0 = paddle.layer.fc(
+            name="Relu1",
+            input=[
+                self._lstm_last, self._user_id_emb, self._province_emb,
+                self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
+                self._phone_emb
+            ],
+            size=self._dnn_layer_dims[0],
+            act=paddle.activation.Relu())
+
+        self._fc_1 = paddle.layer.fc(name="Relu2",
+                                     input=self._fc_0,
+                                     size=self._dnn_layer_dims[1],
+                                     act=paddle.activation.Relu())
+
+        if not self._is_infer:
+            return paddle.layer.nce(
+                input=self._fc_1,
+                label=self._target_item,
+                num_classes=len(self._feature_dict['history_clicked_items']),
+                param_attr=paddle.attr.Param(name="nce_w"),
+                bias_attr=paddle.attr.Param(name="nce_b"),
+                num_neg_samples=5,
+                neg_distribution=self._item_freq)
+        else:
+            self.prediction_layer = paddle.layer.mixed(
+                size=len(self._feature_dict['history_clicked_items']),
+                input=paddle.layer.trans_full_matrix_projection(
+                    self._fc_1, param_attr=paddle.attr.Param(name="nce_w")),
+                act=paddle.activation.Softmax(),
+                bias_attr=paddle.attr.Param(name="nce_b"))
+            return self.prediction_layer, self._fc_1
+
+
+if __name__ == "__main__":
+    # this is to test and debug the network topology definition.
+    # please set the hyper-parameters as needed.
+    item_freq_path = "./output/item_freq.pkl"
+    with open(item_freq_path) as f:
+        item_freq = cPickle.load(f)
+
+    feature_dict_path = "./output/feature_dict.pkl"
+    with open(feature_dict_path) as f:
+        feature_dict = cPickle.load(f)
+
+    model = DNNmodel(
+        dnn_layer_dims=[256, 31],
+        feature_dict=feature_dict,
+        item_freq=item_freq,
+        is_infer=False)
diff --git a/youtube_recall/reader.py b/youtube_recall/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64e3fe7573f00c1aeb44c81368ee8afd224e56d
--- /dev/null
+++ b/youtube_recall/reader.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+
+from utils import logger
+from utils import TaskMode
+
+
+class Reader(object):
+    """
+    Reader that converts the click-log samples into model inputs
+    """
+
+    def __init__(self, feature_dict=None, window_size=20):
+        """
+        init
+        @feature_dict: dictionary of encoded features
+        @window_size: size of the sliding window over the click sequence
+        """
+        self._feature_dict = feature_dict
+        self._window_size = window_size
+
+    def train(self, path):
+        """
+        load train set
+        @path: train set path
+        """
+        logger.info("start train reader from %s" % path)
+        mode = TaskMode.create_train()
+        return self._reader(path, mode)
+
+    def test(self, path):
+        """
+        load test set
+        @path: test set path
+        """
+        logger.info("start test reader from %s" % path)
+        mode = TaskMode.create_test()
+        return self._reader(path, mode)
+
+    def infer(self, path):
+        """
+        load infer set
+        @path: infer set path
+        """
+        logger.info("start infer reader from %s" % path)
+        mode = TaskMode.create_infer()
+        return self._reader(path, mode)
+
+    def infer_user(self, user_list):
+        """
+        load user set to infer
+        @user_list: user list
+        """
+        return self._reader_user(user_list)
+
+    def _reader(self, path, mode):
+        """
+        parse data set
+        """
+        USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
+        PROVINCE_UNK = self._feature_dict['province'].get('<unk>')
+        CITY_UNK = self._feature_dict['city'].get('<unk>')
+        ITEM_UNK = self._feature_dict['history_clicked_items'].get('<unk>')
+        CATEGORY_UNK = self._feature_dict['history_clicked_categories'].get(
+            '<unk>')
+        TAG_UNK = self._feature_dict['history_clicked_tags'].get('<unk>')
+        PHONE_UNK = self._feature_dict['phone'].get('<unk>')
+        with open(path) as f:
+            for line in f:
+                fields = line.strip('\n').split('\t')
+                user_id = self._feature_dict['user_id'].get(fields[0],
+                                                            USER_ID_UNK)
+                province = self._feature_dict['province'].get(fields[1],
+                                                              PROVINCE_UNK)
+                city = self._feature_dict['city'].get(fields[2], CITY_UNK)
+                item_infos = fields[3]
+                phone = self._feature_dict['phone'].get(fields[4], PHONE_UNK)
+                history_clicked_items_all = []
+                history_clicked_tags_all = []
+                history_clicked_categories_all = []
+                for item_info in item_infos.split(';'):
+                    item_info_array = item_info.split(':')
+                    item = item_info_array[0]
+                    item_encoded_id = self._feature_dict['history_clicked_items'].get(\
+                        item, ITEM_UNK)
+                    # skip items that were filtered out of the training
+                    # vocabulary; categories and tags are only appended for
+                    # kept items so that the three sequences stay aligned
+                    if item_encoded_id != ITEM_UNK:
+                        history_clicked_items_all.append(item_encoded_id)
+                        category = item_info_array[1]
+                        history_clicked_categories_all.append(
+                            self._feature_dict['history_clicked_categories'].get(\
+                                category, CATEGORY_UNK))
+                        tags = item_info_array[2]
+                        tag_split = map(str, [self._feature_dict['history_clicked_tags'].get(\
+                            tag, TAG_UNK) \
+                            for tag in tags.strip().split("_")])
+                        history_clicked_tags_all.append("_".join(tag_split))
+
+                if not mode.is_infer():
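+                    # train/test mode: turn one click sequence into multiple
+                    # (history window, next click) samples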
+                    # prepend a padding click (encoded id 0) so that the
+                    # first real click can also serve as a prediction target
+                    history_clicked_items_all.insert(0, 0)
+                    history_clicked_tags_all.insert(0, "0")
+                    history_clicked_categories_all.insert(0, 0)
+
+                    for i in range(1, len(history_clicked_items_all)):
+                        start = max(0, i - self._window_size)
+                        history_clicked_items = history_clicked_items_all[start:
+                                                                          i]
+                        history_clicked_categories = history_clicked_categories_all[
+                            start:i]
+                        history_clicked_tags_str = history_clicked_tags_all[
+                            start:i]
+                        history_clicked_tags = []
+                        for tags_a in history_clicked_tags_str:
+                            for tag in tags_a.split("_"):
+                                history_clicked_tags.append(int(tag))
+                        target_item = history_clicked_items_all[i]
+                        yield user_id, province, city, \
+                              history_clicked_items, history_clicked_categories, \
+                              history_clicked_tags, phone, target_item
+                else:
+                    # infer mode: yield the whole click history, no target
+                    history_clicked_items = history_clicked_items_all
+                    history_clicked_categories = history_clicked_categories_all
+                    history_clicked_tags_str = history_clicked_tags_all
+                    history_clicked_tags = []
+                    for tags_a in history_clicked_tags_str:
+                        for tag in tags_a.split("_"):
+                            history_clicked_tags.append(int(tag))
+                    yield user_id, province, city, \
+                          history_clicked_items, history_clicked_categories, \
+                          history_clicked_tags, phone
+
+    def _reader_user(self, user_list):
+        """
+        parse user list
+        """
+        USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
+        for user in user_list:
+            user_id = self._feature_dict['user_id'].get(user, USER_ID_UNK)
+            yield user_id, 0, 0, [0], [0], [0], 0
+
+
+if __name__ == "__main__":
+    # this is to test and debug the reader.
+    # usage: python reader.py <train_set_path> <feature_dict.pkl> <window_size>
+    train_data = sys.argv[1]
+    feature_dict = sys.argv[2]
+    window_size = int(sys.argv[3])
+
+    import cPickle
+    with open(feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    r = Reader(feature_dict, window_size)
+
+    for dat in r.train(train_data):
+        print dat
diff --git a/youtube_recall/train.py b/youtube_recall/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b725bd467773abc24f6fa960a83b6c23c3ea6bf5
--- /dev/null
+++ b/youtube_recall/train.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+
+
+def parse_args():
+    """
+    parse arguments
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--train_set_path',
+        type=str,
+        required=True,
+        help="path of the train set")
+    parser.add_argument(
+        '--test_set_path', type=str, required=True, help="path of the test set")
+    parser.add_argument(
+        '--model_output_dir',
+        type=str,
+        required=True,
+        help="directory to output")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--item_freq', type=str, required=True, help="path of item_freq.pkl")
+    parser.add_argument(
+        '--window_size', type=int, default=20, help="window size (default: 20)")
+    parser.add_argument(
+        '--num_passes', type=int, default=1, help="number of passes to train")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+
+
+def train():
+    """
+    train
+    """
+    args = parse_args()
+
+    # check arguments
+    assert os.path.exists(
+        args.train_set_path), 'The train_set_path path does not exist.'
+    assert os.path.exists(
+        args.test_set_path), 'The test_set_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    assert os.path.exists(args.item_freq), 'The item_freq path does not exist.'
+    assert os.path.exists(
+        args.model_output_dir), 'The model_output_dir path does not exist.'
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    with open(args.item_freq) as f:
+        item_freq = cPickle.load(f)
+
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6,
+        'target_item': 7
+    }
+    optimizer = paddle.optimizer.AdaGrad(
+        learning_rate=1e-1,
+        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
+
+    cost = DNNmodel(
+        dnn_layer_dims=[256, 31],
+        feature_dict=feature_dict,
+        item_freq=item_freq,
+        is_infer=False).model_cost
+    parameters = paddle.parameters.create(cost)
+
+    trainer = paddle.trainer.SGD(cost, parameters, optimizer)
+
+    def event_handler(event):
+        """
+        event handler
+        """
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id and not event.batch_id % 10:
+                logger.info("Pass %d, Batch %d, Cost %f" %
+                            (event.pass_id, event.batch_id, event.cost))
+        elif isinstance(event, paddle.event.EndPass):
+            # evaluate on the held-out test set at the end of every pass
+            result = trainer.test(
+                reader=paddle.batch(
+                    lambda: reader.test(args.test_set_path), args.batch_size),
+                feeding=feeding)
+            logger.info("Pass %d, Test Cost %f" % (event.pass_id, result.cost))
+            save_path = os.path.join(args.model_output_dir,
+                                     "model_pass_%05d.tar.gz" % event.pass_id)
+            logger.info("Save model into %s ..." % save_path)
+            with gzip.open(save_path, "w") as f:
+                trainer.save_parameter_to_tar(f)
+
+    reader = Reader(feature_dict, args.window_size)
+    trainer.train(
+        paddle.batch(
+            paddle.reader.shuffle(
+                lambda: reader.train(args.train_set_path), buf_size=7000),
+            args.batch_size),
+        num_passes=args.num_passes,
+        feeding=feeding,
+        event_handler=event_handler)
+
+
+if __name__ == "__main__":
+    train()
diff --git a/youtube_recall/user_vector.py b/youtube_recall/user_vector.py
new file mode 100644
index 0000000000000000000000000000000000000000..270fcd70c31a58baf7b1ab1640740117223f788d
--- /dev/null
+++ b/youtube_recall/user_vector.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+
+
+def parse_args():
+    """
+    parse arguments
+    """
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    parser.add_argument(
+        '--infer_set_path',
+        type=str,
+        required=True,
+        help="path of the infer set")
+    parser.add_argument(
+        '--model_path', type=str, required=True, help="path of the model")
+    parser.add_argument(
+        '--feature_dict',
+        type=str,
+        required=True,
+        help="path of feature_dict.pkl")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return parser.parse_args()
+
+
+def user_vector():
+    """
+    get user vectors
+    """
+    args = parse_args()
+
+    # check arguments
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+
+    # load the trained model.
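+    # (the tar.gz written by train.py; among other parameters it contains
+    # "nce_w"/"nce_b", which the inference network reuses as its softmax)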
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    # build the infer-mode model; in this mode model_cost holds the tuple
+    # (softmax prediction layer, last hidden layer)
+    prediction_layer, fc = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict,
+        is_infer=True).model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+
+    reader = Reader(feature_dict)
+    test_batch = []
+    for item in reader.infer(args.infer_set_path):
+        test_batch.append(item)
+        if len(test_batch) == args.batch_size:
+            get_a_batch_user_vector(inferer, test_batch)
+            test_batch = []
+    if len(test_batch):
+        get_a_batch_user_vector(inferer, test_batch)
+
+
+def get_a_batch_user_vector(inferer, test_batch):
+    """
+    feed a batch of samples and print the (simple-LSH converted) user vectors
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    # probs[0] holds the softmax outputs, probs[1] the last hidden layer,
+    # i.e. the user vectors
+    for res in probs[1]:
+        # simple-LSH conversion: prepend 1.0 (to pick up the item bias kept
+        # in the item vectors) and append 0.0 (to zero out the padding
+        # dimension appended by convt_simple_lsh), then L2-normalize
+        user_vector = [1.000]
+        for val in res:
+            user_vector.append(val)
+        user_vector.append(0.000)
+        norm = np.linalg.norm(user_vector)
+        user_vector_norm = [str(val / norm) for val in user_vector]
+        print ",".join(user_vector_norm)
+
+
+if __name__ == "__main__":
+    user_vector()
diff --git a/youtube_recall/utils.py b/youtube_recall/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdfcd70183229c8a4702684c4f24cb4783223e33
--- /dev/null
+++ b/youtube_recall/utils.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+
+logging.basicConfig()
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+class TaskMode(object):
+    """
+    TaskMode distinguishes the train, test and infer modes of the reader.
+    """
+    TRAIN_MODE = 0
+    TEST_MODE = 1
+    INFER_MODE = 2
+
+    def __init__(self, mode):
+        """
+        :param mode: one of TRAIN_MODE, TEST_MODE and INFER_MODE
+        """
+        self.mode = mode
+
+    def is_train(self):
+        """
+        :return: True if in train mode
+        """
+        return self.mode == self.TRAIN_MODE
+
+    def is_test(self):
+        """
+        :return: True if in test mode
+        """
+        return self.mode == self.TEST_MODE
+
+    def is_infer(self):
+        """
+        :return: True if in infer mode
+        """
+        return self.mode == self.INFER_MODE
+
+    @staticmethod
+    def create_train():
+        """
+        :return: a TaskMode instance in train mode
+        """
+        return TaskMode(TaskMode.TRAIN_MODE)
+
+    @staticmethod
+    def create_test():
+        """
+        :return: a TaskMode instance in test mode
+        """
+        return TaskMode(TaskMode.TEST_MODE)
+
+    @staticmethod
+    def create_infer():
+        """
+        :return: a TaskMode instance in infer mode
+        """
+        return TaskMode(TaskMode.INFER_MODE)
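+
+
+if __name__ == "__main__":
+    # this is to test and debug TaskMode: a minimal smoke test added for
+    # illustration, following the __main__ convention of the other modules
+    for name, mode in [("train", TaskMode.create_train()),
+                       ("test", TaskMode.create_test()),
+                       ("infer", TaskMode.create_infer())]:
+        logger.info("%s: is_train=%s, is_test=%s, is_infer=%s" %
+                    (name, mode.is_train(), mode.is_test(), mode.is_infer()))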