diff --git a/youtube_recall/README.cn.md b/youtube_recall/README.cn.md index 9aae47bcc9275fa9bd2f973c83b7f459f93e755b..0a2eb35f87e38a7970f4c674da4ca2eadb2e56dd 100644 --- a/youtube_recall/README.cn.md +++ b/youtube_recall/README.cn.md @@ -137,7 +137,7 @@ def _create_emb_attr(self, name): create embedding parameter """ return paddle.attr.Param( - name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=True) + name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False) def _build_embedding_layer(self): """ diff --git a/youtube_recall/README.md b/youtube_recall/README.md index 1c669e757353d8c6a08e2ac54cca2e38956db5a4..0167c3ec49aa18239d44558e946e8fa0417943f8 100644 --- a/youtube_recall/README.md +++ b/youtube_recall/README.md @@ -35,7 +35,7 @@ Figure 2. Candidate generation model architecture - Output layer: A softmax classifier is connected to do discriminating millions of classes (videos). To speed up training process, a technique is applied that samples negative classes from background distribution with importance weighting. The previous mentioned high-dimensional "embedding" of the candidate video ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) is obtained by weight and bias of the softmax layer. At serving time, the most likely N classes (videos) is computed for presenting to the user. To Score millions of items under a strict serving laterncy, the scoring problem reduces to a nearest neighbor search in the dot product space, and Locality Sensitive Hashing is relied on. 
## Data Pre-processing -In this example, we moked click log of users as sample data, and its format is as follows: +In this example, the click log of users is mocked as sample data, and its format is as follows: ``` user-id \t province \t city \t history-clicked-video-info-sequence \t phone @@ -125,7 +125,7 @@ def _create_emb_attr(self, name): create embedding parameter """ return paddle.attr.Param( - name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=True) + name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False) def _build_embedding_layer(self): """ @@ -160,7 +160,7 @@ def _build_embedding_layer(self): ``` ### Hiddern layer -We improves the original networks in \[[1](#References)\] by modifying that the embeddings of video watches are not simply averaged but are connected to a LSTM layer with max temporal pooling instead, so that the deep sequential information related to user interests can be learned well. Considering data scale and efficiency of training, we only apply two ReLU layers, which also leads to good performance. +The original network in \[[1](#References)\] is improved: the embeddings of video watches are not simply averaged but are fed into an LSTM layer with max temporal pooling instead, so that the deep sequential information related to user interests can be learned well. Considering data scale and efficiency of training, only two ReLU layers are applied, which also leads to good performance. ```python self._rnn_cell = paddle.networks.simple_lstm(input=self._history_clicked_items_emb, size=64) @@ -257,18 +257,18 @@ python infer.py --infer_set_path='./data/infer.txt' \ ``` ## Online prediction -For online prediction,we adopt Approximate Nearest Neighbor(ANN) to directly recall top N most likely watch video. However, our ANN system currently only supports cosin sorting, not by inner product sorting, which leads to big effect difference.
+For online prediction, Approximate Nearest Neighbor (ANN) search is adopted to directly recall the top N most likely watched videos. However, our ANN system currently only supports cosine sorting, not inner product sorting, which leads to a big difference in results. -As a result, we sliently modify user and video vectors by a SIMPLE-LSH conversion\[[4](#References)\], so that inner sorting is equivalent to cosin sorting after conversion. +To solve it, user and video vectors are slightly modified by a SIMPLE-LSH conversion\[[4](#References)\], so that inner product sorting is equivalent to cosine sorting after conversion. Details are as follows: -- For video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), we have ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m). The modified video vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D). +- For video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), assume ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m). The modified video vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D).
- For user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), and the modified user vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D), where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D) is normalized ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D). -When online predicting, for a coming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), we need recall ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner product sorting. After ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) conversion, the order of inner prodct sorting is unchanged. Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which makes cosin-supported-only ANN system works. +During online prediction, for an incoming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), the system should recall ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner product sorting. After ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) conversion, the order of inner product sorting is unchanged.
Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which makes the cosine-only ANN system work. -And in order to retain precision, we find that ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) is also equivalent. +And in order to retain precision, using ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) is also equivalent. Use `user_vector.py` and `vector.py` to calculate user and item vectors.
For example, run the following commands: ```shell diff --git a/youtube_recall/infer.py b/youtube_recall/infer.py index 6c3e096bcf2a76ac464acfb116e5c0e6507ad956..4e0aa730f4ef26e8a0f530fd658e2af83398d868 100644 --- a/youtube_recall/infer.py +++ b/youtube_recall/infer.py @@ -103,13 +103,14 @@ def infer_a_batch(inferer, test_batch, nid_to_word): for i, res in enumerate(zip(test_batch, probs[0], probs[1])): softmax_output = res[1] sort_nid = res[1].argsort() - # print top 30 recommended item + ret = "" for j in range(1, 30): item_id = sort_nid[-1 * j] item_id_to_word = nid_to_word[item_id] - print "%s\t%.6f" \ + ret += "%s:%.6f," \ % (item_id_to_word, softmax_output[item_id]) + print ret.rstrip(",") if __name__ == "__main__": diff --git a/youtube_recall/infer_user.py b/youtube_recall/infer_user.py new file mode 100644 index 0000000000000000000000000000000000000000..40a49350f6579fa41e4dd3e1a0ff5915ba767a7c --- /dev/null +++ b/youtube_recall/infer_user.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import gzip +import paddle.v2 as paddle +import argparse +import cPickle + +from reader import Reader +from network_conf import DNNmodel +from utils import logger +import numpy as np + + +def parse_args(): + """ + parse command-line arguments (--model_path, --feature_dict; both required) + :return: result of parser.parse_args() + """ + parser = argparse.ArgumentParser( + description="PaddlePaddle Youtube Recall Model Example") + parser.add_argument( + '--model_path', type=str, required=True, help="path of the model") + parser.add_argument( + '--feature_dict', + type=str, + required=True, + help="path of feature_dict.pkl") + return parser.parse_args() + + +def infer_user(): + """ + entry point: parse and check the arguments, then run user inference + """ + args = parse_args() + + # check argument + assert os.path.exists( + args.model_path), 'The model_path path does not exist.' + assert os.path.exists( + args.feature_dict), 'The feature_dict path does not exist.'
+ + paddle.init(use_gpu=False, trainer_count=1) + + with open(args.feature_dict, 'rb') as f: # pickle data must be read in binary mode + feature_dict = cPickle.load(f) + + nid_dict = feature_dict['history_clicked_items'] + nid_to_word = dict((v, k) for k, v in nid_dict.items()) + + # load the trained model, then zero every non-user-id projection parameter. + with gzip.open(args.model_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + parameters.set( + '_proj_province', + np.zeros(shape=parameters.get('_proj_province').shape)) + parameters.set( + '_proj_city', np.zeros(shape=parameters.get('_proj_city').shape)) + parameters.set( + '_proj_phone', np.zeros(shape=parameters.get('_proj_phone').shape)) + parameters.set('_proj_history_clicked_items', + np.zeros(shape=parameters.get('_proj_history_clicked_items').shape)) + parameters.set('_proj_history_clicked_categories', + np.zeros(shape=parameters.get('_proj_history_clicked_categories').shape)) + parameters.set('_proj_history_clicked_tags', + np.zeros(shape=parameters.get('_proj_history_clicked_tags').shape)) + + # build model + prediction_layer, fc = DNNmodel( + dnn_layer_dims=[256, 31], feature_dict=feature_dict, + is_infer=True).model_cost + inferer = paddle.inference.Inference( + output_layer=[prediction_layer, fc], parameters=parameters) + + reader = Reader(feature_dict) + test_batch = [] + for item in reader.infer_user( + ['USER_ID_0', 'USER_ID_981', 'USER_ID_310806']): + test_batch.append(item) + infer_a_batch(inferer, test_batch, nid_to_word) + + +def infer_a_batch(inferer, test_batch, nid_to_word): + """ + input a batch of data and infer + """ + feeding = { + 'user_id': 0, + 'province': 1, + 'city': 2, + 'history_clicked_items': 3, + 'history_clicked_categories': 4, + 'history_clicked_tags': 5, + 'phone': 6 + } + probs = inferer.infer( + input=test_batch, + feeding=feeding, + field=["value"], + flatten_result=False) + for i, res in enumerate(zip(test_batch, probs[0], probs[1])): + softmax_output = res[1] + sort_nid = res[1].argsort() + + # print top 30 recommended
item + ret = "" + for j in range(1, 31): # j = 1..30 walks the 30 highest-scoring ids of the ascending argsort + item_id = sort_nid[-1 * j] + item_id_to_word = nid_to_word[item_id] + ret += "%s:%.6f," \ + % (item_id_to_word, softmax_output[item_id]) + print ret.rstrip(",") + + +if __name__ == "__main__": + infer_user() diff --git a/youtube_recall/network_conf.py b/youtube_recall/network_conf.py index 8d398992a275f2bd5911f781b55ab22a19ff36a5..8a2f9541533bca576128a14c2ccf9ebdfa5a7375 100644 --- a/youtube_recall/network_conf.py +++ b/youtube_recall/network_conf.py @@ -74,7 +74,7 @@ class DNNmodel(object): initial_std=0.001, learning_rate=1, l2_rate=0, - sparse_update=True) + sparse_update=False) def _build_embedding_layer(self): """ diff --git a/youtube_recall/reader.py b/youtube_recall/reader.py index 8c9727ca17525291e7c316678f1734052362181a..e64e3fe7573f00c1aeb44c81368ee8afd224e56d 100644 --- a/youtube_recall/reader.py +++ b/youtube_recall/reader.py @@ -46,6 +46,13 @@ class Reader(object): mode = TaskMode.create_infer() return self._reader(path, mode) + def infer_user(self, user_list): + """ + load user set to infer + @user_list: user list + """ + return self._reader_user(user_list) + def _reader(self, path, mode): """ parse data set @@ -121,6 +128,15 @@ class Reader(object): history_clicked_items, history_clicked_categories, \ history_clicked_tags, phone + def _reader_user(self, user_list): + """ + parse user list + """ + USER_ID_UNK = self._feature_dict['user_id'].get('') + for user in user_list: + user_id = self._feature_dict['user_id'].get(user, USER_ID_UNK) + yield user_id, 0, 0, [0], [0], [0], 0 + if __name__ == "__main__": # this is to test and debug reader function diff --git a/youtube_recall/user_vector.py b/youtube_recall/user_vector.py index b2d04adead70985f533c21dfb534794a0b065718..270fcd70c31a58baf7b1ab1640740117223f788d 100644 --- a/youtube_recall/user_vector.py +++ b/youtube_recall/user_vector.py @@ -105,7 +105,7 @@ def get_a_batch_user_vector(inferer, test_batch): user_vector.append(0.000) norm =
np.linalg.norm(user_vector) user_vector_norm = [str(_ / norm) for _ in user_vector] - print ", ".join(user_vector_norm) + print ",".join(user_vector_norm) if __name__ == "__main__":