From 03c5139aa2f4bd3a5a377b0e58c6ba40cec47a64 Mon Sep 17 00:00:00 2001 From: Bella-Zhao Date: Thu, 25 Jan 2018 14:52:03 +0800 Subject: [PATCH] infer user --- youtube_recall/README.cn.md | 2 +- youtube_recall/README.md | 16 ++--- youtube_recall/infer.py | 5 +- youtube_recall/infer_user.py | 117 +++++++++++++++++++++++++++++++++ youtube_recall/network_conf.py | 2 +- youtube_recall/reader.py | 16 +++++ youtube_recall/user_vector.py | 2 +- 7 files changed, 147 insertions(+), 13 deletions(-) create mode 100644 youtube_recall/infer_user.py diff --git a/youtube_recall/README.cn.md b/youtube_recall/README.cn.md index 9aae47bc..0a2eb35f 100644 --- a/youtube_recall/README.cn.md +++ b/youtube_recall/README.cn.md @@ -137,7 +137,7 @@ def _create_emb_attr(self, name): create embedding parameter """ return paddle.attr.Param( - name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=True) + name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False) def _build_embedding_layer(self): """ diff --git a/youtube_recall/README.md b/youtube_recall/README.md index 1c669e75..0167c3ec 100644 --- a/youtube_recall/README.md +++ b/youtube_recall/README.md @@ -35,7 +35,7 @@ Figure 2. Candidate generation model architecture - Output layer: A softmax classifier is connected to do discriminating millions of classes (videos). To speed up training process, a technique is applied that samples negative classes from background distribution with importance weighting. The previous mentioned high-dimensional "embedding" of the candidate video ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) is obtained by weight and bias of the softmax layer. At serving time, the most likely N classes (videos) is computed for presenting to the user. To Score millions of items under a strict serving laterncy, the scoring problem reduces to a nearest neighbor search in the dot product space, and Locality Sensitive Hashing is relied on. 
## Data Pre-processing -In this example, we moked click log of users as sample data, and its format is as follows: +In this example, a mocked click log of users is used as sample data, and its format is as follows: ``` user-id \t province \t city \t history-clicked-video-info-sequence \t phone @@ -125,7 +125,7 @@ def _create_emb_attr(self, name): create embedding parameter """ return paddle.attr.Param( - name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=True) + name=name, initial_std=0.001, learning_rate=1, l2_rate=0, sparse_update=False) def _build_embedding_layer(self): """ @@ -160,7 +160,7 @@ def _build_embedding_layer(self): ``` ### Hiddern layer -We improves the original networks in \[[1](#References)\] by modifying that the embeddings of video watches are not simply averaged but are connected to a LSTM layer with max temporal pooling instead, so that the deep sequential information related to user interests can be learned well. Considering data scale and efficiency of training, we only apply two ReLU layers, which also leads to good performance. +The original network in \[[1](#References)\] is improved so that the embeddings of video watches are not simply averaged but are instead connected to an LSTM layer with max temporal pooling, so that the deep sequential information related to user interests can be learned well. Considering data scale and training efficiency, only two ReLU layers are applied, which still leads to good performance. ```python self._rnn_cell = paddle.networks.simple_lstm(input=self._history_clicked_items_emb, size=64) @@ -257,18 +257,18 @@ python infer.py --infer_set_path='./data/infer.txt' \ ``` ## Online prediction -For online prediction,we adopt Approximate Nearest Neighbor(ANN) to directly recall top N most likely watch video. However, our ANN system currently only supports cosin sorting, not by inner product sorting, which leads to big effect difference. 
+For online prediction, Approximate Nearest Neighbor (ANN) search is adopted to directly recall the top N most likely watched videos. However, our ANN system currently only supports cosine sorting, not inner product sorting, which leads to a large difference in results. -As a result, we sliently modify user and video vectors by a SIMPLE-LSH conversion\[[4](#References)\], so that inner sorting is equivalent to cosin sorting after conversion. +To solve this, user and video vectors are slightly modified by a SIMPLE-LSH conversion\[[4](#References)\], so that inner product sorting is equivalent to cosine sorting after conversion. Details are as follows: -- For video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), we have ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m). The modified video vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D). +- For video vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), where ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Cmathbf%7Bv%7D%20%5Cright%20%5C%7C%5Cleqslant%20m), the modified video vector is ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), defined as ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%3D%20%5B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%3B%20%5Csqrt%7B1%20-%5Cleft%20%5C%7C%20%5Cmathbf%7B%5Cfrac%7B%5Cmathbf%7Bv%7D%7D%7Bm%7D%7B%7D%7D%20%5Cright%20%5C%7C%5E2%7D%5D). 
- For user vector ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Cin%20%5Cmathbb%7BR%7D%5EN), and the modified user vector ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Cin%20%5Cmathbb%7BR%7D%5E%7BN%2B1%7D), and let ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%3D%20%5B%5Cmathbf%7Bu%7D_%7Bnorm%7D%3B%200%5D), where ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D_%7Bnorm%7D) is normalized ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D). -When online predicting, for a coming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), we need recall ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) by inner product sorting. After ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) conversion, the order of inner prodct sorting is unchanged. Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which makes cosin-supported-only ANN system works. +When predicting online, for an incoming ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D), ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bv%7D) should be recalled by inner product sorting. After the ![](https://www.zhihu.com/equation?tex=%5Cmathbf%7Bu%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%2C%20%5Cmathbf%7Bv%7D%5Crightarrow%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D) conversion, the order of inner product sorting is unchanged. 
Since ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%5Cright%20%5C%7C) and ![](https://www.zhihu.com/equation?tex=%5Cleft%20%5C%7C%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%20%5Cright%20%5C%7C) are both equal to 1, ![](https://www.zhihu.com/equation?tex=cos(%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%20%2C%5Ctilde%7B%5Cmathbf%7Bv%7D%7D)%20%3D%20%5Ctilde%7B%5Cmathbf%7Bu%7D%7D%5Ccdot%20%5Ctilde%7B%5Cmathbf%7Bv%7D%7D), which makes the cosine-only ANN system work. -And in order to retain precision, we find that ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) is also equivalent. +And in order to retain precision, using ![](https://www.zhihu.com/equation?tex=%5Ctilde%7B%5Cmathbf%7Bv%7D%7D%3D%5B%5Cmathbf%7Bv%7D%3B%5Csqrt%7Bm%5E2-%5Cleft%5C%7C%20%5Cmathbf%7B%5Cmathbf%7Bv%7D%7D%5Cright%5C%7C%5E2%7D%5D) is also equivalent. Use `user_vector.py` and `vector.py` to calculate user and item vectors. 
For example, run the following commands: ```shell diff --git a/youtube_recall/infer.py b/youtube_recall/infer.py index 6c3e096b..4e0aa730 100644 --- a/youtube_recall/infer.py +++ b/youtube_recall/infer.py @@ -103,13 +103,14 @@ def infer_a_batch(inferer, test_batch, nid_to_word): for i, res in enumerate(zip(test_batch, probs[0], probs[1])): softmax_output = res[1] sort_nid = res[1].argsort() - # print top 30 recommended item + ret = "" for j in range(1, 30): item_id = sort_nid[-1 * j] item_id_to_word = nid_to_word[item_id] - print "%s\t%.6f" \ + ret += "%s:%.6f," \ % (item_id_to_word, softmax_output[item_id]) + print ret.rstrip(",") if __name__ == "__main__": diff --git a/youtube_recall/infer_user.py b/youtube_recall/infer_user.py new file mode 100644 index 00000000..40a49350 --- /dev/null +++ b/youtube_recall/infer_user.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import gzip +import paddle.v2 as paddle +import argparse +import cPickle + +from reader import Reader +from network_conf import DNNmodel +from utils import logger +import numpy as np + + +def parse_args(): + """ + parse arguments + :return: + """ + parser = argparse.ArgumentParser( + description="PaddlePaddle Youtube Recall Model Example") + parser.add_argument( + '--model_path', type=str, required=True, help="path of the model") + parser.add_argument( + '--feature_dict', + type=str, + required=True, + help="path of feature_dict.pkl") + return parser.parse_args() + + +def infer_user(): + """ + infer_user + """ + args = parse_args() + + # check argument + assert os.path.exists( + args.model_path), 'The model_path path does not exist.' + assert os.path.exists( + args.feature_dict), 'The feature_dict path does not exist.' 
+ + paddle.init(use_gpu=False, trainer_count=1) + + with open(args.feature_dict) as f: + feature_dict = cPickle.load(f) + + nid_dict = feature_dict['history_clicked_items'] + nid_to_word = dict((v, k) for k, v in nid_dict.items()) + + # load the trained model. + with gzip.open(args.model_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + parameters.set( + '_proj_province', + np.zeros(shape=parameters.get('_proj_province').shape)) + parameters.set( + '_proj_city', np.zeros(shape=parameters.get('_proj_city').shape)) + parameters.set( + '_proj_phone', np.zeros(shape=parameters.get('_proj_phone').shape)) + parameters.set('_proj_history_clicked_items', \ + np.zeros(shape= parameters.get('_proj_history_clicked_items').shape)) + parameters.set('_proj_history_clicked_categories', \ + np.zeros(shape= parameters.get('_proj_history_clicked_categories').shape)) + parameters.set('_proj_history_clicked_tags', \ + np.zeros(shape= parameters.get('_proj_history_clicked_tags').shape)) + + # build model + prediction_layer, fc = DNNmodel( + dnn_layer_dims=[256, 31], feature_dict=feature_dict, + is_infer=True).model_cost + inferer = paddle.inference.Inference( + output_layer=[prediction_layer, fc], parameters=parameters) + + reader = Reader(feature_dict) + test_batch = [] + for idx, item in enumerate( + reader.infer_user(['USER_ID_0', 'USER_ID_981', 'USER_ID_310806'])): + test_batch.append(item) + infer_a_batch(inferer, test_batch, nid_to_word) + + +def infer_a_batch(inferer, test_batch, nid_to_word): + """ + input a batch of data and infer + """ + feeding = { + 'user_id': 0, + 'province': 1, + 'city': 2, + 'history_clicked_items': 3, + 'history_clicked_categories': 4, + 'history_clicked_tags': 5, + 'phone': 6 + } + probs = inferer.infer( + input=test_batch, + feeding=feeding, + field=["value"], + flatten_result=False) + for i, res in enumerate(zip(test_batch, probs[0], probs[1])): + softmax_output = res[1] + sort_nid = res[1].argsort() + + # print top 30 recommended 
item + ret = "" + for j in range(1, 30): + item_id = sort_nid[-1 * j] + item_id_to_word = nid_to_word[item_id] + ret += "%s:%.6f," \ + % (item_id_to_word, softmax_output[item_id]) + print ret.rstrip(",") + + +if __name__ == "__main__": + infer_user() diff --git a/youtube_recall/network_conf.py b/youtube_recall/network_conf.py index 8d398992..8a2f9541 100644 --- a/youtube_recall/network_conf.py +++ b/youtube_recall/network_conf.py @@ -74,7 +74,7 @@ class DNNmodel(object): initial_std=0.001, learning_rate=1, l2_rate=0, - sparse_update=True) + sparse_update=False) def _build_embedding_layer(self): """ diff --git a/youtube_recall/reader.py b/youtube_recall/reader.py index 8c9727ca..e64e3fe7 100644 --- a/youtube_recall/reader.py +++ b/youtube_recall/reader.py @@ -46,6 +46,13 @@ class Reader(object): mode = TaskMode.create_infer() return self._reader(path, mode) + def infer_user(self, user_list): + """ + load user set to infer + @user_list: user list + """ + return self._reader_user(user_list) + def _reader(self, path, mode): """ parse data set @@ -121,6 +128,15 @@ class Reader(object): history_clicked_items, history_clicked_categories, \ history_clicked_tags, phone + def _reader_user(self, user_list): + """ + parse user list + """ + USER_ID_UNK = self._feature_dict['user_id'].get('') + for user in user_list: + user_id = self._feature_dict['user_id'].get(user, USER_ID_UNK) + yield user_id, 0, 0, [0], [0], [0], 0 + if __name__ == "__main__": # this is to test and debug reader function diff --git a/youtube_recall/user_vector.py b/youtube_recall/user_vector.py index b2d04ade..270fcd70 100644 --- a/youtube_recall/user_vector.py +++ b/youtube_recall/user_vector.py @@ -105,7 +105,7 @@ def get_a_batch_user_vector(inferer, test_batch): user_vector.append(0.000) norm = np.linalg.norm(user_vector) user_vector_norm = [str(_ / norm) for _ in user_vector] - print ", ".join(user_vector_norm) + print ",".join(user_vector_norm) if __name__ == "__main__": -- GitLab