get vector

2880e92f · zhaoyijin666 · 927d956d · 2880e92f · 2880e92f · 2880e92f
5 changed files
--- a/youtube_recall/infer.py
+++ b/youtube_recall/infer.py
@@ -46,7 +46,7 @@ def infer():
    # check argument
    assert os.path.exists(
-        args.infer_set_path), 'The train_set_path path does not exist.'
+        args.infer_set_path), 'The infer_set_path path does not exist.'
    assert os.path.exists(
        args.model_path), 'The model_path path does not exist.'
    assert os.path.exists(
@@ -101,11 +101,10 @@ def infer_a_batch(inferer, test_batch, nid_to_word):
        field=["value"],
        flatten_result=False)
    for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
-        print "Sample %s:" % str(i)
        softmax_output = res[1]
        sort_nid = res[1].argsort()
-        # 输出top 30的推荐视频id，title，分数
+        # print top 30 recommended item 
        for j in range(1, 30):
            item_id = sort_nid[-1 * j]
            item_id_to_word = nid_to_word[item_id]

--- a/youtube_recall/item_vector.py
+++ b/youtube_recall/item_vector.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+import math
+def parse_args():
+    """
+    Build the command-line parser of the item vector script and parse argv.
+
+    :return: argparse namespace carrying ``model_path`` and ``feature_dict``
+    """
+    arg_parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    # both options are mandatory string paths
+    required_paths = (
+        ('--model_path', "path of the model"),
+        ('--feature_dict', "path of feature_dict.pkl"),
+    )
+    for flag, help_text in required_paths:
+        arg_parser.add_argument(
+            flag, type=str, required=True, help=help_text)
+    return arg_parser.parse_args()
+def get_item_vec_from_softmax(nce_w, nce_b):
+    """
+    Assemble one vector per item from the softmax (NCE) layer parameters.
+
+    Row ``i`` of the result is the bias ``nce_b[0][i]`` followed by every
+    entry of the weight row ``nce_w[i]``.
+
+    :param nce_w: 2-D array indexed as ``nce_w[item][dim]``
+    :param nce_b: 2-D array indexed as ``nce_b[0][item]``
+    :return: list of per-item lists, or None when either parameter is None
+        or the two arrays disagree on the number of items
+    """
+    if nce_w is None or nce_b is None:
+        return None
+    item_count = nce_w.shape[0]
+    if nce_b.shape[1] != item_count:
+        return None
+    weight_dim = nce_w.shape[1]
+    # bias first, then the weights of the same item
+    return [[nce_b[0][row]] + [nce_w[row][col] for col in range(weight_dim)]
+            for row in range(item_count)]
+def convt_simple_lsh(vector):
+    """
+    Apply the simple LSH conversion to the item vectors, in place.
+
+    Each vector gets one extra trailing component
+    ``sqrt(max_norm ** 2 - norm ** 2)``, where ``norm`` is the vector's own
+    L2 norm and ``max_norm`` is the largest L2 norm among all vectors, so
+    that afterwards every vector has L2 norm ``max_norm``.
+
+    :param vector: list of per-item lists of numbers, or None
+    :return: the same list object with every inner list extended by one
+        element; None is passed through unchanged
+    """
+    # get_item_vec_from_softmax reports bad parameters by returning None;
+    # pass that on instead of failing inside len().
+    if vector is None:
+        return None
+    # each norm is needed twice (maximum and padding), so compute it once
+    norms = [np.linalg.norm(item) for item in vector]
+    max_norm = max(norms) if norms else 0
+    max_norm_sq = math.pow(max_norm, 2)
+    for item, norm in zip(vector, norms):
+        # clamp at zero so rounding can never hand sqrt a negative value
+        item.append(math.sqrt(max(0.0, max_norm_sq - math.pow(norm, 2))))
+    return vector
+def item_vector():
+    """
+    Print one LSH-converted item vector per line for a trained model.
+
+    Reads ``--model_path`` (opened with gzip) and ``--feature_dict``
+    (a pickle) from the command line, builds the item vectors from the
+    ``nce_w`` / ``nce_b`` parameters and prints, per item, the item id, a
+    tab, and the comma-separated vector components.
+
+    :raises ValueError: when ``nce_w`` / ``nce_b`` are missing from the model
+        or their item counts disagree
+    """
+    args = parse_args()
+    # check argument
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    # NOTE(review): unpickling can execute arbitrary code; only load trusted
+    # feature dictionaries.
+    with open(args.feature_dict) as f:
+        feature_dict = cPickle.load(f)
+    # load the trained model.
+    with gzip.open(args.model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    nid_dict = feature_dict['history_clicked_items']
+    # invert the mapping so a vector's row index looks up the original key
+    nid_to_word = dict((v, k) for k, v in nid_dict.items())
+    raw_vectors = get_item_vec_from_softmax(
+        parameters.get("nce_w"), parameters.get("nce_b"))
+    if raw_vectors is None:
+        # fail with a clear message instead of a TypeError further down
+        raise ValueError(
+            'Cannot build item vectors: nce_w / nce_b are missing or their '
+            'item counts do not match.')
+    for index, vec in enumerate(convt_simple_lsh(raw_vectors)):
+        # %-formatting also copes with item ids that are not plain strings
+        print("%s\t%s" % (nid_to_word[index], ",".join(map(str, vec))))
+if __name__ == "__main__":
+    item_vector()
--- a/youtube_recall/test.py
+++ b/youtube_recall/test.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+########################################################################
+# 
+# Copyright (c) 2018 Baidu.com, Inc. All Rights Reserved
+# 
+########################################################################
+"""
+File: test.py
+Author: baidu(baidu@baidu.com)
+Date: 2018/01/12 11:41:37
+"""
+import cPickle
+#with open("./output/item_freq.pkl") as f:
+# Debug helper: load the pickled dictionary and dump it to stdout.
+# NOTE(review): the result is named item_freq although the active path points
+# at nid_dict.pkl - presumably a leftover from the commented-out
+# item_freq.pkl variant; confirm.
+with open("./data/nid_dict.pkl") as f:
+    item_freq = cPickle.load(f)
+print item_freq
--- a/youtube_recall/user_vector.py
+++ b/youtube_recall/user_vector.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import gzip
+import paddle.v2 as paddle
+import argparse
+import cPickle
+from reader import Reader
+from network_conf import DNNmodel
+from utils import logger
+import numpy as np
+def parse_args():
+    """
+    Parse the command-line options of the user vector script.
+
+    :return: argparse namespace with ``infer_set_path``, ``model_path``,
+        ``feature_dict`` and ``batch_size``
+    """
+    arg_parser = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+    # the three mandatory string paths share the same option settings
+    required_paths = (
+        ('--infer_set_path', "path of the infer set"),
+        ('--model_path', "path of the model"),
+        ('--feature_dict', "path of feature_dict.pkl"),
+    )
+    for flag, help_text in required_paths:
+        arg_parser.add_argument(
+            flag, type=str, required=True, help=help_text)
+    arg_parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return arg_parser.parse_args()
+def user_vector():
+    """
+    Feed the infer set through the trained model batch by batch and hand
+    every batch to get_a_batch_user_vector, which prints the user vectors.
+    """
+    args = parse_args()
+    # all three input paths must exist before any work is done
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as dict_file:
+        feature_dict = cPickle.load(dict_file)
+    # restore the trained parameters
+    with gzip.open(args.model_path) as model_file:
+        parameters = paddle.parameters.Parameters.from_tar(model_file)
+    # rebuild the network in inference mode and expose both output layers
+    model = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict, is_infer=True)
+    prediction_layer, fc = model.model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+    reader = Reader(feature_dict)
+    pending = []
+    for sample in reader.infer(args.infer_set_path):
+        pending.append(sample)
+        if len(pending) != args.batch_size:
+            continue
+        get_a_batch_user_vector(inferer, pending)
+        pending = []
+    # flush the last, possibly smaller, batch
+    if pending:
+        get_a_batch_user_vector(inferer, pending)
+def get_a_batch_user_vector(inferer, test_batch):
+    """
+    Infer one batch and print a unit-length user vector per sample.
+
+    For every row of the second inferred output the vector
+    ``[1.0] + row + [0.0]`` is built (simple LSH conversion), divided by its
+    L2 norm and printed as one ", "-joined line.
+
+    :param inferer: object whose ``infer(input=, feeding=, field=,
+        flatten_result=)`` returns a sequence holding at least two outputs
+    :param test_batch: samples handed to ``inferer.infer`` as ``input``
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    # probs[1] is presumably the output of the `fc` layer (second entry of
+    # the inferer's output_layer list) - confirm against the caller
+    for row in probs[1]:
+        # do simple lsh conversion
+        user_vector = [1.000] + list(row) + [0.000]
+        # the leading 1.0 keeps the norm >= 1, so the division is always safe
+        norm = np.linalg.norm(user_vector)
+        print(", ".join(str(value / norm) for value in user_vector))
+if __name__ == "__main__":
+    user_vector()
--- a/youtube_recall/vector.py
+++ b/youtube_recall/vector.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+import cPickle
+import gzip
+import os
+import numpy as np
+import paddle.v2 as paddle
+from network_conf import DNNmodel
+from reader import Reader
+from utils import logger
+def parse_args():
+    """
+    Declare and parse the command-line options of the vector script.
+
+    :return: argparse namespace with ``infer_set_path``, ``model_path``,
+        ``feature_dict`` and ``batch_size``
+    """
+    cli = argparse.ArgumentParser(
+        description="PaddlePaddle Youtube Recall Model Example")
+
+    def add_required_path(flag, help_text):
+        # every path option is a mandatory string
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+
+    add_required_path('--infer_set_path', "path of the infer set")
+    add_required_path('--model_path', "path of the model")
+    add_required_path('--feature_dict', "path of feature_dict.pkl")
+    cli.add_argument(
+        '--batch_size',
+        type=int,
+        default=50,
+        help="size of mini-batch (default:50)")
+    return cli.parse_args()
+def vector():
+    """
+    Run the trained model over the infer set in mini-batches and pass each
+    batch to infer_a_batch, which prints the resulting vectors.
+    """
+    args = parse_args()
+    # refuse to start unless every input path exists
+    assert os.path.exists(
+        args.infer_set_path), 'The infer_set_path path does not exist.'
+    assert os.path.exists(
+        args.model_path), 'The model_path path does not exist.'
+    assert os.path.exists(
+        args.feature_dict), 'The feature_dict path does not exist.'
+    paddle.init(use_gpu=False, trainer_count=1)
+    with open(args.feature_dict) as dict_file:
+        feature_dict = cPickle.load(dict_file)
+    # reverse lookup: dictionary value -> original key
+    nid_to_word = {
+        value: key
+        for key, value in feature_dict['history_clicked_items'].items()
+    }
+    # restore the trained parameters
+    with gzip.open(args.model_path) as model_file:
+        parameters = paddle.parameters.Parameters.from_tar(model_file)
+    # rebuild the network in inference mode and expose both output layers
+    model = DNNmodel(
+        dnn_layer_dims=[256, 31], feature_dict=feature_dict, is_infer=True)
+    prediction_layer, fc = model.model_cost
+    inferer = paddle.inference.Inference(
+        output_layer=[prediction_layer, fc], parameters=parameters)
+    reader = Reader(feature_dict)
+    pending = []
+    for sample in reader.infer(args.infer_set_path):
+        pending.append(sample)
+        if len(pending) != args.batch_size:
+            continue
+        infer_a_batch(inferer, pending, nid_to_word)
+        pending = []
+    # flush the last, possibly smaller, batch
+    if pending:
+        infer_a_batch(inferer, pending, nid_to_word)
+def infer_a_batch(inferer, test_batch, nid_to_word):
+    """
+    Infer one batch and print a unit-length user vector for every sample.
+
+    For each sample a "Sample <index>:" header is printed, followed by the
+    list ``[1.0] + row + [0.0]`` divided by its L2 norm, where ``row`` is the
+    sample's row in the second inferred output.
+
+    :param inferer: object whose ``infer(input=, feeding=, field=,
+        flatten_result=)`` returns a sequence holding at least two outputs
+    :param test_batch: samples handed to ``inferer.infer`` as ``input``
+    :param nid_to_word: accepted for call compatibility; not used here
+    """
+    feeding = {
+        'user_id': 0,
+        'province': 1,
+        'city': 2,
+        'history_clicked_items': 3,
+        'history_clicked_categories': 4,
+        'history_clicked_tags': 5,
+        'phone': 6
+    }
+    probs = inferer.infer(
+        input=test_batch,
+        feeding=feeding,
+        field=["value"],
+        flatten_result=False)
+    # zip over all three sequences so the number of printed samples stays
+    # bounded by the shortest of them; only the second output is consumed
+    for index, (_, _, row) in enumerate(zip(test_batch, probs[0], probs[1])):
+        print("Sample %s:" % str(index))
+        user_vector = [1.000] + list(row) + [0.000]
+        # the leading 1.0 keeps the norm >= 1, so the division is always safe
+        norm = np.linalg.norm(user_vector)
+        print([value / norm for value in user_vector])
+if __name__ == "__main__":
+    vector()