From a03fc02aca8ad2ea5a22a3d64b42768262ff7c41 Mon Sep 17 00:00:00 2001
From: overlordmax <37664905+overlordmax@users.noreply.github.com>
Date: Wed, 13 May 2020 13:39:26 +0800
Subject: [PATCH] Youtube 05131100 (#4614)

* fix bugs
* fix bugs
* add wide_deep
* fix code style
* fix code style
* fix some bugs
* fix filename
* add ncf
* add download data
* add download data
* add youtube dnn
* edit README.md
---
 PaddleRec/youbube_dnn/README.md     | 147 ++++++++++++++++++++++++++++
 PaddleRec/youbube_dnn/args.py       |  43 ++
 PaddleRec/youbube_dnn/get_topk.py   |  34 +++++++
 PaddleRec/youbube_dnn/infer.py      |  53 ++++++++++
 PaddleRec/youbube_dnn/infer_cpu.sh  |   1 +
 PaddleRec/youbube_dnn/infer_gpu.sh  |   1 +
 PaddleRec/youbube_dnn/rec_topk.sh   |   1 +
 PaddleRec/youbube_dnn/train.py      |  75 ++++++++++++++
 PaddleRec/youbube_dnn/train_cpu.sh  |   1 +
 PaddleRec/youbube_dnn/train_gpu.sh  |   1 +
 PaddleRec/youbube_dnn/youtubednn.py |  52 ++++++++++
 11 files changed, 409 insertions(+)
 create mode 100644 PaddleRec/youbube_dnn/README.md
 create mode 100644 PaddleRec/youbube_dnn/args.py
 create mode 100644 PaddleRec/youbube_dnn/get_topk.py
 create mode 100644 PaddleRec/youbube_dnn/infer.py
 create mode 100644 PaddleRec/youbube_dnn/infer_cpu.sh
 create mode 100644 PaddleRec/youbube_dnn/infer_gpu.sh
 create mode 100644 PaddleRec/youbube_dnn/rec_topk.sh
 create mode 100644 PaddleRec/youbube_dnn/train.py
 create mode 100644 PaddleRec/youbube_dnn/train_cpu.sh
 create mode 100644 PaddleRec/youbube_dnn/train_gpu.sh
 create mode 100644 PaddleRec/youbube_dnn/youtubednn.py

diff --git a/PaddleRec/youbube_dnn/README.md b/PaddleRec/youbube_dnn/README.md
new file mode 100644
index 00000000..60714ef4
--- /dev/null
+++ b/PaddleRec/youbube_dnn/README.md
@@ -0,0 +1,147 @@
+# YouTube DNN
+
+Below is a brief directory structure and description of this example:
+
+```
+├── README.md          # Documentation
+├── youtubednn.py      # Network definition
+├── args.py            # Command-line arguments
+├── train.py           # Training script
+├── infer.py           # Inference script
+├── train_gpu.sh       # Shell script for training on GPU
+├── train_cpu.sh       # Shell script for training on CPU
+├── infer_gpu.sh       # Shell script for inference on GPU
+├── infer_cpu.sh       # Shell script for inference on CPU
+├── get_topk.py        # Gets the k videos each user is most likely to click
+├── rec_topk.sh        # Shell script for recommendation
+```
+
+## Introduction
+
+The paper [Deep Neural Networks for YouTube Recommendations](https://link.zhihu.com/?target=https%3A//static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf) describes the YouTube team's application of DNNs to its recommender system at Google. It is a classic embedding-based recall model: the network learns an interest vector for each user and each item, the similarity between a user and an item is measured by their inner product, and the most similar items form the final candidate set. YouTube splits the whole recommendation pipeline across two deep networks:
+
+1. The first stage, the **Candidate Generation Model**, performs a fast screening of candidate videos, reducing the candidate set from millions to hundreds.
+
+2. The second stage, the **Ranking Model**, produces a fine-grained ranking of those few hundred candidates.
+
+This project implements the recall stage of the YouTube DNN, the Candidate Generation Model, on PaddlePaddle. It produces vector representations of users and items, so that items can later be recommended to users by other means (for example, the cosine similarity between user and item vectors).
+
+Because the original paper does not release its dataset, this project verifies the correctness of the network on randomly generated data.
+
+## Environment
+
+PaddlePaddle 1.7.0
+
+python 3.7
+
+## Local training
+
+GPU environment
+
+Set the data paths and parameters in the train_gpu.sh script.
+
+```sh
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\ # use GPU
+                                       --batch_size 32\
+                                       --epochs 20\
+                                       --watch_vec_size 64\ # feature dimension
+                                       --search_vec_size 64\
+                                       --other_feat_size 64\
+                                       --output_size 100\
+                                       --model_dir 'model_dir'\ # directory for saving the model
+                                       --test_epoch 19\
+                                       --base_lr 0.01\
+                                       --video_vec_path './video_vec.csv' # output path of the item-vector file
+```
+
+Run the script:
+
+```sh
+sh train_gpu.sh
+```
+
+CPU environment
+
+Set the data paths and parameters in the train_cpu.sh script.
+
+```sh
+python train.py --use_gpu 0\ # use CPU
+                --batch_size 32\
+                --epochs 20\
+                --watch_vec_size 64\ # feature dimension
+                --search_vec_size 64\
+                --other_feat_size 64\
+                --output_size 100\
+                --model_dir 'model_dir'\ # directory for saving the model
+                --test_epoch 19\
+                --base_lr 0.01\
+                --video_vec_path './video_vec.csv' # output path of the item-vector file
+```
+
+Run the script:
+
+```sh
+sh train_cpu.sh
+```
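+
+The recall itself is a nearest-neighbour search over the exported vectors: train.py writes the item vectors to video_vec.csv and infer.py writes the user vectors to user_vec.csv, and each user is matched to the videos whose vectors have the highest cosine similarity with theirs. The snippet below is only a minimal NumPy sketch of that scoring step (it stands in for what rec_topk.sh / get_topk.py do; the 32-dimensional embeddings come from the last hidden layer size used in train.py, the 100 candidate videos from the default output_size, and random vectors are used here instead of the real CSV files):
+
+```python
+import numpy as np
+
+np.random.seed(0)
+user_vec = np.random.rand(32)         # stand-in for one row of user_vec.csv (the l3 output)
+video_vecs = np.random.rand(32, 100)  # stand-in for video_vec.csv: one 32-dim column per video
+
+# cosine similarity between this user and every video, then keep the top 5
+sims = (video_vecs.T @ user_vec) / (
+    np.linalg.norm(video_vecs, axis=0) * np.linalg.norm(user_vec) + 1e-8)
+topk = np.argsort(-sims)[:5]
+print("top K videos:", topk.tolist())
+```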
+
+## Local inference
+
+GPU environment
+
+Set the data paths and parameters in the infer_gpu.sh script.
+
+```sh
+CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \ # use GPU
+                                       --test_epoch 19 \ # which epoch's model to use for inference
+                                       --model_dir './model_dir' \ # model directory
+                                       --user_vec_path './user_vec.csv' # output path of the user-vector file
+```
+
+Run the script:
+
+```sh
+sh infer_gpu.sh
+```
+
+CPU environment
+
+Set the data paths and parameters in the infer_cpu.sh script.
+
+```sh
+python infer.py --use_gpu 0 \ # use CPU
+                --test_epoch 19 \ # which epoch's model to use for inference
+                --model_dir './model_dir' \ # model directory
+                --user_vec_path './user_vec.csv' # output path of the user-vector file
+```
+
+Run the script:
+
+```sh
+sh infer_cpu.sh
+```
+
+## Results
+
+Training on the randomly constructed dataset:
+
+```
+W0512 23:12:36.044643 2124 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
+W0512 23:12:36.050058 2124 device_context.cc:245] device: 0, cuDNN Version: 7.3.
+2020-05-12 23:12:37,681-INFO: epoch_id: 0, batch_time: 0.00719s, loss: 4.68754, acc: 0.00000
+2020-05-12 23:12:37,686-INFO: epoch_id: 0, batch_time: 0.00503s, loss: 4.54141, acc: 0.03125
+2020-05-12 23:12:37,691-INFO: epoch_id: 0, batch_time: 0.00419s, loss: 4.92227, acc: 0.00000
+```
+
+Computing the cosine similarity between every user and every item gives the top-k recommended videos for each user:
+
+```
+user:0, top K videos:[93, 73, 6, 20, 84]
+user:1, top K videos:[58, 0, 46, 86, 71]
+user:2, top K videos:[52, 51, 47, 82, 19]
+......
+user:96, top K videos:[0, 52, 86, 45, 11]
+user:97, top K videos:[0, 52, 45, 58, 28]
+user:98, top K videos:[58, 24, 49, 36, 46]
+user:99, top K videos:[0, 47, 44, 72, 51]
+```
+
diff --git a/PaddleRec/youbube_dnn/args.py b/PaddleRec/youbube_dnn/args.py
new file mode 100644
index 00000000..6fb6ce61
--- /dev/null
+++ b/PaddleRec/youbube_dnn/args.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
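+
+# args.py collects the command-line flags shared by train.py, infer.py and
+# get_topk.py: watch_vec_size / search_vec_size / other_feat_size are the widths
+# of the three input feature blocks that the network concatenates, output_size is
+# the number of candidate videos (softmax classes), and video_vec_path /
+# user_vec_path are the CSV files the item and user embeddings are written to.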
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util +import sys + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=20, help="epochs") + parser.add_argument("--batch_size", type=int, default=32, help="batch_size") + parser.add_argument("--test_epoch", type=int, default=19, help="test_epoch") + parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') + parser.add_argument('--model_dir', type=str, default='./model_dir', help='model_dir') + parser.add_argument('--watch_vec_size', type=int, default=64, help='watch_vec_size') + parser.add_argument('--search_vec_size', type=int, default=64, help='search_vec_size') + parser.add_argument('--other_feat_size', type=int, default=64, help='other_feat_size') + parser.add_argument('--output_size', type=int, default=100, help='output_size') + parser.add_argument('--base_lr', type=float, default=0.01, help='base_lr') + parser.add_argument('--video_vec_path', type=str, default='./video_vec.csv', help='video_vec_path') + parser.add_argument('--user_vec_path', type=str, default='./user_vec.csv', help='user_vec_path') + parser.add_argument('--topk', type=int, default=5, help='topk') + + args = parser.parse_args() + return args + + + diff --git a/PaddleRec/youbube_dnn/get_topk.py b/PaddleRec/youbube_dnn/get_topk.py new file mode 100644 index 00000000..69025adb --- /dev/null +++ b/PaddleRec/youbube_dnn/get_topk.py @@ -0,0 +1,34 @@ +import numpy as np +import pandas as pd +import args +import copy + +def cos_sim(vector_a, vector_b): + vector_a = np.mat(vector_a) + vector_b = np.mat(vector_b) + num = float(vector_a * vector_b.T) + denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b) + cos = num / denom + sim = 0.5 + 0.5 * cos + return sim + +def get_topK(args, K): + video_vec = pd.read_csv(args.video_vec_path, header=None) + user_vec = pd.read_csv(args.user_vec_path, header=None) + + user_video_sim_list = [] + for i in range(user_vec.shape[0]): + for j in range(video_vec.shape[1]): + user_video_sim = cos_sim(np.array(user_vec.loc[i]), np.array(video_vec[j])) + user_video_sim_list.append(user_video_sim) + + tmp_list=copy.deepcopy(user_video_sim_list) + tmp_list.sort() + max_sim_index=[user_video_sim_list.index(one) for one in tmp_list[::-1][:K]] + + print("user:{0}, top K videos:{1}".format(i, max_sim_index)) + user_video_sim_list = [] + +if __name__ == "__main__": + args = args.parse_args() + get_topK(args, 5) \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/infer.py b/PaddleRec/youbube_dnn/infer.py new file mode 100644 index 00000000..45921739 --- /dev/null +++ b/PaddleRec/youbube_dnn/infer.py @@ -0,0 +1,53 @@ +import paddle.fluid as fluid +import numpy as np +import pandas as pd +import time +import sys +import os +import args +import logging +from youtubednn import YoutubeDNN + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def infer(args): + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + cur_model_path = os.path.join(args.model_dir, 'epoch_' + str(args.test_epoch), "checkpoint") + + with fluid.scope_guard(fluid.Scope()): + infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(cur_model_path, exe) + # Build a random data set. 
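+        # The original dataset is not public, so each "user" below is a random
+        # (watch_vec, search_vec, other_feat) triple of shape [1, feat_size].
+        # Feeding one triple at a time, the fetched l3 output is that user's
+        # embedding and is appended as one row of user_vec.csv.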
+ sample_size = 100 + watch_vecs = [] + search_vecs = [] + other_feats = [] + + for i in range(sample_size): + watch_vec = np.random.rand(1, args.watch_vec_size) + search_vec = np.random.rand(1, args.search_vec_size) + other_feat = np.random.rand(1, args.other_feat_size) + watch_vecs.append(watch_vec) + search_vecs.append(search_vec) + other_feats.append(other_feat) + + for i in range(sample_size): + l3 = exe.run(infer_program, + feed={ + "watch_vec": watch_vecs[i].astype('float32'), + "search_vec": search_vecs[i].astype('float32'), + "other_feat": other_feats[i].astype('float32'), + }, + return_numpy=True, + fetch_list=fetch_vars) + + user_vec = pd.DataFrame(l3[0]) + user_vec.to_csv(args.user_vec_path, mode="a", index=False, header=0) + +if __name__ == "__main__": + args = args.parse_args() + if(os.path.exists(args.user_vec_path)): + os.system("rm " + args.user_vec_path) + infer(args) \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/infer_cpu.sh b/PaddleRec/youbube_dnn/infer_cpu.sh new file mode 100644 index 00000000..36a9bd74 --- /dev/null +++ b/PaddleRec/youbube_dnn/infer_cpu.sh @@ -0,0 +1 @@ +python infer.py --use_gpu 0 --test_epoch 19 --model_dir './model_dir' --user_vec_path './user_vec.csv' \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/infer_gpu.sh b/PaddleRec/youbube_dnn/infer_gpu.sh new file mode 100644 index 00000000..a91d2238 --- /dev/null +++ b/PaddleRec/youbube_dnn/infer_gpu.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --test_epoch 19 --model_dir './model_dir' --user_vec_path './user_vec.csv' \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/rec_topk.sh b/PaddleRec/youbube_dnn/rec_topk.sh new file mode 100644 index 00000000..7211e89d --- /dev/null +++ b/PaddleRec/youbube_dnn/rec_topk.sh @@ -0,0 +1 @@ +python get_topk.py --video_vec_path './video_vec.csv' --user_vec_path './user_vec.csv' --topk 5 \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/train.py b/PaddleRec/youbube_dnn/train.py new file mode 100644 index 00000000..4eb90307 --- /dev/null +++ b/PaddleRec/youbube_dnn/train.py @@ -0,0 +1,75 @@ +import numpy as np +import pandas as pd +import os +import random +import paddle.fluid as fluid +from youtubednn import YoutubeDNN +import paddle +import args +import logging +import time + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def train(args): + youtube_model = YoutubeDNN() + inputs = youtube_model.input_data(args.watch_vec_size, args.search_vec_size, args.other_feat_size) + loss, acc, l3 = youtube_model.net(inputs, args.output_size, layers=[128, 64, 32]) + + sgd = fluid.optimizer.SGD(learning_rate=args.base_lr) + sgd.minimize(loss) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # Build a random data set. 
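+    # 100 random batches of shape [batch_size, feat_size] stand in for the real
+    # watch/search/other features, and each label is drawn uniformly from the
+    # output_size candidate videos; the same batches are replayed every epoch.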
+ sample_size = 100 + watch_vecs = [] + search_vecs = [] + other_feats = [] + labels = [] + + for i in range(sample_size): + watch_vec = np.random.rand(args.batch_size, args.watch_vec_size) + search_vec = np.random.rand(args.batch_size, args.search_vec_size) + other_feat = np.random.rand(args.batch_size, args.other_feat_size) + watch_vecs.append(watch_vec) + search_vecs.append(search_vec) + other_feats.append(other_feat) + label = np.random.randint(args.output_size, size=(args.batch_size, 1)) + labels.append(label) + for epoch in range(args.epochs): + for i in range(sample_size): + begin = time.time() + loss_data, acc_val = exe.run(fluid.default_main_program(), + feed={ + "watch_vec": watch_vecs[i].astype('float32'), + "search_vec": search_vecs[i].astype('float32'), + "other_feat": other_feats[i].astype('float32'), + "label": np.array(labels[i]).reshape(args.batch_size, 1) + }, + return_numpy=True, + fetch_list=[loss.name, acc.name]) + end = time.time() + logger.info("epoch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, acc: {:.5f}".format( + epoch, end-begin, float(np.array(loss_data)), np.array(acc_val)[0])) + #save model + model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint") + + feed_var_names = ["watch_vec", "search_vec", "other_feat"] + fetch_vars = [l3] + fluid.io.save_inference_model(model_dir, feed_var_names, fetch_vars, exe) + + #save all video vector + video_array = np.array(fluid.global_scope().find_var('l4_weight').get_tensor()) + video_vec = pd.DataFrame(video_array) + video_vec.to_csv(args.video_vec_path, mode="a", index=False, header=0) + +if __name__ == "__main__": + args = args.parse_args() + if(os.path.exists(args.video_vec_path)): + os.system("rm " + args.video_vec_path) + train(args) diff --git a/PaddleRec/youbube_dnn/train_cpu.sh b/PaddleRec/youbube_dnn/train_cpu.sh new file mode 100644 index 00000000..ed89a105 --- /dev/null +++ b/PaddleRec/youbube_dnn/train_cpu.sh @@ -0,0 +1 @@ +python train.py --use_gpu 0 --batch_size 32 --epochs 20 --watch_vec_size 64 --search_vec_size 64 --other_feat_size 64 --output_size 100 --model_dir 'model_dir' --test_epoch 19 --base_lr 0.01 --video_vec_path './video_vec.csv' \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/train_gpu.sh b/PaddleRec/youbube_dnn/train_gpu.sh new file mode 100644 index 00000000..e934433f --- /dev/null +++ b/PaddleRec/youbube_dnn/train_gpu.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --batch_size 32 --epochs 20 --watch_vec_size 64 --search_vec_size 64 --other_feat_size 64 --output_size 100 --model_dir 'model_dir' --test_epoch 19 --base_lr 0.01 --video_vec_path './video_vec.csv' \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/youtubednn.py b/PaddleRec/youbube_dnn/youtubednn.py new file mode 100644 index 00000000..f3642de9 --- /dev/null +++ b/PaddleRec/youbube_dnn/youtubednn.py @@ -0,0 +1,52 @@ +import paddle +import io +import math +import numpy as np +import paddle.fluid as fluid + +class YoutubeDNN(object): + def input_data(self, watch_vec_size, search_vec_size, other_feat_size): + watch_vec = fluid.data(name="watch_vec", shape=[None, watch_vec_size], dtype="float32") + search_vec = fluid.data(name="search_vec", shape=[None, search_vec_size], dtype="float32") + other_feat = fluid.data(name="other_feat", shape=[None, other_feat_size], dtype="float32") + label = fluid.data(name="label", shape=[None, 1], dtype="int64") + + inputs = [watch_vec] + [search_vec] + [other_feat] + [label] + + return inputs + + def fc(self, tag, data, out_dim, 
active='relu'): + init_stddev = 1.0 + scales = 1.0 / np.sqrt(data.shape[1]) + + if tag == 'l4': + p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag, + initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales)) + else: + p_attr = None + + b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1)) + + out = fluid.layers.fc(input=data, + size=out_dim, + act=active, + param_attr=p_attr, + bias_attr =b_attr, + name=tag) + return out + + def net(self, inputs, output_size, layers=[128, 64, 32]): + concat_feats = fluid.layers.concat(input=inputs[:-1], axis=-1) + + l1 = self.fc('l1', concat_feats, layers[0], 'relu') + l2 = self.fc('l2', l1, layers[1], 'relu') + l3 = self.fc('l3', l2, layers[2], 'relu') + l4 = self.fc('l4', l3, output_size, 'softmax') + + num_seqs = fluid.layers.create_tensor(dtype='int64') + acc = fluid.layers.accuracy(input=l4, label=inputs[-1], total=num_seqs) + + cost = fluid.layers.cross_entropy(input=l4, label=inputs[-1]) + avg_cost = fluid.layers.mean(cost) + + return avg_cost, acc, l3 -- GitLab
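
A note on why train.py exports `l4_weight` as the video vectors while infer.py exports `l3` as the user vectors: with a softmax output layer, the pre-softmax score of video j is the inner product of the last hidden layer output with the j-th column of the output weight matrix (plus a bias), so those two tensors are exactly the embeddings the recall stage needs. Below is a minimal NumPy sketch of that identity, with shapes matching the defaults in this patch (a 32-dimensional `l3`, 100 candidate videos) and the bias ignored:

```python
import numpy as np

np.random.seed(0)
l3 = np.random.rand(1, 32)           # user embedding: output of the last hidden layer
l4_weight = np.random.rand(32, 100)  # video embeddings: one 32-dim column per candidate video

logits = l3 @ l4_weight              # pre-softmax score of every video for this user
# the score of video j is just the inner product of the two embeddings
assert np.allclose(logits[0, 7], l3[0] @ l4_weight[:, 7])
```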