class Dataset(object):
    """Load the held-out test split used for NCF evaluation.

    Two tab-separated text files sharing the prefix ``path`` are read:

    * ``<path>.test.rating``   -- the first two columns of each line are
      parsed as an integer user id and item id; further columns are ignored.
    * ``<path>.test.negative`` -- the first column of each line is skipped and
      every remaining column is parsed as an integer item id.

    Attributes:
        testRatings: list of ``[user, item]`` pairs, one per test line.
        testNegatives: list of lists of item ids, index-aligned with
            ``testRatings``.
    """

    def __init__(self, path):
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating")
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        # The two files describe the same test cases line by line.
        assert len(self.testRatings) == len(self.testNegatives), \
            "test.rating and test.negative must have the same number of lines"

    def load_rating_file_as_list(self, filename):
        """Return ``[[user, item], ...]`` parsed from ``filename``.

        Blank lines (for example a trailing empty line) are skipped instead
        of crashing on ``int('')``.
        """
        ratingList = []
        with open(filename, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    continue
                arr = line.split("\t")
                ratingList.append([int(arr[0]), int(arr[1])])
        return ratingList

    def load_negative_file(self, filename):
        """Return one list of negative item ids per non-blank line.

        The first column is dropped; all remaining columns are converted to
        ``int``.
        """
        negativeList = []
        with open(filename, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    continue
                arr = line.split("\t")
                negativeList.append([int(x) for x in arr[1:]])
        return negativeList
Filtering 》](https://arxiv.org/pdf/1708.05031.pdf)作者利用深度学习来对user和item特征进行建模,使模型具有非线性表达能力。具体来说使用多层感知机来学习user-item交互函数,提出了一种隐性反馈协同过滤解决方案。 + +## 环境 + + PaddlePaddle 1.7.0 + + python3.7 + +## 数据下载及预处理 + +[Data.zip](https://paddlerec.bj.bcebos.com/ncf/Data.zip) + +在create_data.sh脚本文件中添加文件的路径,并运行脚本。 + +```sh +mkdir Data +pip install -r requirements.txt #安装必需包 +wget -P Data https://paddlerec.bj.bcebos.com/ncf/Data.zip #下载数据集 +unzip Data/Data.zip -d Data/ +python get_train_data.py --num_neg 4 \ #负采样个数 + --train_data_path "Data/train_data.csv" #生成训练数据 +``` + + + +## 单机训练 + +GPU环境 + +在train_gpu.sh脚本文件中设置好数据路径、参数。 + +```sh +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 \ #使用gpu + --NeuMF 1 \ #nn和gmf网络结合 + --epochs 20 \ #训练轮次 + --batch_size 256 \ #batch大小 + --num_factors 8 \ #gmf网络输入的embedding大小 + --num_neg 4 \ #负采样个数 + --lr 0.001 \ #学习率 + --model_dir 'model_dir' #模型保存目录 +``` + +修改脚本的可执行权限并运行 + +``` +./train_gpu.sh +``` + +CPU环境 + +在train_cpu.sh脚本文件中设置好数据路径、参数。 + +```sh +python train.py --use_gpu 0 \ #使用cpu + --NeuMF 1 \ #nn和gmf网络结合 + --epochs 20 \ #训练轮次 + --batch_size 256 \ #batch大小 + --num_factors 8 \ #gmf网络输入的embedding大小 + --num_neg 4 \ #负采样个数 + --lr 0.001 \ #学习率 + --model_dir 'model_dir' #模型保存目录 +``` + +修改脚本的可执行权限并运行 + +``` +./train_cpu.sh +``` + +## 单机预测 + +预测使用CPU环境,速度较快。 + +``` +python infer.py +``` + +## 模型效果 + +训练: + +``` +use_gpu:1, NeuMF:1, epochs:20, batch_size:256, num_factors:8, num_neg:4, lr:0.001, model_dir:model_dir, layers:[64, 32, 16, 8] +W0428 12:15:20.169631 1161 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0 +W0428 12:15:20.173840 1161 device_context.cc:245] device: 0, cuDNN Version: 7.3. +2020-04-28 12:15:21,945-INFO: epoch: 0, batch_id: 0, batch_time: 0.01069s, loss: 0.69115 +2020-04-28 12:15:21,956-INFO: epoch: 0, batch_id: 1, batch_time: 0.00917s, loss: 0.68997 +2020-04-28 12:15:21,976-INFO: epoch: 0, batch_id: 2, batch_time: 0.00901s, loss: 0.68813 +... 
def parse_args():
    """Parse the command-line options shared by the NCF scripts.

    Reads ``sys.argv`` and returns the resulting ``argparse.Namespace``.
    Exactly one of ``--GMF`` / ``--MLP`` / ``--NeuMF`` is expected to be set
    to a non-zero value by the caller to choose the network.

    ``--layers`` takes space-separated integers (``--layers 64 32 16 8``) so
    that a value supplied on the command line has the same type as the
    default list.
    """
    parser = argparse.ArgumentParser(description="Run NCF (GMF / MLP / NeuMF).")
    parser.add_argument('--path', nargs='?', default='Data/', help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='ml-1m', help='Choose a dataset.')
    parser.add_argument('--epochs', type=int, default=20, help='Number of epochs.')
    parser.add_argument('--batch_size', type=int, default=256, help='Batch size.')
    parser.add_argument('--test_batch_size', type=int, default=100, help='Test batch size.')
    parser.add_argument('--num_factors', type=int, default=8, help='Embedding size.')
    parser.add_argument('--num_users', type=int, default=6040, help='Number of users.')
    parser.add_argument('--num_items', type=int, default=3706, help='Number of items.')
    parser.add_argument('--num_neg', type=int, default=4,
                        help='Number of negative instances to pair with a positive instance.')
    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate.')
    parser.add_argument('--train_data_path', type=str, default="Data/train_data.csv",
                        help='Path of the generated training CSV.')
    parser.add_argument('--test_data_path', type=str, default="Data/test.txt",
                        help='Path of the temporary test file written during evaluation.')
    parser.add_argument('--model_dir', type=str, default="model_dir", help='model_dir.')
    parser.add_argument('--use_gpu', type=int, default=0, help='use_gpu')
    parser.add_argument('--GMF', type=int, default=0, help='GMF')
    parser.add_argument('--MLP', type=int, default=0, help='MLP')
    parser.add_argument('--NeuMF', type=int, default=0, help='NeuMF')
    # nargs='+' with type=int: a user-supplied value becomes a list of ints,
    # matching the default, instead of a raw string.
    parser.add_argument('--layers', nargs='+', type=int, default=[64, 32, 16, 8],
                        help="MLP layers as space-separated ints. Note that the first layer is the "
                             "concatenation of user and item embeddings. So layers[0]/2 is the embedding size.")
    return parser.parse_args()
def eval_one_rating(idx):
    """Score one test case and return its ``(hit_ratio, ndcg)`` pair.

    The candidate list is the negatives of test case ``idx`` followed by the
    ground-truth item. The ``(user, item)`` pairs are written as CSV to
    ``_args.test_data_path``, scored with ``run_infer``, and the top ``_K``
    items by score are compared against the ground-truth item.

    Relies on the module globals set by ``evaluate_model``.

    Raises:
        ValueError: if the number of predictions differs from the number of
            candidate items.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]
    # Build a fresh list instead of append()/pop() on the shared negatives,
    # so an exception during inference cannot leave that list modified.
    items = list(_testNegatives[idx]) + [gtItem]

    users = np.full(len(items), u, dtype='int32').reshape(-1, 1)
    items_array = np.array(items).reshape(-1, 1)
    temp = np.hstack((users, items_array))
    # Write to the same path that is handed to run_infer below; the previous
    # hard-coded "Data/test.txt" only matched the default option value.
    np.savetxt(_args.test_data_path, temp, fmt='%d', delimiter=',')
    predictions = run_infer(_args, _model_path, _args.test_data_path)

    if len(predictions) != len(items):
        raise ValueError(
            "expected {} predictions, got {}; test_batch_size must cover all "
            "candidates of one test case".format(len(items), len(predictions)))

    # Later duplicates overwrite earlier ones, as in an index-based fill.
    map_item_score = dict(zip(items, predictions))

    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)

    return (hr, ndcg)
def get_train_data(filename, write_file, num_negatives):
    """Generate a training CSV with sampled negatives from a ``.rating`` file.

    Each non-blank line of ``filename`` is tab-separated and starts with
    ``user``, ``item``, ``rating``. Every distinct ``(user, item)`` pair with
    ``rating > 0`` is a positive instance. For each positive, one line
    ``user,item,1`` is written to ``write_file``, followed by
    ``num_negatives`` lines ``user,neg_item,0`` where ``neg_item`` is drawn
    uniformly from ``0 .. max_item_id`` (inclusive) and is not a positive of
    that user.

    Args:
        filename: path of the input ``.rating`` file.
        write_file: path of the CSV file to create (overwritten).
        num_negatives: number of negative samples per positive instance.

    Returns:
        None. The result is the file written to ``write_file``.

    Raises:
        ValueError: if negatives are requested for a user who has interacted
            with every item, since no negative could ever be drawn.
    """
    # Insertion-ordered set of positive (user, item) pairs (dict keys).
    positives = {}
    positives_per_user = {}
    max_item = 0
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                continue
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            max_item = max(max_item, item)
            if rating > 0 and (user, item) not in positives:
                positives[(user, item)] = None
                positives_per_user[user] = positives_per_user.get(user, 0) + 1

    # Item ids run from 0 to max_item inclusive, so there are max_item + 1
    # candidates. Sampling from range(max_item) would never draw the last id.
    num_items = max_item + 1

    if num_negatives > 0:
        for user, count in positives_per_user.items():
            if count >= num_items:
                raise ValueError(
                    "user {} has interacted with all {} items; cannot sample "
                    "negatives".format(user, num_items))

    with open(write_file, 'w') as out:
        print("writing " + write_file)
        for (u, i) in positives:
            # positive instance
            out.write("{0},{1},{2}".format(u, i, 1) + "\n")
            # negative instances
            for _ in range(num_negatives):
                j = np.random.randint(num_items)
                while (u, j) in positives:
                    j = np.random.randint(num_items)
                out.write("{0},{1},{2}".format(u, j, 0) + "\n")
class GMF(object):
    """Generalized Matrix Factorization network builder."""

    def net(self, inputs, num_users, num_items, latent_dim):
        """Add the GMF layers to the current program.

        ``inputs[0]`` / ``inputs[1]`` / ``inputs[2]`` are used as the user id,
        item id and label variables. User and item embeddings of width
        ``latent_dim`` are multiplied element-wise and passed to a single
        sigmoid unit; the loss is the mean log loss against the label cast
        to float32.

        Returns:
            ``(avg_cost, prediction)``.
        """
        def _lookup(ids, table_rows):
            # One sparse embedding table, normally initialised (std 0.01).
            return fluid.embedding(
                input=ids,
                size=[table_rows, latent_dim],
                param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                is_sparse=True)

        # Creation order (user table first, then item table) is kept.
        user_vec = _lookup(inputs[0], num_users)
        item_vec = _lookup(inputs[1], num_items)

        interaction = fluid.layers.elementwise_mul(user_vec, item_vec)

        prediction = fluid.layers.fc(
            input=interaction,
            size=1,
            act='sigmoid',
            param_attr=fluid.initializer.MSRAInitializer(uniform=True),
            name='prediction')

        float_label = fluid.layers.cast(x=inputs[2], dtype='float32')
        per_sample_loss = fluid.layers.log_loss(input=prediction, label=float_label)

        return fluid.layers.mean(per_sample_loss), prediction
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    # Evaluate the checkpoint saved under "<model_dir>/epoch_<epochs - 1>"
    # on the test split and log mean HR / NDCG at a cut-off of 10.
    cli = args.parse_args()
    test_split = Dataset(cli.path + cli.dataset)
    top_k = 10

    started = time.time()
    checkpoint = "".join([cli.model_dir, "/epoch_", str(cli.epochs - 1)])
    hit_list, ndcg_list = evaluate_model(
        cli, test_split.testRatings, test_split.testNegatives, top_k, checkpoint)
    mean_hr = np.array(hit_list).mean()
    mean_ndcg = np.array(ndcg_list).mean()
    finished = time.time()

    logger.info("epoch: {}, epoch_time: {:.5f}s, HR: {:.5f}, NDCG: {:.5f}".format(cli.epochs, finished - started, mean_hr, mean_ndcg))
class NeuMF(object):
    """Neural Matrix Factorization network builder (GMF branch + MLP branch)."""

    def net(self, inputs, num_users, num_items, latent_dim, layers=(64, 32, 16, 8)):
        """Add the NeuMF layers to the current program.

        ``inputs[0]`` / ``inputs[1]`` / ``inputs[2]`` are used as the user id,
        item id and label variables.

        Args:
            inputs: sequence of at least three variables (user, item, label).
            num_users: number of rows of the user embedding tables.
            num_items: number of rows of the item embedding tables.
            latent_dim: embedding width of the MF branch.
            layers: MLP layer sizes. ``layers[0] // 2`` is the embedding
                width of the MLP branch; ``layers[1:]`` are the sizes of the
                ReLU fully-connected layers. The default is an immutable
                tuple so it cannot be shared and mutated across calls.

        Returns:
            ``(avg_cost, prediction)`` where ``prediction`` is the sigmoid
            output and ``avg_cost`` the mean log loss against the label.
        """
        num_layer = len(layers)  # Number of layers in the MLP
        mlp_embed_dim = layers[0] // 2

        MF_Embedding_User = fluid.embedding(input=inputs[0],
                                            size=[num_users, latent_dim],
                                            param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                                            is_sparse=True)
        MF_Embedding_Item = fluid.embedding(input=inputs[1],
                                            size=[num_items, latent_dim],
                                            param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                                            is_sparse=True)

        MLP_Embedding_User = fluid.embedding(input=inputs[0],
                                             size=[num_users, mlp_embed_dim],
                                             param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                                             is_sparse=True)
        MLP_Embedding_Item = fluid.embedding(input=inputs[1],
                                             size=[num_items, mlp_embed_dim],
                                             param_attr=fluid.initializer.Normal(loc=0.0, scale=0.01),
                                             is_sparse=True)

        # MF part: element-wise product of the flattened MF embeddings.
        mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
        mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
        mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)

        # MLP part: the 0-th layer is the concatenation of the embeddings.
        mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
        mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
        mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)

        for i in range(1, num_layer):
            # Initialiser scale is 1/sqrt(fan_in) of the layer being created.
            mlp_vector = fluid.layers.fc(
                input=mlp_vector,
                size=layers[i],
                act='relu',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.TruncatedNormal(
                        loc=0.0, scale=1.0 / math.sqrt(mlp_vector.shape[1])),
                    regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)),
                name='layer_' + str(i))

        # Concatenate MF and MLP parts
        predict_vector = fluid.layers.concat(input=[mf_vector, mlp_vector], axis=-1)

        # Final prediction layer
        prediction = fluid.layers.fc(input=predict_vector,
                                     size=1,
                                     act='sigmoid',
                                     param_attr=fluid.initializer.MSRAInitializer(uniform=True),
                                     name='prediction')

        cost = fluid.layers.log_loss(input=prediction,
                                     label=fluid.layers.cast(x=inputs[2], dtype='float32'))
        avg_cost = fluid.layers.mean(cost)

        return avg_cost, prediction
+ipython-genutils==0.2.0 +itsdangerous==1.1.0 +jedi==0.15.1 +jieba==0.42.1 +Jinja2==2.10.1 +joblib==0.14.1 +jsonschema==3.1.1 +jupyter-client==5.3.3 +jupyter-core==4.5.0 +kiwisolver==1.1.0 +librosa==0.7.2 +llvmlite==0.31.0 +Markdown==3.1.1 +MarkupSafe==1.1.1 +matplotlib==2.2.3 +mccabe==0.6.1 +mistune==0.8.4 +more-itertools==7.2.0 +moviepy==1.0.1 +nbconvert==5.3.1 +nbformat==4.4.0 +networkx==2.4 +nltk==3.4.5 +nodeenv==1.3.4 +notebook==5.7.0 +numba==0.48.0 +numpy==1.16.4 +oauthlib==3.1.0 +objgraph==3.4.1 +opencv-python==4.1.1.26 +paddlehub==1.5.0 +paddlepaddle-gpu==1.7.1.post97 +pandas==0.23.4 +pandocfilters==1.4.2 +parl==1.1.2 +parso==0.5.1 +pexpect==4.7.0 +pickleshare==0.7.5 +Pillow==6.2.0 +pre-commit==1.21.0 +prettytable==0.7.2 +proglog==0.1.9 +prometheus-client==0.5.0 +prompt-toolkit==2.0.10 +protobuf==3.10.0 +ptyprocess==0.6.0 +pyarrow==0.13.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.7 +pycodestyle==2.5.0 +pycparser==2.19 +pyflakes==2.1.1 +pyglet==1.4.5 +Pygments==2.4.2 +pyparsing==2.4.2 +pyrsistent==0.15.4 +python-dateutil==2.8.0 +pytz==2019.3 +PyYAML==5.1.2 +pyzmq==18.0.1 +rarfile==3.1 +recordio==0.1.7 +requests==2.22.0 +requests-oauthlib==1.3.0 +resampy==0.2.2 +rsa==4.0 +scikit-learn==0.20.0 +scipy==1.3.0 +seaborn==0.10.0 +Send2Trash==1.5.0 +sentencepiece==0.1.85 +simplegeneric==0.8.1 +six==1.12.0 +sklearn==0.0 +SoundFile==0.10.3.post1 +tb-nightly==1.15.0a20190801 +tb-paddle==0.3.6 +tensorboard==2.1.0 +tensorboardX==1.8 +termcolor==1.1.0 +terminado==0.8.2 +testpath==0.4.2 +toml==0.10.0 +tornado==5.1.1 +tqdm==4.36.1 +traitlets==4.3.3 +urllib3==1.25.6 +virtualenv==16.7.9 +visualdl==1.3.0 +wcwidth==0.1.7 +webencodings==0.5.1 +Werkzeug==0.16.0 +xgboost==1.0.1 +yapf==0.26.0 +zipp==0.6.0 diff --git a/PaddleRec/ncf/train.py b/PaddleRec/ncf/train.py new file mode 100644 index 0000000000000000000000000000000000000000..247a4a7d75b0e62ba760f0d23bc4126ce74acc3f --- /dev/null +++ b/PaddleRec/ncf/train.py @@ -0,0 +1,67 @@ +import numpy as np +import os +import paddle.fluid as 
def train(args, train_data_path):
    """Train the selected NCF network and save an inference model per epoch.

    The network is chosen by the first non-zero flag among ``args.GMF``,
    ``args.MLP`` and ``args.NeuMF`` (checked in that order). After every
    epoch the inference model is saved to ``<args.model_dir>/epoch_<epoch>``
    with ``user_input`` and ``item_input`` as feed variables.

    Args:
        args: parsed options (see ``args.parse_args``).
        train_data_path: path of the training CSV read by
            ``utils.CriteoDataset``.

    Raises:
        ValueError: if none of ``args.GMF`` / ``args.MLP`` / ``args.NeuMF``
            is set. Without this check the function would fail later with an
            unbound ``loss`` variable.
    """
    if not (args.GMF or args.MLP or args.NeuMF):
        raise ValueError("no model selected: set one of --GMF, --MLP or --NeuMF to 1")

    print("use_gpu:{}, NeuMF:{}, epochs:{}, batch_size:{}, num_factors:{}, num_neg:{}, lr:{}, model_dir:{}, layers:{}".format(
        args.use_gpu, args.NeuMF, args.epochs, args.batch_size, args.num_factors, args.num_neg, args.lr, args.model_dir, args.layers))

    # The test split is not used during training. The load is kept so that
    # missing or misaligned test files still fail here, before training starts.
    Dataset(args.path + args.dataset)

    train_data_generator = utils.CriteoDataset()
    train_reader = paddle.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)

    inputs = utils.input_data(True)
    if args.GMF:
        model = GMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors)
    elif args.MLP:
        model = MLP()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.layers)
    else:
        # Only NeuMF can remain here thanks to the check at the top.
        model = NeuMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors, args.layers)

    optimizer = fluid.optimizer.AdamOptimizer(args.lr)
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    feeder = fluid.DataFeeder(feed_list=inputs, place=place)

    for epoch in range(args.epochs):

        for batch_id, data in enumerate(train_reader()):
            begin = time.time()
            loss_val = exe.run(program=fluid.default_main_program(),
                               feed=feeder.feed(data),
                               fetch_list=[loss.name],
                               return_numpy=True)
            end = time.time()
            logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}".format(epoch, batch_id, end - begin, np.array(loss_val)[0][0]))

        # One inference model per epoch; infer.py loads epoch_<epochs - 1>.
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch)
        feed_var_names = ["user_input", "item_input"]
        fetch_vars = [pred]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
def input_data(is_train):
    """Declare the int64 feed variables of the network.

    Three ``fluid.data`` variables of shape ``[-1, 1]`` named ``user_input``,
    ``item_input`` and ``label`` are always declared, in that order.

    Returns:
        ``[user_input, item_input, label]`` when ``is_train`` is truthy,
        otherwise ``[user_input, item_input]``.
    """
    user_var, item_var, label_var = [
        fluid.data(name=var_name, shape=[-1, 1], dtype="int64", lod_level=0)
        for var_name in ("user_input", "item_input", "label")
    ]

    if not is_train:
        # The label variable stays declared but is not handed to the caller.
        return [user_var, item_var]
    return [user_var, item_var, label_var]