From e4ad047a0102643cfb94a4e6afa9a1ec4cd969de Mon Sep 17 00:00:00 2001 From: overlordmax <37664905+overlordmax@users.noreply.github.com> Date: Tue, 16 Jun 2020 10:38:09 +0800 Subject: [PATCH] fix bug (#4706) --- PaddleRec/ctr/wide_deep/train.py | 2 +- PaddleRec/ncf/args.py | 1 + PaddleRec/ncf/evaluate.py | 55 ++++++++++++----------------- PaddleRec/ncf/get_train_data.py | 2 +- PaddleRec/ncf/infer.py | 2 +- PaddleRec/rerank/listwise/README.md | 14 -------- PaddleRec/rerank/listwise/args.py | 2 +- PaddleRec/rerank/listwise/train.py | 22 ++++++------ 8 files changed, 39 insertions(+), 61 deletions(-) diff --git a/PaddleRec/ctr/wide_deep/train.py b/PaddleRec/ctr/wide_deep/train.py index c2d9a927..f693c290 100644 --- a/PaddleRec/ctr/wide_deep/train.py +++ b/PaddleRec/ctr/wide_deep/train.py @@ -48,5 +48,5 @@ def train(args, train_data_path): if __name__ == "__main__": args = args.parse_args() logger.info("epoch:{}, batch_size: {}, use_gpu: {}, train_data_path: {}, model_dir: {}, hidden1_units: {}, hidden2_units: {}, hidden3_units: {}".format( - args.epoch, args.batch_size, args.use_gpu, args.train_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units)) + args.epochs, args.batch_size, args.use_gpu, args.train_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units)) train(args, args.train_data_path) diff --git a/PaddleRec/ncf/args.py b/PaddleRec/ncf/args.py index 12c6afee..36751fc2 100644 --- a/PaddleRec/ncf/args.py +++ b/PaddleRec/ncf/args.py @@ -6,6 +6,7 @@ def parse_args(): parser.add_argument('--dataset', nargs='?', default='ml-1m', help='Choose a dataset.') parser.add_argument('--epochs', type=int, default=20, help='Number of epochs.') parser.add_argument('--batch_size', type=int, default=256, help='Batch size.') + parser.add_argument('--test_epoch', type=str, default='19',help='test_epoch') parser.add_argument('--test_batch_size', type=int, default=100, help='Batch size.') parser.add_argument('--num_factors', type=int, default=8, help='Embedding size.') parser.add_argument('--num_users', type=int, default=6040, help='num_users') diff --git a/PaddleRec/ncf/evaluate.py b/PaddleRec/ncf/evaluate.py index e3cb6802..c6206bf0 100644 --- a/PaddleRec/ncf/evaluate.py +++ b/PaddleRec/ncf/evaluate.py @@ -1,5 +1,5 @@ import math -import heapq # for retrieval topK +import heapq # for retrieval topK import multiprocessing import numpy as np from time import time @@ -23,36 +23,30 @@ _K = None _args = None _model_path = None - def run_infer(args, model_path, test_data_path): test_data_generator = utils.Dataset() - + with fluid.scope_guard(fluid.Scope()): - test_reader = fluid.io.batch( - test_data_generator.test(test_data_path, False), - batch_size=args.test_batch_size) - + test_reader = fluid.io.batch(test_data_generator.test(test_data_path, False), batch_size=args.test_batch_size) + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model( - model_path, exe) + infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(model_path, exe) for data in test_reader(): user_input = np.array([dat[0] for dat in data]) item_input = np.array([dat[1] for dat in data]) - pred_val = exe.run( - infer_program, - feed={"user_input": user_input, - "item_input": item_input}, - fetch_list=fetch_vars, - return_numpy=True) - + pred_val = exe.run(infer_program, + feed={"user_input": user_input, + "item_input": item_input}, + fetch_list=fetch_vars, + return_numpy=True) + return pred_val[0].reshape(1, -1).tolist()[0] - def evaluate_model(args, testRatings, testNegatives, K, model_path): """ Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation @@ -62,23 +56,22 @@ def evaluate_model(args, testRatings, testNegatives, K, model_path): global _testRatings global _testNegatives global _K - global _model_path + global _model_path global _args - + _args = args - _model_path = model_path + _model_path= model_path _testRatings = testRatings _testNegatives = testNegatives _K = K - - hits, ndcgs = [], [] + + hits, ndcgs = [],[] for idx in range(len(_testRatings)): - (hr, ndcg) = eval_one_rating(idx) + (hr,ndcg) = eval_one_rating(idx) hits.append(hr) - ndcgs.append(ndcg) + ndcgs.append(ndcg) return (hits, ndcgs) - def eval_one_rating(idx): rating = _testRatings[idx] items = _testNegatives[idx] @@ -87,9 +80,9 @@ def eval_one_rating(idx): items.append(gtItem) # Get prediction scores map_item_score = {} - users = np.full(len(items), u, dtype='int32') - users = users.reshape(-1, 1) - items_array = np.array(items).reshape(-1, 1) + users = np.full(len(items), u, dtype = 'int32') + users = users.reshape(-1,1) + items_array = np.array(items).reshape(-1,1) temp = np.hstack((users, items_array)) np.savetxt("Data/test.txt", temp, fmt='%d', delimiter=',') predictions = run_infer(_args, _model_path, _args.test_data_path) @@ -98,7 +91,7 @@ def eval_one_rating(idx): item = items[i] map_item_score[item] = predictions[i] items.pop() - + # Evaluate top rank list ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) hr = getHitRatio(ranklist, gtItem) @@ -106,17 +99,15 @@ def eval_one_rating(idx): return (hr, ndcg) - def getHitRatio(ranklist, gtItem): for item in ranklist: if item == gtItem: return 1 return 0 - def getNDCG(ranklist, gtItem): for i in range(len(ranklist)): item = ranklist[i] if item == gtItem: - return math.log(2) / math.log(i + 2) + return math.log(2) / math.log(i+2) return 0 diff --git a/PaddleRec/ncf/get_train_data.py b/PaddleRec/ncf/get_train_data.py index e87bdcad..44578dd1 100644 --- a/PaddleRec/ncf/get_train_data.py +++ b/PaddleRec/ncf/get_train_data.py @@ -32,7 +32,7 @@ def get_train_data(filename, write_file, num_negatives): file = open(write_file, 'w') print("writing " + write_file) - for (u, i) in mat: + for (u, i) in mat.keys(): # positive instance user_input = str(u) item_input = str(i) diff --git a/PaddleRec/ncf/infer.py b/PaddleRec/ncf/infer.py index 29c1f03f..cac2270f 100644 --- a/PaddleRec/ncf/infer.py +++ b/PaddleRec/ncf/infer.py @@ -23,7 +23,7 @@ if __name__ == "__main__": topK = 10 begin = time.time() - model_path = args.model_dir + "/epoch_" + str(12) + model_path = args.model_dir + "/epoch_" + args.test_epoch (hits, ndcgs) = evaluate_model(args, testRatings, testNegatives, topK, model_path) hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() end = time.time() diff --git a/PaddleRec/rerank/listwise/README.md b/PaddleRec/rerank/listwise/README.md index 47e46beb..cdd23473 100644 --- a/PaddleRec/rerank/listwise/README.md +++ b/PaddleRec/rerank/listwise/README.md @@ -27,20 +27,6 @@ python3.7 -## 数据集说明 - -本项目构造数据集验证模型的正确性,字段说明如下: - -user_slot_name:用户端特征群id - -item_slot_name:item段特征群id - -lenght:item的长度 - -label:用户对给定的是否点击item的list - -注意:由于构造数据集的限制,本项目只用一个epoch,如果多个epoch,则多个epoch的数据是变化的,没有意义,因此只采用一个epoch。 - ## 单机训练 GPU环境 diff --git a/PaddleRec/rerank/listwise/args.py b/PaddleRec/rerank/listwise/args.py index 5fc63297..d52791e2 100644 --- a/PaddleRec/rerank/listwise/args.py +++ b/PaddleRec/rerank/listwise/args.py @@ -22,7 +22,7 @@ import sys def parse_args(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--epochs", type=int, default=1, help="epochs") + parser.add_argument("--epochs", type=int, default=20, help="epochs") parser.add_argument("--batch_size", type=int, default=32, help="batch_size") parser.add_argument("--test_epoch", type=int, default=1, help="test_epoch") parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') diff --git a/PaddleRec/rerank/listwise/train.py b/PaddleRec/rerank/listwise/train.py index 78f9c5be..c8b5d310 100644 --- a/PaddleRec/rerank/listwise/train.py +++ b/PaddleRec/rerank/listwise/train.py @@ -51,17 +51,17 @@ def train(args): train_reader = fluid.io.batch(train_data_generator.get_train_data(), batch_size=args.batch_size) loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True) loader.set_sample_list_generator(train_reader, places=place) - for epoch in range(args.epochs): - for i in range(args.sample_size): - for batch_id, data in enumerate(loader()): - begin = time.time() - loss_val, auc = exe.run(program=fluid.default_main_program(), - feed=data, - fetch_list=[loss.name, auc_val], - return_numpy=True) - end = time.time() - logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( - epoch, batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc)))) + + for i in range(args.sample_size): + for batch_id, data in enumerate(loader()): + begin = time.time() + loss_val, auc = exe.run(program=fluid.default_main_program(), + feed=data, + fetch_list=[loss.name, auc_val], + return_numpy=True) + end = time.time() + logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( + batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc)))) #save model model_dir = os.path.join(args.model_dir, 'epoch_' + str(1), "checkpoint") -- GitLab