From 0b66650ed07db1b749996410c148ab060d8329b0 Mon Sep 17 00:00:00 2001
From: overlord <515704170@qq.com>
Date: Wed, 13 May 2020 16:27:01 +0800
Subject: [PATCH] fix wide_deep, esmm and ncf: switch to DataLoader feeding
 and remove dead code

Replace the deprecated fluid.DataFeeder / paddle.batch pipeline with
fluid.io.batch plus fluid.io.DataLoader.from_generator in the wide_deep
and ncf train/infer scripts, rename the misnamed CriteoDataset readers
to Dataset, drop the unused CriteoDataset reader from esmm, remove
commented-out debug code and trailing blank lines, and fix the esmm and
ncf docs and run scripts.
---
 PaddleRec/ctr/wide_deep/args.py          |  2 +-
 PaddleRec/ctr/wide_deep/infer.py         | 14 +++---
 PaddleRec/ctr/wide_deep/net.py           |  4 ++
 PaddleRec/ctr/wide_deep/train.py         | 13 +++---
 PaddleRec/ctr/wide_deep/utils.py         |  2 +-
 PaddleRec/multi_task/esmm/README.md      | 10 ++++-
 .../multi_task/esmm/dataset_generator.py |  5 +--
 PaddleRec/multi_task/esmm/infer.py       | 18 +-------
 PaddleRec/multi_task/esmm/net.py         | 35 +++------------
 PaddleRec/multi_task/esmm/reader.py      | 31 +++++--------
 PaddleRec/multi_task/esmm/train.py       | 10 -----
 PaddleRec/multi_task/esmm/utils.py       | 44 -------------------
 PaddleRec/ncf/Dataset.py                 |  5 +--
 PaddleRec/ncf/README.md                  |  1 +
 PaddleRec/ncf/evaluate.py                |  1 -
 PaddleRec/ncf/gmf.py                     | 21 +--------
 PaddleRec/ncf/infer.py                   | 20 +--------
 PaddleRec/ncf/mlp.py                     | 22 +---------
 PaddleRec/ncf/neumf.py                   | 21 ---------
 PaddleRec/ncf/train.py                   | 17 +++----
 PaddleRec/ncf/train_gpu.sh               |  4 +-
 PaddleRec/ncf/utils.py                   |  4 +-
 22 files changed, 66 insertions(+), 238 deletions(-)

diff --git a/PaddleRec/ctr/wide_deep/args.py b/PaddleRec/ctr/wide_deep/args.py
index 128e9f01..e63383bb 100644
--- a/PaddleRec/ctr/wide_deep/args.py
+++ b/PaddleRec/ctr/wide_deep/args.py
@@ -27,7 +27,7 @@ def parse_args():
     parser.add_argument("--epochs", type=int, default=40, help="epochs")
     parser.add_argument("--batch_size", type=int, default=40, help="batch_size")
     parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
-    parser.add_argument('--test_epoch', type=str, default='1',help='test_epoch')
+    parser.add_argument('--test_epoch', type=str, default='39',help='test_epoch')
     parser.add_argument('--train_path', type=str, default='data/adult.data', help='train_path')
     parser.add_argument('--test_path', type=str, default='data/adult.test', help='test_path')
     parser.add_argument('--train_data_path', type=str, default='train_data/train_data.csv', help='train_data_path')
diff --git a/PaddleRec/ctr/wide_deep/infer.py b/PaddleRec/ctr/wide_deep/infer.py
index 4ad524a8..68c0bfb9 100644
--- a/PaddleRec/ctr/wide_deep/infer.py
+++ b/PaddleRec/ctr/wide_deep/infer.py
@@ -27,10 +27,8 @@ def set_zero(var_name,scope=fluid.global_scope(), place=fluid.CPUPlace(),param_t
 
 def run_infer(args,test_data_path):
     wide_deep_model = wide_deep()
-
-    test_data_generator = utils.CriteoDataset()
-    test_reader = paddle.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
-
+    test_data_generator = utils.Dataset()
+    test_reader = fluid.io.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
     inference_scope = fluid.Scope()
     startup_program = fluid.framework.Program()
     test_program = fluid.framework.Program()
@@ -43,20 +41,20 @@ def run_infer(args,test_data_path):
         place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
         loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
         exe = fluid.Executor(place)
-        exe.run(startup_program)
         fluid.load(fluid.default_main_program(), cur_model_path,exe)
 
-        feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+        loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+        loader.set_sample_list_generator(test_reader, places=place)
 
         for var in auc_states:  # reset auc states
             set_zero(var.name, scope=inference_scope, place=place)
 
         mean_acc = []
         mean_auc = []
-        for batch_id, data in enumerate(test_reader()):
+        for batch_id, data in enumerate(loader()):
             begin = time.time()
             acc_val,auc_val = exe.run(program=test_program,
-                                      feed=feeder.feed(data),
+                                      feed=data,
                                       fetch_list=[acc.name, auc.name],
                                       return_numpy=True
                                       )
diff --git a/PaddleRec/ctr/wide_deep/net.py b/PaddleRec/ctr/wide_deep/net.py
index f598bdc9..53e0316d 100644
--- a/PaddleRec/ctr/wide_deep/net.py
+++ b/PaddleRec/ctr/wide_deep/net.py
@@ -5,6 +5,7 @@ import paddle.fluid as fluid
 class wide_deep(object):
 
     def wide_part(self, data):
+
         out = fluid.layers.fc(input=data,
                               size=1,
                               param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
@@ -14,6 +15,7 @@ class wide_deep(object):
         return out
 
     def fc(self, data, hidden_units, active, tag):
+
         output = fluid.layers.fc(input=data,
                                  size=hidden_units,
                                  param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
@@ -23,6 +25,7 @@ class wide_deep(object):
         return output
 
     def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units):
+
         l1 = self.fc(data, hidden1_units, 'relu', 'l1')
         l2 = self.fc(l1, hidden2_units, 'relu', 'l2')
         l3 = self.fc(l2, hidden3_units, 'relu', 'l3')
@@ -38,6 +41,7 @@ class wide_deep(object):
         return inputs
 
     def model(self, inputs, hidden1_units, hidden2_units, hidden3_units):
+
         wide_output = self.wide_part(inputs[0])
         deep_output = self.deep_part(inputs[1], hidden1_units, hidden2_units, hidden3_units)
 
diff --git a/PaddleRec/ctr/wide_deep/train.py b/PaddleRec/ctr/wide_deep/train.py
index 1d07c7cc..65a3a980 100644
--- a/PaddleRec/ctr/wide_deep/train.py
+++ b/PaddleRec/ctr/wide_deep/train.py
@@ -15,8 +15,8 @@ logger.setLevel(logging.INFO)
 def train(args, train_data_path):
     wide_deep_model = wide_deep()
     inputs = wide_deep_model.input_data()
-    train_data_generator = utils.CriteoDataset()
-    train_reader = paddle.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
+    train_data_generator = utils.Dataset()
+    train_reader = fluid.io.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
 
     loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
     optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.01)
@@ -25,13 +25,16 @@ def train(args, train_data_path):
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-    feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+
+    loader = fluid.io.DataLoader.from_generator(
+        feed_list=inputs, capacity=args.batch_size, iterable=True)
+    loader.set_sample_list_generator(train_reader, places=place)
 
     for epoch in range(args.epochs):
-        for batch_id, data in enumerate(train_reader()):
+        for batch_id, data in enumerate(loader()):
             begin = time.time()
             loss_val, acc_val, auc_val = exe.run(program=fluid.default_main_program(),
-                                                 feed=feeder.feed(data),
+                                                 feed=data,
                                                  fetch_list=[loss.name, acc.name, auc.name],
                                                  return_numpy=True)
             end = time.time()
diff --git a/PaddleRec/ctr/wide_deep/utils.py b/PaddleRec/ctr/wide_deep/utils.py
index fc239624..fb2c7bec 100644
--- a/PaddleRec/ctr/wide_deep/utils.py
+++ b/PaddleRec/ctr/wide_deep/utils.py
@@ -2,7 +2,7 @@ import numpy as np
 import os
 import paddle.fluid as fluid
 
-class CriteoDataset(object):
+class Dataset(object):
 
     def _reader_creator(self, file):
         def reader():
diff --git a/PaddleRec/multi_task/esmm/README.md b/PaddleRec/multi_task/esmm/README.md
index f5acde01..e79a7323 100644
--- a/PaddleRec/multi_task/esmm/README.md
+++ b/PaddleRec/multi_task/esmm/README.md
@@ -108,7 +108,7 @@ python infer.py --use_gpu True\  # whether to use GPU
 
 CPU environment
 
-Set the data paths and parameters in the cpu_train.sh script.
+Set the data paths and parameters in the cpu_infer.sh script.
 
 ```shell
 python infer.py --use_gpu False\  # whether to use GPU
@@ -118,6 +118,14 @@ python infer.py --use_gpu False\  # whether to use GPU
     --vocab_path ./vocab/vocab_size.txt  # path of the embedding vocab size file
 ```
 
+Make the script executable, then run it:
+
+```
+./cpu_infer.sh
+```
+
+
+
 ## Results
 
 For now only part of the data is used to verify model correctness. Example prediction results:
diff --git a/PaddleRec/multi_task/esmm/dataset_generator.py b/PaddleRec/multi_task/esmm/dataset_generator.py
index ce7f31ce..8e93977a 100644
--- a/PaddleRec/multi_task/esmm/dataset_generator.py
+++ b/PaddleRec/multi_task/esmm/dataset_generator.py
@@ -15,8 +15,6 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
         def reader():
             features = line.strip().split(',')
 
-            #ctr = list(map(int, features[1]))
-            #cvr = list(map(int, features[2]))
             ctr = features[1]
             cvr = features[2]
 
@@ -28,8 +26,7 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
                 if field_id not in all_field_id_dict:
                     continue
                 all_field_id_dict[field_id][0] = True
-                index = all_field_id_dict[field_id][1]
-                #feat_id = list(map(int, feat_id))
+                index = all_field_id_dict[field_id][1]
                 output[index][1].append(feat_id)
 
             for field_id in all_field_id_dict:
diff --git a/PaddleRec/multi_task/esmm/infer.py b/PaddleRec/multi_task/esmm/infer.py
index b1788c70..4c735337 100644
--- a/PaddleRec/multi_task/esmm/infer.py
+++ b/PaddleRec/multi_task/esmm/infer.py
@@ -25,11 +25,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
 
     place = fluid.CPUPlace()
     esmm_model = ESMM()
-
-    test_data_generator = utils.CriteoDataset()
-    test_reader = paddle.batch(test_data_generator.test(test_data_path),batch_size=args.batch_size)
-
-
    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
 
@@ -41,7 +36,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
         dataset, file_list = utils.get_dataset(inputs, test_data_path,args.batch_size,args.cpu_num)
 
         exe = fluid.Executor(place)
-        # load the model
         fluid.load(fluid.default_main_program(),os.path.join(model_path, "checkpoint"), exe)
 
         set_zero(place)
@@ -70,14 +64,4 @@ if __name__ == "__main__":
 
         logger.info("Test model {}".format(model))
         run_infer(args, model,args.test_data_path)
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/PaddleRec/multi_task/esmm/net.py b/PaddleRec/multi_task/esmm/net.py
index 7487cb06..b8a0092b 100644
--- a/PaddleRec/multi_task/esmm/net.py
+++ b/PaddleRec/multi_task/esmm/net.py
@@ -5,7 +5,6 @@
 import paddle
 import utils
 import args
-
 class ESMM(object):
 
     def fc(self,tag, data, out_dim, active='prelu'):
@@ -47,14 +46,12 @@ class ESMM(object):
                                              initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size)
                                          ),
                                          is_sparse=True)
-            #fluid.layers.Print(feat_emb, message="feat_emb")
+
             field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
             emb.append(field_emb)
         concat_emb = fluid.layers.concat(emb, axis=1)
-
-
+
         # ctr
-
         active = 'relu'
         ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active)
         ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active)
@@ -67,15 +64,10 @@ class ESMM(object):
 
         ctr_clk = inputs[-2]
         ctcvr_buy = inputs[-1]
 
-        #ctr_label = fluid.layers.concat(input=[ctr_clk,1-ctr_clk],axis=1)
-        #ctcvr_label = fluid.layers.concat(input=[ctcvr_buy,1-ctcvr_buy],axis=1)
-        #ctr_label = fluid.layers.cast(x=ctr_label, dtype='float32')
-        #ctcvr_label = fluid.layers.cast(x=ctcvr_label, dtype='float32')
-
+
         ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
         cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
-
         ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
         ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
 
@@ -83,26 +75,9 @@ class ESMM(object):
         loss_ctcvr = paddle.fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
         cost = loss_ctr + loss_ctcvr
         avg_cost = fluid.layers.mean(cost)
-        #fluid.layers.Print(ctr_clk, message="ctr_clk")
+
         auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
         auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
 
         return avg_cost,auc_ctr,auc_ctcvr
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/PaddleRec/multi_task/esmm/reader.py b/PaddleRec/multi_task/esmm/reader.py
index 5b675570..4ccd31c9 100644
--- a/PaddleRec/multi_task/esmm/reader.py
+++ b/PaddleRec/multi_task/esmm/reader.py
@@ -7,7 +7,7 @@ import os
 def join_data(file1,file2,write_file,sample_size):
     sample_list = []
     common_logs = defaultdict(lambda: '')
-    file = open(write_file, 'w',encoding='utf-8')
+    file = open(write_file, 'w')
     print("begin push sample_list!")
 
     with open(file1,'r') as f:
@@ -45,7 +45,7 @@ def join_data(file1,file2,write_file,sample_size):
 
 def read_data(file_name,write_file):
-    file = open(write_file, 'w',encoding='utf-8')
+    file = open(write_file, 'w')
     print("begin to write!")
     with open(file_name,'r') as f:
         for i, line in enumerate(f):
@@ -65,7 +65,7 @@ def read_data(file_name,write_file):
 
             #sample_id|y|z|common_feature_index|feat_num|feat_list
             elif(feat_len == 6):
-                # y=0 & z=1过滤
+                # y=0 & z=1 filter
                 if(line[1] == '0' and line[2] == '1'):
                     continue
                 feat_strs = line[5]
@@ -80,15 +80,13 @@ def read_data(file_name,write_file):
     file.close()
 
-
-## re-encode the feature ids
+
 def recode(file_path,writh_file,vocab_path):
     all_feat_id_dict = defaultdict(int)
-    file1 = open(writh_file[0], 'w',encoding='utf-8')
-    file2 = open(writh_file[1], 'w',encoding='utf-8')
-    vocab_file = open(vocab_path, 'w',encoding='utf-8')
+    file1 = open(writh_file[0], 'w')
+    file2 = open(writh_file[1], 'w')
+    vocab_file = open(vocab_path, 'w')
     id = 0
-    with open(file_path[0], "r", encoding='utf-8') as f:
+    with open(file_path[0], "r") as f:
         for i, line in enumerate(f):
             line = line.strip().split(',')
             feat_lists = []
@@ -100,7 +98,7 @@ def recode(file_path,writh_file,vocab_path):
                 feat_lists.append('%s:%s' % (field_id,all_feat_id_dict[feat_id]))
             sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
             file1.write(sample)
-    with open(file_path[1], "r", encoding='utf-8') as f:
+    with open(file_path[1], "r") as f:
         for i, line in enumerate(f):
             line = line.strip().split(',')
             feat_lists = []
@@ -131,7 +129,7 @@ if __name__ == "__main__":
     write_file = args.train_data_path + '/train_data.csv'
     join_data(skeleton_train_path,features_train_path,write_file,args.train_sample_size)
 
-    ## remove the generated intermediate files
+
     os.system('rm -rf ' + skeleton_train_path)
     os.system('rm -rf ' + features_train_path)
 
@@ -146,7 +144,7 @@ if __name__ == "__main__":
     write_file = args.test_data_path + '/test_data.csv'
     join_data(skeleton_test_path,features_test_path,write_file,args.test_sample_size)
 
-    ## remove the generated intermediate files
+
     os.system('rm -rf ' + skeleton_test_path)
     os.system('rm -rf ' + features_test_path)
 
@@ -154,13 +152,6 @@ if __name__ == "__main__":
     file_path = [args.train_data_path + '/train_data.csv', args.test_data_path + '/test_data.csv']
     write_file = [args.train_data_path + '/traindata.csv',args.test_data_path + '/testdata.csv']
     recode(file_path,write_file,args.vocab_path)
-    ## remove the generated intermediate files
+
     for file in file_path:
         os.system('rm -rf ' + file_path)
-
-
-
-
-
-
-
diff --git a/PaddleRec/multi_task/esmm/train.py b/PaddleRec/multi_task/esmm/train.py
index 0ca95dcb..2cb5e781 100644
--- a/PaddleRec/multi_task/esmm/train.py
+++ b/PaddleRec/multi_task/esmm/train.py
@@ -6,9 +6,6 @@ import paddle
 import utils
 import args
 
-
-
-
 def train(args, vocab_size, train_data_path):
     esmm_model = ESMM()
     inputs = esmm_model.input_data()
@@ -16,8 +13,6 @@ def train(args, vocab_size, train_data_path):
     dataset, file_list = utils.get_dataset(inputs, train_data_path,args.batch_size,args.cpu_num)
 
     avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
-
-    # pick the optimization strategy for the backward update
     optimizer = fluid.optimizer.Adam()
     optimizer.minimize(avg_cost)
 
@@ -42,12 +37,7 @@ def train(args, vocab_size, train_data_path):
         main_program = fluid.default_main_program()
         fluid.io.save(main_program,model_dir)
 
-
-
 if __name__ == "__main__":
     args = args.parse_args()
     vocab_size =utils.get_vocab_size(args.vocab_path)
     train(args, vocab_size, args.train_data_path)
-
-
-
diff --git a/PaddleRec/multi_task/esmm/utils.py b/PaddleRec/multi_task/esmm/utils.py
index 81828661..7ab66dff 100644
--- a/PaddleRec/multi_task/esmm/utils.py
+++ b/PaddleRec/multi_task/esmm/utils.py
@@ -31,47 +31,3 @@ def get_vocab_size(vocab_path):
         line = rf.readline()
     return int(line.strip()) + 1
 
-
-class CriteoDataset(object):
-
-    def _reader_creator(self, file):
-        def reader():
-            with open(file, 'r') as f:
-                for line in f:
-                    features = line.strip().split(',')
-                    ctr = features[1]
-                    cvr = features[2]
-
-                    padding = '0'
-                    output = [(field_id,[]) for field_id in all_field_id_dict]
-
-                    for elem in features[4:]:
-                        field_id,feat_id = elem.strip().split(':')
-                        if field_id not in all_field_id_dict:
-                            continue
-                        all_field_id_dict[field_id][0] = True
-                        index = all_field_id_dict[field_id][1]
-                        output[index][1].append(feat_id)
-
-                    for field_id in all_field_id_dict:
-                        visited,index = all_field_id_dict[field_id]
-                        if visited:
-                            all_field_id_dict[field_id][0] = False
-                        else:
-                            output[index][1].append(padding)
-                    output.append(('ctr',ctr))
-                    output.append(('cvr',cvr))
-                    yield output
-
-        return reader
-
-    def train(self, file):
-        return self._reader_creator(file)
-
-    def test(self, file):
-        return self._reader_creator(file)
-
-
-
-
-    
\ No newline at end of file
diff --git a/PaddleRec/ncf/Dataset.py b/PaddleRec/ncf/Dataset.py
index 9cbe0200..ce9a6ada 100644
--- a/PaddleRec/ncf/Dataset.py
+++ b/PaddleRec/ncf/Dataset.py
@@ -32,7 +32,4 @@ class Dataset(object):
                     negatives.append(int(x))
                 negativeList.append(negatives)
                 line = f.readline()
-        return negativeList
-
-
-
+        return negativeList
\ No newline at end of file
diff --git a/PaddleRec/ncf/README.md b/PaddleRec/ncf/README.md
index 53def11f..1069701a 100644
--- a/PaddleRec/ncf/README.md
+++ b/PaddleRec/ncf/README.md
@@ -3,6 +3,7 @@
 Below is a brief directory structure and description of this example:
 
 ```
+├── Data/            # raw dataset directory
 ├── README.md        # docs
 ├── requirements.txt # required packages
 ├── gmf.py           # GMF network file
diff --git a/PaddleRec/ncf/evaluate.py b/PaddleRec/ncf/evaluate.py
index b91a7e26..a8becd1b 100644
--- a/PaddleRec/ncf/evaluate.py
+++ b/PaddleRec/ncf/evaluate.py
@@ -14,7 +14,6 @@ import paddle
 import args
 import utils
 import time
-#from numba import jit, autojit
 
 # Global variables that are shared across processes
 _model = None
diff --git a/PaddleRec/ncf/gmf.py b/PaddleRec/ncf/gmf.py
index fc6a3620..7f97dee4 100644
--- a/PaddleRec/ncf/gmf.py
+++ b/PaddleRec/ncf/gmf.py
@@ -28,23 +28,4 @@ class GMF(object):
 
         return avg_cost, prediction
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/PaddleRec/ncf/infer.py b/PaddleRec/ncf/infer.py
index 9da89c3a..29c1f03f 100644
--- a/PaddleRec/ncf/infer.py
+++ b/PaddleRec/ncf/infer.py
@@ -23,26 +23,10 @@ if __name__ == "__main__":
     topK = 10
 
     begin = time.time()
-    model_path = args.model_dir + "/epoch_" + str(args.epochs - 1)
+    model_path = args.model_dir + "/epoch_" + str(12)
     (hits, ndcgs) = evaluate_model(args, testRatings, testNegatives, topK, model_path)
     hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
     end = time.time()
 
     logger.info("epoch: {}, epoch_time: {:.5f}s, HR: {:.5f}, NDCG: {:.5f}".format(args.epochs, end - begin, hr, ndcg))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/PaddleRec/ncf/mlp.py b/PaddleRec/ncf/mlp.py
index 12566c7c..38fbd381 100644
--- a/PaddleRec/ncf/mlp.py
+++ b/PaddleRec/ncf/mlp.py
@@ -30,7 +30,6 @@ class MLP(object):
                                        name='layer_' + str(i))
 
         # Final prediction layer
-
         prediction = fluid.layers.fc(input=vector,
                                      size=1,
                                      act='sigmoid',
@@ -42,23 +41,4 @@ class MLP(object):
 
         return avg_cost, prediction
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
+    
\ No newline at end of file
diff --git a/PaddleRec/ncf/neumf.py b/PaddleRec/ncf/neumf.py
index 5330e278..aecbc78e 100644
--- a/PaddleRec/ncf/neumf.py
+++ b/PaddleRec/ncf/neumf.py
@@ -31,14 +31,12 @@ class NeuMF(object):
         mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
         mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
         mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)
-        #fluid.layers.Print(mf_vector, message="mf_vector")
 
         # MLP part
         # The 0-th layer is the concatenation of embedding layers
         mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
         mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
         mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)
-        #fluid.layers.Print(mlp_vector, message="mlp_vector")
 
         for i in range(1, num_layer):
             mlp_vector = fluid.layers.fc(input=mlp_vector,
@@ -62,24 +60,5 @@ class NeuMF(object):
         avg_cost = fluid.layers.mean(cost)
 
         return avg_cost, prediction
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    
\ No newline at end of file
diff --git a/PaddleRec/ncf/train.py b/PaddleRec/ncf/train.py
index 247a4a7d..6cd17ed1 100644
--- a/PaddleRec/ncf/train.py
+++ b/PaddleRec/ncf/train.py
@@ -22,8 +22,8 @@ def train(args, train_data_path):
     dataset = Dataset(args.path + args.dataset)
     testRatings, testNegatives = dataset.testRatings, dataset.testNegatives
 
-    train_data_generator = utils.CriteoDataset()
-    train_reader = paddle.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
+    train_data_generator = utils.Dataset()
+    train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
 
     inputs = utils.input_data(True)
     if args.GMF:
@@ -42,14 +42,17 @@ def train(args, train_data_path):
 
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-    feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+
+    loader = fluid.io.DataLoader.from_generator(
+        feed_list=inputs, capacity=args.batch_size, iterable=True)
+    loader.set_sample_list_generator(train_reader, places=place)
 
     for epoch in range(args.epochs):
-        for batch_id, data in enumerate(train_reader()):
+        for batch_id, data in enumerate(loader()):
             begin = time.time()
             loss_val = exe.run(program=fluid.default_main_program(),
-                               feed=feeder.feed(data),
+                               feed=data,
                                fetch_list=[loss.name],
                                return_numpy=True)
             end = time.time()
@@ -59,9 +62,7 @@ def train(args, train_data_path):
         feed_var_names = ["user_input", "item_input"]
         fetch_vars = [pred]
         fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
-
-
-    
+    
 if __name__ == "__main__":
     args = args.parse_args()
     train(args, args.train_data_path)
diff --git a/PaddleRec/ncf/train_gpu.sh b/PaddleRec/ncf/train_gpu.sh
index 6e8ed3a4..f6691865 100644
--- a/PaddleRec/ncf/train_gpu.sh
+++ b/PaddleRec/ncf/train_gpu.sh
@@ -1,8 +1,8 @@
 CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 \
-                                       --NeuMF 1 \
+                                       --MLP 1 \
                                        --epochs 20 \
                                        --batch_size 256 \
                                        --num_factors 8 \
                                        --num_neg 4 \
                                        --lr 0.001 \
-                                       --model_dir 'model_dir'
\ No newline at end of file
+                                       --model_dir 'mlp_model_dir'
\ No newline at end of file
diff --git a/PaddleRec/ncf/utils.py b/PaddleRec/ncf/utils.py
index 400de8a2..96cdec53 100644
--- a/PaddleRec/ncf/utils.py
+++ b/PaddleRec/ncf/utils.py
@@ -2,7 +2,7 @@ import numpy as np
 import os
 import paddle.fluid as fluid
 
-class CriteoDataset(object):
+class Dataset(object):
 
     def _reader_creator(self, file, is_train):
         def reader():
@@ -35,5 +35,5 @@ def input_data(is_train):
         inputs = [user_input] + [item_input] + [label]
     else:
         inputs = [user_input] + [item_input]
-        
+
     return inputs
\ No newline at end of file
-- 
GitLab
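
Note: the recurring change in this patch is the move from the deprecated fluid.DataFeeder / paddle.batch pair to fluid.io.batch plus fluid.io.DataLoader.from_generator. The sketch below is a minimal, self-contained illustration of that migration pattern; the toy reader and the tiny network are assumptions for the example, not code from this repository — only the feeding pattern mirrors the patch.

```python
# Minimal sketch of the DataFeeder -> DataLoader migration applied in this
# patch (wide_deep and ncf train/infer). Toy reader/network are placeholders.
import numpy as np
import paddle.fluid as fluid

def sample_reader():
    # Stand-in for utils.Dataset().train(...): yields one sample at a time,
    # one numpy array per feed variable.
    for i in range(256):
        yield [np.array([i % 10]).astype('int64'),
               np.array([i % 2]).astype('int64')]

x = fluid.data(name='x', shape=[None, 1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')

# A trivial network so the program has something to run.
x_float = fluid.layers.cast(x, 'float32')
fc = fluid.layers.fc(input=x_float, size=1)
loss = fluid.layers.mean(fc)

# fluid.io.batch replaces paddle.batch: it groups samples into sample lists.
batched_reader = fluid.io.batch(sample_reader, batch_size=32)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Old style, removed by this patch:
#     feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
#     for data in batched_reader():
#         exe.run(feed=feeder.feed(data), ...)

# New style: an iterable DataLoader bound to the same batched reader.
loader = fluid.io.DataLoader.from_generator(
    feed_list=[x, label], capacity=32, iterable=True)
loader.set_sample_list_generator(batched_reader, places=place)

for batch_id, data in enumerate(loader()):
    # `data` already carries feed-ready tensors for x and label, so it is
    # passed to exe.run directly instead of going through feeder.feed(data).
    loss_val = exe.run(program=fluid.default_main_program(),
                       feed=data,
                       fetch_list=[loss.name])
```

With iterable=True the loader owns batching and tensor conversion, so the per-batch feeder.feed(data) call disappears and exe.run(feed=data, ...) is used directly — the same substitution made above in wide_deep/train.py, wide_deep/infer.py, and ncf/train.py.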