Commit 0b66650e authored by overlord

fix some bugs

Parent a5b9f83b
@@ -27,7 +27,7 @@ def parse_args():
parser.add_argument("--epochs", type=int, default=40, help="epochs")
parser.add_argument("--batch_size", type=int, default=40, help="batch_size")
parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
- parser.add_argument('--test_epoch', type=str, default='1', help='test_epoch')
+ parser.add_argument('--test_epoch', type=str, default='39', help='test_epoch')
parser.add_argument('--train_path', type=str, default='data/adult.data', help='train_path')
parser.add_argument('--test_path', type=str, default='data/adult.test', help='test_path')
parser.add_argument('--train_data_path', type=str, default='train_data/train_data.csv', help='train_data_path')
......
@@ -27,10 +27,8 @@ def set_zero(var_name,scope=fluid.global_scope(), place=fluid.CPUPlace(),param_t
def run_infer(args,test_data_path):
wide_deep_model = wide_deep()
- test_data_generator = utils.CriteoDataset()
- test_reader = paddle.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
+ test_data_generator = utils.Dataset()
+ test_reader = fluid.io.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
inference_scope = fluid.Scope()
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
@@ -43,20 +41,20 @@ def run_infer(args,test_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
exe = fluid.Executor(place)
exe.run(startup_program)
fluid.load(fluid.default_main_program(), cur_model_path,exe)
- feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+ loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+ loader.set_sample_list_generator(test_reader, places=place)
for var in auc_states:  # reset auc states
set_zero(var.name, scope=inference_scope, place=place)
mean_acc = []
mean_auc = []
- for batch_id, data in enumerate(test_reader()):
+ for batch_id, data in enumerate(loader()):
begin = time.time()
acc_val,auc_val = exe.run(program=test_program,
- feed=feeder.feed(data),
+ feed=data,
fetch_list=[acc.name, auc.name],
return_numpy=True
)
......
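Note on the change pattern above: the commit replaces the `paddle.batch` + `fluid.DataFeeder` feeding path with `fluid.io.batch` plus a `fluid.io.DataLoader` built from the same sample-level reader, and passes the loader's output directly as `feed=data`. The same migration appears again in train.py below. A minimal, self-contained sketch of that pattern under Paddle 1.x static-graph APIs; the reader, variables, and batch size here (`fake_reader`, `x`, `y`, 16) are illustrative and not taken from the repo:

```python
import numpy as np
import paddle.fluid as fluid

def fake_reader():
    # Hypothetical sample-level reader: each sample is [features, label].
    def reader():
        for _ in range(64):
            yield [np.random.random(8).astype('float32'),
                   np.random.randint(0, 2, (1,)).astype('int64')]
    return reader

x = fluid.data(name='x', shape=[None, 8], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='int64')
out = fluid.layers.reduce_mean(x)

batched = fluid.io.batch(fake_reader(), batch_size=16)   # was: paddle.batch(...)
loader = fluid.io.DataLoader.from_generator(              # was: fluid.DataFeeder(...)
    feed_list=[x, y], capacity=16, iterable=True)
place = fluid.CPUPlace()
loader.set_sample_list_generator(batched, places=place)

exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for batch_id, data in enumerate(loader()):
    # data already matches feed_list, so it is passed directly (was: feeder.feed(data)).
    out_val, = exe.run(fluid.default_main_program(), feed=data, fetch_list=[out.name])
```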
@@ -5,6 +5,7 @@ import paddle.fluid as fluid
class wide_deep(object):
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
@@ -14,6 +15,7 @@ class wide_deep(object):
return out
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
@@ -23,6 +25,7 @@ class wide_deep(object):
return output
def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units):
l1 = self.fc(data, hidden1_units, 'relu', 'l1')
l2 = self.fc(l1, hidden2_units, 'relu', 'l2')
l3 = self.fc(l2, hidden3_units, 'relu', 'l3')
@@ -38,6 +41,7 @@ class wide_deep(object):
return inputs
def model(self, inputs, hidden1_units, hidden2_units, hidden3_units):
wide_output = self.wide_part(inputs[0])
deep_output = self.deep_part(inputs[1], hidden1_units, hidden2_units, hidden3_units)
......
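The model.py hunk above defines the wide branch (a single size-1 fc) and a three-layer deep branch, but the fusion of the two is outside the shown context. For orientation only, a hedged sketch of the standard Wide&Deep fusion, assuming the repo follows the usual formulation; the stand-in variables and the final fc are made up here:

```python
import paddle.fluid as fluid

# Stand-ins for the branch outputs produced by wide_part() and deep_part() above.
wide_output = fluid.data(name='wide_out', shape=[None, 1], dtype='float32')
deep_output = fluid.data(name='deep_out', shape=[None, 64], dtype='float32')

# Assumed fusion (not visible in this diff): project the deep branch to a single
# logit, add the wide logit, and squash to a probability with a sigmoid.
deep_logit = fluid.layers.fc(input=deep_output, size=1)
prediction = fluid.layers.sigmoid(wide_output + deep_logit)
```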
@@ -15,8 +15,8 @@ logger.setLevel(logging.INFO)
def train(args, train_data_path):
wide_deep_model = wide_deep()
inputs = wide_deep_model.input_data()
- train_data_generator = utils.CriteoDataset()
- train_reader = paddle.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
+ train_data_generator = utils.Dataset()
+ train_reader = fluid.io.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.01)
@@ -25,13 +25,16 @@ def train(args, train_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
- feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+ loader = fluid.io.DataLoader.from_generator(
+ feed_list=inputs, capacity=args.batch_size, iterable=True)
+ loader.set_sample_list_generator(train_reader, places=place)
for epoch in range(args.epochs):
- for batch_id, data in enumerate(train_reader()):
+ for batch_id, data in enumerate(loader()):
begin = time.time()
loss_val, acc_val, auc_val = exe.run(program=fluid.default_main_program(),
- feed=feeder.feed(data),
+ feed=data,
fetch_list=[loss.name, acc.name, auc.name],
return_numpy=True)
end = time.time()
......
@@ -2,7 +2,7 @@ import numpy as np
import os
import paddle.fluid as fluid
- class CriteoDataset(object):
+ class Dataset(object):
def _reader_creator(self, file):
def reader():
......
@@ -108,7 +108,7 @@ python infer.py --use_gpu True\  # whether to use GPU
CPU environment
- Set the data paths and parameters in the cpu_train.sh script.
+ Set the data paths and parameters in the cpu_infer.sh script.
```shell
python infer.py --use_gpu False\  # whether to use GPU
@@ -118,6 +118,14 @@ python infer.py --use_gpu False\  # whether to use GPU
--vocab_path ./vocab/vocab_size.txt  # path of the embedding vocabulary-size file
```
+ Make the script executable and run it:
+ ```
+ ./cpu_infer.sh
+ ```
## Model results
At present only part of the data is used to verify the correctness of the model. Example prediction results are shown below:
......
@@ -15,8 +15,6 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
def reader():
features = line.strip().split(',')
- #ctr = list(map(int, features[1]))
- #cvr = list(map(int, features[2]))
ctr = features[1]
cvr = features[2]
@@ -29,7 +27,6 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
continue
all_field_id_dict[field_id][0] = True
index = all_field_id_dict[field_id][1]
- #feat_id = list(map(int, feat_id))
output[index][1].append(feat_id)
for field_id in all_field_id_dict:
......
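For reference, the slot format built by the generator above is one `(field_id, [feat_id, ...])` pair per known field, padded with `'0'` when a field does not appear in the line, followed by the `ctr` and `cvr` labels. An illustrative sample with made-up field and feature ids:

```python
# Purely illustrative sample as yielded by reader(); field ids and feature ids are invented.
sample = [
    ('101', ['4', '17']),  # field seen in the line: all its feat_ids are collected
    ('102', ['0']),        # field missing from the line: padded with '0'
    ('ctr', '1'),          # click label, taken from features[1]
    ('cvr', '0'),          # conversion label, taken from features[2]
]
```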
@@ -25,11 +25,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
place = fluid.CPUPlace()
esmm_model = ESMM()
- test_data_generator = utils.CriteoDataset()
- test_reader = paddle.batch(test_data_generator.test(test_data_path),batch_size=args.batch_size)
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
@@ -41,7 +36,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
dataset, file_list = utils.get_dataset(inputs, test_data_path,args.batch_size,args.cpu_num)
exe = fluid.Executor(place)
- # load the model
fluid.load(fluid.default_main_program(),os.path.join(model_path, "checkpoint"), exe)
set_zero(place)
@@ -71,13 +65,3 @@ if __name__ == "__main__":
run_infer(args, model,args.test_data_path)
\ No newline at end of file
@@ -5,7 +5,6 @@ import paddle
import utils
import args
class ESMM(object):
def fc(self,tag, data, out_dim, active='prelu'):
@@ -47,14 +46,12 @@ class ESMM(object):
initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size)
),
is_sparse=True)
- #fluid.layers.Print(feat_emb, message="feat_emb")
field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
# ctr
active = 'relu'
ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active)
ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active)
@@ -67,15 +64,10 @@ class ESMM(object):
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
- #ctr_label = fluid.layers.concat(input=[ctr_clk,1-ctr_clk],axis=1)
- #ctcvr_label = fluid.layers.concat(input=[ctcvr_buy,1-ctcvr_buy],axis=1)
- #ctr_label = fluid.layers.cast(x=ctr_label, dtype='float32')
- #ctcvr_label = fluid.layers.cast(x=ctcvr_label, dtype='float32')
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
@@ -83,26 +75,9 @@ class ESMM(object):
loss_ctcvr = paddle.fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
- #fluid.layers.Print(ctr_clk, message="ctr_clk")
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
return avg_cost,auc_ctr,auc_ctcvr
\ No newline at end of file
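The hunk above is the core ESMM computation: the CTCVR probability is the elementwise product of the CTR and CVR tower outputs (`ctcvr_prop_one = ctr_prop_one * cvr_prop_one`), and the training loss adds the CTR cross entropy to the CTCVR cross entropy. A tiny numeric illustration of that product rule (the numbers are made up):

```python
# pCTCVR = pCTR * pCVR; the model trains with CE(pCTR, click) + CE(pCTCVR, buy).
p_ctr = 0.20              # P(click | impression) from the CTR tower
p_cvr = 0.10              # P(buy | click) from the CVR tower
p_ctcvr = p_ctr * p_cvr   # P(click and buy | impression)
print(p_ctcvr)            # 0.02
```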
@@ -7,7 +7,7 @@ import os
def join_data(file1,file2,write_file,sample_size):
sample_list = []
common_logs = defaultdict(lambda: '')
- file = open(write_file, 'w',encoding='utf-8')
+ file = open(write_file, 'w')
print("begin push sample_list!")
with open(file1,'r') as f:
@@ -45,7 +45,7 @@ def join_data(file1,file2,write_file,sample_size):
def read_data(file_name,write_file):
- file = open(write_file, 'w',encoding='utf-8')
+ file = open(write_file, 'w')
print("begin to write!")
with open(file_name,'r') as f:
for i, line in enumerate(f):
@@ -65,7 +65,7 @@ def read_data(file_name,write_file):
#sample_id|y|z|common_feature_index|feat_num|feat_list
elif(feat_len == 6):
- # y=0 & z=1过滤
+ # y=0 & z=1 filter
if(line[1] == '0' and line[2] == '1'):
continue
feat_strs = line[5]
@@ -80,15 +80,13 @@ def read_data(file_name,write_file):
file.close()
- ## re-encode the feature ids
def recode(file_path,writh_file,vocab_path):
all_feat_id_dict = defaultdict(int)
- file1 = open(writh_file[0], 'w',encoding='utf-8')
- file2 = open(writh_file[1], 'w',encoding='utf-8')
- vocab_file = open(vocab_path, 'w',encoding='utf-8')
+ file1 = open(writh_file[0], 'w')
+ file2 = open(writh_file[1], 'w')
+ vocab_file = open(vocab_path, 'w')
id = 0
- with open(file_path[0], "r", encoding='utf-8') as f:
+ with open(file_path[0], "r") as f:
for i, line in enumerate(f):
line = line.strip().split(',')
feat_lists = []
@@ -100,7 +98,7 @@ def recode(file_path,writh_file,vocab_path):
feat_lists.append('%s:%s' % (field_id,all_feat_id_dict[feat_id]))
sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
file1.write(sample)
- with open(file_path[1], "r", encoding='utf-8') as f:
+ with open(file_path[1], "r") as f:
for i, line in enumerate(f):
line = line.strip().split(',')
feat_lists = []
@@ -131,7 +129,7 @@ if __name__ == "__main__":
write_file = args.train_data_path + '/train_data.csv'
join_data(skeleton_train_path,features_train_path,write_file,args.train_sample_size)
- ## remove the generated intermediate files
os.system('rm -rf ' + skeleton_train_path)
os.system('rm -rf ' + features_train_path)
@@ -146,7 +144,7 @@ if __name__ == "__main__":
write_file = args.test_data_path + '/test_data.csv'
join_data(skeleton_test_path,features_test_path,write_file,args.test_sample_size)
- ## remove the generated intermediate files
os.system('rm -rf ' + skeleton_test_path)
os.system('rm -rf ' + features_test_path)
@@ -154,13 +152,6 @@ if __name__ == "__main__":
file_path = [args.train_data_path + '/train_data.csv', args.test_data_path + '/test_data.csv']
write_file = [args.train_data_path + '/traindata.csv',args.test_data_path + '/testdata.csv']
recode(file_path,write_file,args.vocab_path)
- ## remove the generated intermediate files
for file in file_path:
os.system('rm -rf ' + file_path)
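The `recode` step above maps every raw `feat_id` string to a small consecutive integer id, rewrites the samples with the new ids, and stores the resulting vocabulary size for `get_vocab_size` (which returns the stored value plus one, as shown further down). A minimal sketch of that re-encoding idea, stripped of the file handling; the raw `field_id:feat_id` tokens are invented:

```python
# Minimal re-encoding sketch; '109_14:a1b2' style tokens are invented examples.
raw_feats = ['109_14:a1b2', '110_14:c3d4', '109_14:a1b2']

feat_to_id = {}            # raw feat_id -> dense integer id
recoded = []
for elem in raw_feats:
    field_id, feat_id = elem.split(':')
    if feat_id not in feat_to_id:
        feat_to_id[feat_id] = len(feat_to_id)
    recoded.append('%s:%s' % (field_id, feat_to_id[feat_id]))

print(recoded)             # ['109_14:0', '110_14:1', '109_14:0']
print(len(feat_to_id))     # roughly what ends up in vocab_path
```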
@@ -6,9 +6,6 @@ import paddle
import utils
import args
def train(args, vocab_size, train_data_path):
esmm_model = ESMM()
inputs = esmm_model.input_data()
@@ -16,8 +13,6 @@ def train(args, vocab_size, train_data_path):
dataset, file_list = utils.get_dataset(inputs, train_data_path,args.batch_size,args.cpu_num)
avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
- # choose the optimizer for the backward update
optimizer = fluid.optimizer.Adam()
optimizer.minimize(avg_cost)
@@ -42,12 +37,7 @@ def train(args, vocab_size, train_data_path):
main_program = fluid.default_main_program()
fluid.io.save(main_program,model_dir)
if __name__ == "__main__":
args = args.parse_args()
vocab_size =utils.get_vocab_size(args.vocab_path)
train(args, vocab_size, args.train_data_path)
@@ -31,47 +31,3 @@ def get_vocab_size(vocab_path):
line = rf.readline()
return int(line.strip()) + 1
- class CriteoDataset(object):
- def _reader_creator(self, file):
- def reader():
- with open(file, 'r') as f:
- for line in f:
- features = line.strip().split(',')
- ctr = features[1]
- cvr = features[2]
- padding = '0'
- output = [(field_id,[]) for field_id in all_field_id_dict]
- for elem in features[4:]:
- field_id,feat_id = elem.strip().split(':')
- if field_id not in all_field_id_dict:
- continue
- all_field_id_dict[field_id][0] = True
- index = all_field_id_dict[field_id][1]
- output[index][1].append(feat_id)
- for field_id in all_field_id_dict:
- visited,index = all_field_id_dict[field_id]
- if visited:
- all_field_id_dict[field_id][0] = False
- else:
- output[index][1].append(padding)
- output.append(('ctr',ctr))
- output.append(('cvr',cvr))
- yield output
- return reader
- def train(self, file):
- return self._reader_creator(file)
- def test(self, file):
- return self._reader_creator(file)
\ No newline at end of file
@@ -33,6 +33,3 @@ class Dataset(object):
negativeList.append(negatives)
line = f.readline()
return negativeList
@@ -3,6 +3,7 @@
The brief directory structure and description of this example is as follows:
```
+ ├── Data/             # raw dataset directory
├── README.md         # documentation
├── requirements.txt  # required packages
├── gmf.py            # GMF network file
......
@@ -14,7 +14,6 @@ import paddle
import args
import utils
import time
- #from numba import jit, autojit
# Global variables that are shared across processes
_model = None
......
@@ -28,23 +28,4 @@ class GMF(object):
return avg_cost, prediction
\ No newline at end of file
@@ -23,26 +23,10 @@ if __name__ == "__main__":
topK = 10
begin = time.time()
- model_path = args.model_dir + "/epoch_" + str(args.epochs - 1)
+ model_path = args.model_dir + "/epoch_" + str(12)
(hits, ndcgs) = evaluate_model(args, testRatings, testNegatives, topK, model_path)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
end = time.time()
logger.info("epoch: {}, epoch_time: {:.5f}s, HR: {:.5f}, NDCG: {:.5f}".format(args.epochs, end - begin, hr, ndcg))
\ No newline at end of file
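`evaluate_model` returns per-user hit and NDCG lists whose means are logged above as HR and NDCG. A hedged sketch of how HR@K and NDCG@K are usually computed in this leave-one-out NCF-style evaluation; the function names and the example ranking are illustrative, not taken from the repo:

```python
import math

def hit_ratio(ranked_items, gt_item):
    # HR@K: 1 if the held-out positive item appears in the top-K list.
    return 1.0 if gt_item in ranked_items else 0.0

def ndcg(ranked_items, gt_item):
    # NDCG@K with a single relevant item: 1 / log2(position + 2) at its rank.
    for pos, item in enumerate(ranked_items):
        if item == gt_item:
            return math.log(2) / math.log(pos + 2)
    return 0.0

top_k = [42, 7, 19, 3, 88, 5, 61, 23, 14, 90]   # invented top-10 ranking for one user
print(hit_ratio(top_k, 19), ndcg(top_k, 19))    # 1.0 0.5
```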
@@ -30,7 +30,6 @@ class MLP(object):
name='layer_' + str(i))
# Final prediction layer
prediction = fluid.layers.fc(input=vector,
size=1,
act='sigmoid',
@@ -42,23 +41,4 @@ class MLP(object):
return avg_cost, prediction
\ No newline at end of file
@@ -31,14 +31,12 @@ class NeuMF(object):
mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)
- #fluid.layers.Print(mf_vector, message="mf_vector")
# MLP part
# The 0-th layer is the concatenation of embedding layers
mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)
- #fluid.layers.Print(mlp_vector, message="mlp_vector")
for i in range(1, num_layer):
mlp_vector = fluid.layers.fc(input=mlp_vector,
@@ -63,23 +61,4 @@ class NeuMF(object):
return avg_cost, prediction
\ No newline at end of file
@@ -22,8 +22,8 @@ def train(args, train_data_path):
dataset = Dataset(args.path + args.dataset)
testRatings, testNegatives = dataset.testRatings, dataset.testNegatives
- train_data_generator = utils.CriteoDataset()
- train_reader = paddle.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
+ train_data_generator = utils.Dataset()
+ train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
inputs = utils.input_data(True)
if args.GMF:
@@ -42,14 +42,17 @@ def train(args, train_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
- feeder = fluid.DataFeeder(feed_list=inputs, place=place)
+ loader = fluid.io.DataLoader.from_generator(
+ feed_list=inputs, capacity=args.batch_size, iterable=True)
+ loader.set_sample_list_generator(train_reader, places=place)
for epoch in range(args.epochs):
- for batch_id, data in enumerate(train_reader()):
+ for batch_id, data in enumerate(loader()):
begin = time.time()
loss_val = exe.run(program=fluid.default_main_program(),
- feed=feeder.feed(data),
+ feed=data,
fetch_list=[loss.name],
return_numpy=True)
end = time.time()
@@ -60,8 +63,6 @@ def train(args, train_data_path):
fetch_vars = [pred]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
if __name__ == "__main__":
args = args.parse_args()
train(args, args.train_data_path)
CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 \
- --NeuMF 1 \
+ --MLP 1 \
--epochs 20 \
--batch_size 256 \
--num_factors 8 \
--num_neg 4 \
--lr 0.001 \
- --model_dir 'model_dir'
+ --model_dir 'mlp_model_dir'
\ No newline at end of file
@@ -2,7 +2,7 @@ import numpy as np
import os
import paddle.fluid as fluid
- class CriteoDataset(object):
+ class Dataset(object):
def _reader_creator(self, file, is_train):
def reader():
......