未验证 提交 bf3fb7a5 编写于 作者: O overlordmax 提交者: GitHub

Pr 05131624 (#4616)

* fix bugs

* fix bugs

* add wide_deep

* fix code style

* fix code style

* fix some bugs

* fix filename

* add ncf

* add download data

* add download data

* add youtube dnn

* edit README.md

* fix some bugs
上级 bc249915
......@@ -27,7 +27,7 @@ def parse_args():
parser.add_argument("--epochs", type=int, default=40, help="epochs")
parser.add_argument("--batch_size", type=int, default=40, help="batch_size")
parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu')
parser.add_argument('--test_epoch', type=str, default='1',help='test_epoch')
parser.add_argument('--test_epoch', type=str, default='39',help='test_epoch')
parser.add_argument('--train_path', type=str, default='data/adult.data', help='train_path')
parser.add_argument('--test_path', type=str, default='data/adult.test', help='test_path')
parser.add_argument('--train_data_path', type=str, default='train_data/train_data.csv', help='train_data_path')
......
......@@ -27,10 +27,8 @@ def set_zero(var_name,scope=fluid.global_scope(), place=fluid.CPUPlace(),param_t
def run_infer(args,test_data_path):
wide_deep_model = wide_deep()
test_data_generator = utils.CriteoDataset()
test_reader = paddle.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
test_data_generator = utils.Dataset()
test_reader = fluid.io.batch(test_data_generator.test(test_data_path), batch_size=args.batch_size)
inference_scope = fluid.Scope()
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
......@@ -43,20 +41,20 @@ def run_infer(args,test_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
exe = fluid.Executor(place)
exe.run(startup_program)
fluid.load(fluid.default_main_program(), cur_model_path,exe)
feeder = fluid.DataFeeder(feed_list=inputs, place=place)
loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
loader.set_sample_list_generator(test_reader, places=place)
for var in auc_states: # reset auc states
set_zero(var.name, scope=inference_scope, place=place)
mean_acc = []
mean_auc = []
for batch_id, data in enumerate(test_reader()):
for batch_id, data in enumerate(loader()):
begin = time.time()
acc_val,auc_val = exe.run(program=test_program,
feed=feeder.feed(data),
feed=data,
fetch_list=[acc.name, auc.name],
return_numpy=True
)
......
......@@ -5,6 +5,7 @@ import paddle.fluid as fluid
class wide_deep(object):
def wide_part(self, data):
out = fluid.layers.fc(input=data,
size=1,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])),
......@@ -14,6 +15,7 @@ class wide_deep(object):
return out
def fc(self, data, hidden_units, active, tag):
output = fluid.layers.fc(input=data,
size=hidden_units,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))),
......@@ -23,6 +25,7 @@ class wide_deep(object):
return output
def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units):
l1 = self.fc(data, hidden1_units, 'relu', 'l1')
l2 = self.fc(l1, hidden2_units, 'relu', 'l2')
l3 = self.fc(l2, hidden3_units, 'relu', 'l3')
......@@ -38,6 +41,7 @@ class wide_deep(object):
return inputs
def model(self, inputs, hidden1_units, hidden2_units, hidden3_units):
wide_output = self.wide_part(inputs[0])
deep_output = self.deep_part(inputs[1], hidden1_units, hidden2_units, hidden3_units)
......
......@@ -15,8 +15,8 @@ logger.setLevel(logging.INFO)
def train(args, train_data_path):
wide_deep_model = wide_deep()
inputs = wide_deep_model.input_data()
train_data_generator = utils.CriteoDataset()
train_reader = paddle.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
train_data_generator = utils.Dataset()
train_reader = fluid.io.batch(train_data_generator.train(train_data_path), batch_size=args.batch_size)
loss, acc, auc, batch_auc, auc_states = wide_deep_model.model(inputs, args.hidden1_units, args.hidden2_units, args.hidden3_units)
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.01)
......@@ -25,13 +25,16 @@ def train(args, train_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feeder = fluid.DataFeeder(feed_list=inputs, place=place)
loader = fluid.io.DataLoader.from_generator(
feed_list=inputs, capacity=args.batch_size, iterable=True)
loader.set_sample_list_generator(train_reader, places=place)
for epoch in range(args.epochs):
for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(loader()):
begin = time.time()
loss_val, acc_val, auc_val = exe.run(program=fluid.default_main_program(),
feed=feeder.feed(data),
feed=data,
fetch_list=[loss.name, acc.name, auc.name],
return_numpy=True)
end = time.time()
......
......@@ -2,7 +2,7 @@ import numpy as np
import os
import paddle.fluid as fluid
class CriteoDataset(object):
class Dataset(object):
def _reader_creator(self, file):
def reader():
......
......@@ -108,7 +108,7 @@ python infer.py --use_gpu True\ #是否使用gpu
CPU环境
在cpu_train.sh脚本文件中设置好数据路径、参数。
在cpu_infer.sh脚本文件中设置好数据路径、参数。
```shell
python infer.py --use_gpu False\ #是否使用gpu
......@@ -118,6 +118,14 @@ python infer.py --use_gpu False\ #是否使用gpu
--vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径
```
修改脚本的可执行权限并运行
```
./cpu_infer.sh
```
## 模型效果
目前只抽取部分数据验证模型正确性。模型预测结果实例如下:
......
......@@ -15,8 +15,6 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
def reader():
features = line.strip().split(',')
#ctr = list(map(int, features[1]))
#cvr = list(map(int, features[2]))
ctr = features[1]
cvr = features[2]
......@@ -28,8 +26,7 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator):
if field_id not in all_field_id_dict:
continue
all_field_id_dict[field_id][0] = True
index = all_field_id_dict[field_id][1]
#feat_id = list(map(int, feat_id))
index = all_field_id_dict[field_id][1]
output[index][1].append(feat_id)
for field_id in all_field_id_dict:
......
......@@ -25,11 +25,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
place = fluid.CPUPlace()
esmm_model = ESMM()
test_data_generator = utils.CriteoDataset()
test_reader = paddle.batch(test_data_generator.test(test_data_path),batch_size=args.batch_size)
startup_program = fluid.framework.Program()
test_program = fluid.framework.Program()
......@@ -41,7 +36,6 @@ def run_infer(args,model_path,test_data_path,vocab_size):
dataset, file_list = utils.get_dataset(inputs, test_data_path,args.batch_size,args.cpu_num)
exe = fluid.Executor(place)
#加载模型
fluid.load(fluid.default_main_program(),os.path.join(model_path, "checkpoint"), exe)
set_zero(place)
......@@ -70,14 +64,4 @@ if __name__ == "__main__":
logger.info("Test model {}".format(model))
run_infer(args, model,args.test_data_path)
\ No newline at end of file
\ No newline at end of file
......@@ -5,7 +5,6 @@ import paddle
import utils
import args
class ESMM(object):
def fc(self,tag, data, out_dim, active='prelu'):
......@@ -47,14 +46,12 @@ class ESMM(object):
initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size)
),
is_sparse=True)
#fluid.layers.Print(feat_emb, message="feat_emb")
field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum')
emb.append(field_emb)
concat_emb = fluid.layers.concat(emb, axis=1)
# ctr
active = 'relu'
ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active)
ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active)
......@@ -67,15 +64,10 @@ class ESMM(object):
ctr_clk = inputs[-2]
ctcvr_buy = inputs[-1]
#ctr_label = fluid.layers.concat(input=[ctr_clk,1-ctr_clk],axis=1)
#ctcvr_label = fluid.layers.concat(input=[ctcvr_buy,1-ctcvr_buy],axis=1)
#ctr_label = fluid.layers.cast(x=ctr_label, dtype='float32')
#ctcvr_label = fluid.layers.cast(x=ctcvr_label, dtype='float32')
ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2])
cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2])
ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
......@@ -83,26 +75,9 @@ class ESMM(object):
loss_ctcvr = paddle.fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
cost = loss_ctr + loss_ctcvr
avg_cost = fluid.layers.mean(cost)
#fluid.layers.Print(ctr_clk, message="ctr_clk")
auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
return avg_cost,auc_ctr,auc_ctcvr
\ No newline at end of file
\ No newline at end of file
......@@ -7,7 +7,7 @@ import os
def join_data(file1,file2,write_file,sample_size):
sample_list = []
common_logs = defaultdict(lambda: '')
file = open(write_file, 'w',encoding='utf-8')
file = open(write_file, 'w')
print("begin push sample_list!")
with open(file1,'r') as f:
......@@ -45,7 +45,7 @@ def join_data(file1,file2,write_file,sample_size):
def read_data(file_name,write_file):
file = open(write_file, 'w',encoding='utf-8')
file = open(write_file, 'w')
print("begin to write!")
with open(file_name,'r') as f:
for i, line in enumerate(f):
......@@ -65,7 +65,7 @@ def read_data(file_name,write_file):
#sample_id|y|z|common_feature_index|feat_num|feat_list
elif(feat_len == 6):
# y=0 & z=1过滤
# y=0 & z=1 filter
if(line[1] == '0' and line[2] == '1'):
continue
feat_strs = line[5]
......@@ -80,15 +80,13 @@ def read_data(file_name,write_file):
file.close()
##重新编码
def recode(file_path,writh_file,vocab_path):
all_feat_id_dict = defaultdict(int)
file1 = open(writh_file[0], 'w',encoding='utf-8')
file2 = open(writh_file[1], 'w',encoding='utf-8')
vocab_file = open(vocab_path, 'w',encoding='utf-8')
file1 = open(writh_file[0], 'w')
file2 = open(writh_file[1], 'w')
vocab_file = open(vocab_path, 'w')
id = 0
with open(file_path[0], "r", encoding='utf-8') as f:
with open(file_path[0], "r") as f:
for i, line in enumerate(f):
line = line.strip().split(',')
feat_lists = []
......@@ -100,7 +98,7 @@ def recode(file_path,writh_file,vocab_path):
feat_lists.append('%s:%s' % (field_id,all_feat_id_dict[feat_id]))
sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n"
file1.write(sample)
with open(file_path[1], "r", encoding='utf-8') as f:
with open(file_path[1], "r") as f:
for i, line in enumerate(f):
line = line.strip().split(',')
feat_lists = []
......@@ -131,7 +129,7 @@ if __name__ == "__main__":
write_file = args.train_data_path + '/train_data.csv'
join_data(skeleton_train_path,features_train_path,write_file,args.train_sample_size)
##删除产生的中间文件
os.system('rm -rf ' + skeleton_train_path)
os.system('rm -rf ' + features_train_path)
......@@ -146,7 +144,7 @@ if __name__ == "__main__":
write_file = args.test_data_path + '/test_data.csv'
join_data(skeleton_test_path,features_test_path,write_file,args.test_sample_size)
##删除产生的中间文件
os.system('rm -rf ' + skeleton_test_path)
os.system('rm -rf ' + features_test_path)
......@@ -154,13 +152,6 @@ if __name__ == "__main__":
file_path = [args.train_data_path + '/train_data.csv', args.test_data_path + '/test_data.csv']
write_file = [args.train_data_path + '/traindata.csv',args.test_data_path + '/testdata.csv']
recode(file_path,write_file,args.vocab_path)
##删除产生的中间文件
for file in file_path:
os.system('rm -rf ' + file_path)
......@@ -6,9 +6,6 @@ import paddle
import utils
import args
def train(args, vocab_size, train_data_path):
esmm_model = ESMM()
inputs = esmm_model.input_data()
......@@ -16,8 +13,6 @@ def train(args, vocab_size, train_data_path):
dataset, file_list = utils.get_dataset(inputs, train_data_path,args.batch_size,args.cpu_num)
avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
# 选择反向更新优化策略
optimizer = fluid.optimizer.Adam()
optimizer.minimize(avg_cost)
......@@ -42,12 +37,7 @@ def train(args, vocab_size, train_data_path):
main_program = fluid.default_main_program()
fluid.io.save(main_program,model_dir)
if __name__ == "__main__":
args = args.parse_args()
vocab_size =utils.get_vocab_size(args.vocab_path)
train(args, vocab_size, args.train_data_path)
......@@ -31,47 +31,3 @@ def get_vocab_size(vocab_path):
line = rf.readline()
return int(line.strip()) + 1
class CriteoDataset(object):
def _reader_creator(self, file):
def reader():
with open(file, 'r') as f:
for line in f:
features = line.strip().split(',')
ctr = features[1]
cvr = features[2]
padding = '0'
output = [(field_id,[]) for field_id in all_field_id_dict]
for elem in features[4:]:
field_id,feat_id = elem.strip().split(':')
if field_id not in all_field_id_dict:
continue
all_field_id_dict[field_id][0] = True
index = all_field_id_dict[field_id][1]
output[index][1].append(feat_id)
for field_id in all_field_id_dict:
visited,index = all_field_id_dict[field_id]
if visited:
all_field_id_dict[field_id][0] = False
else:
output[index][1].append(padding)
output.append(('ctr',ctr))
output.append(('cvr',cvr))
yield output
return reader
def train(self, file):
return self._reader_creator(file)
def test(self, file):
return self._reader_creator(file)
\ No newline at end of file
......@@ -32,7 +32,4 @@ class Dataset(object):
negatives.append(int(x))
negativeList.append(negatives)
line = f.readline()
return negativeList
return negativeList
\ No newline at end of file
......@@ -3,6 +3,7 @@
以下是本例的简要目录结构及说明:
```
├── Data/ #原始数据集目录
├── README.md # 文档
├── requirements.txt # 需要的安装包
├── gmf.py # gmf网络文件
......
......@@ -14,7 +14,6 @@ import paddle
import args
import utils
import time
#from numba import jit, autojit
# Global variables that are shared across processes
_model = None
......
......@@ -28,23 +28,4 @@ class GMF(object):
return avg_cost, prediction
\ No newline at end of file
\ No newline at end of file
......@@ -23,26 +23,10 @@ if __name__ == "__main__":
topK = 10
begin = time.time()
model_path = args.model_dir + "/epoch_" + str(args.epochs - 1)
model_path = args.model_dir + "/epoch_" + str(12)
(hits, ndcgs) = evaluate_model(args, testRatings, testNegatives, topK, model_path)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
end = time.time()
logger.info("epoch: {}, epoch_time: {:.5f}s, HR: {:.5f}, NDCG: {:.5f}".format(args.epochs, end - begin, hr, ndcg))
\ No newline at end of file
\ No newline at end of file
......@@ -30,7 +30,6 @@ class MLP(object):
name='layer_' + str(i))
# Final prediction layer
prediction = fluid.layers.fc(input=vector,
size=1,
act='sigmoid',
......@@ -42,23 +41,4 @@ class MLP(object):
return avg_cost, prediction
\ No newline at end of file
\ No newline at end of file
......@@ -31,14 +31,12 @@ class NeuMF(object):
mf_user_latent = fluid.layers.flatten(x=MF_Embedding_User, axis=1)
mf_item_latent = fluid.layers.flatten(x=MF_Embedding_Item, axis=1)
mf_vector = fluid.layers.elementwise_mul(mf_user_latent, mf_item_latent)
#fluid.layers.Print(mf_vector, message="mf_vector")
# MLP part
# The 0-th layer is the concatenation of embedding layers
mlp_user_latent = fluid.layers.flatten(x=MLP_Embedding_User, axis=1)
mlp_item_latent = fluid.layers.flatten(x=MLP_Embedding_Item, axis=1)
mlp_vector = fluid.layers.concat(input=[mlp_user_latent, mlp_item_latent], axis=-1)
#fluid.layers.Print(mlp_vector, message="mlp_vector")
for i in range(1, num_layer):
mlp_vector = fluid.layers.fc(input=mlp_vector,
......@@ -62,24 +60,5 @@ class NeuMF(object):
avg_cost = fluid.layers.mean(cost)
return avg_cost, prediction
\ No newline at end of file
......@@ -22,8 +22,8 @@ def train(args, train_data_path):
dataset = Dataset(args.path + args.dataset)
testRatings, testNegatives = dataset.testRatings, dataset.testNegatives
train_data_generator = utils.CriteoDataset()
train_reader = paddle.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
train_data_generator = utils.Dataset()
train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
inputs = utils.input_data(True)
if args.GMF:
......@@ -42,14 +42,17 @@ def train(args, train_data_path):
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feeder = fluid.DataFeeder(feed_list=inputs, place=place)
loader = fluid.io.DataLoader.from_generator(
feed_list=inputs, capacity=args.batch_size, iterable=True)
loader.set_sample_list_generator(train_reader, places=place)
for epoch in range(args.epochs):
for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(loader()):
begin = time.time()
loss_val = exe.run(program=fluid.default_main_program(),
feed=feeder.feed(data),
feed=data,
fetch_list=[loss.name],
return_numpy=True)
end = time.time()
......@@ -59,9 +62,7 @@ def train(args, train_data_path):
feed_var_names = ["user_input", "item_input"]
fetch_vars = [pred]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
if __name__ == "__main__":
args = args.parse_args()
train(args, args.train_data_path)
CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 \
--NeuMF 1 \
--MLP 1 \
--epochs 20 \
--batch_size 256 \
--num_factors 8 \
--num_neg 4 \
--lr 0.001 \
--model_dir 'model_dir'
\ No newline at end of file
--model_dir 'mlp_model_dir'
\ No newline at end of file
......@@ -2,7 +2,7 @@ import numpy as np
import os
import paddle.fluid as fluid
class CriteoDataset(object):
class Dataset(object):
def _reader_creator(self, file, is_train):
def reader():
......@@ -35,5 +35,5 @@ def input_data(is_train):
inputs = [user_input] + [item_input] + [label]
else:
inputs = [user_input] + [item_input]
return inputs
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册