Commit 9677fd6c authored by zhangwenhui03

add gru4rec bpr loss & fix tagspace doc 2

Parent e97ca3c1
README.md
@@ -5,9 +5,12 @@
```text
.
├── README.md # documentation
├── train.py # training script, cross-entropy loss
├── train_bpr.py # training script, bpr loss
├── infer.py # inference script, cross-entropy loss
├── infer_bpr.py # inference script, bpr loss
├── net.py # network structure, cross-entropy loss
├── net_bpr.py # network structure, bpr loss
├── text2paddle.py # convert raw text data to paddle format
├── cluster_train.py # distributed training
├── cluster_train.sh # distributed training script
@@ -110,7 +113,7 @@ python text2paddle.py raw_train_data/ raw_test_data/ train_data test_data vocab.
## Training
'--use_cuda 1' means training on GPU; by default the CPU is used. '--parallel 1' means using multiple cards; by default a single card is used.
For the detailed parameter configuration, run
```
python train.py -h
```
@@ -118,7 +121,7 @@ python train.py -h
GPU environment
Run the following command to start training the model.
```
CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data/ --use_cuda 1
```
CPU environment
Run the following command to start training the model.
@@ -126,6 +129,9 @@ CPU environment
python train.py --train_dir train_data/
```
Training with Bayesian Personalized Ranking loss (BPR loss) uses the same command format as the cross-entropy scripts; see the usage sketch below.
Note that when running a single-machine multi-card job (--parallel 1) in a CPU environment, batch_size should be larger than the number of CPU cores.
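
As a usage sketch, the BPR variants can be launched the same way (assuming the defaults defined in train_bpr.py and infer_bpr.py, shown later in this commit):
```
python train_bpr.py --train_dir train_data/
python infer_bpr.py --test_dir test_data/ --model_dir model_bpr_recall20
```
For reference, BPR loss scores the observed next item against sampled negatives; a commonly used form (our notation, not taken verbatim from `fluid.layers.bpr_loss`) is

$$L_{bpr} = -\frac{1}{N}\sum_{k=1}^{N}\ln\sigma\left(\hat{y}_k^{+} - \hat{y}_k^{-}\right)$$

where $\hat{y}_k^{+}$ is the predicted score of the positive item at step $k$, $\hat{y}_k^{-}$ the score of a sampled negative, and $\sigma$ the sigmoid.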
## Custom network structure
infer_bpr.py
# infer_bpr.py: inference script for the BPR-loss model.
import argparse
import sys
import time
import math
import unittest
import contextlib
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import net_bpr as net
import utils


def parse_args():
    parser = argparse.ArgumentParser("gru4rec benchmark.")
    parser.add_argument(
        '--test_dir', type=str, default='test_data', help='test file address')
    parser.add_argument(
        '--start_index', type=int, default=1, help='start index')
    parser.add_argument('--last_index', type=int, default=3, help='end index')
    parser.add_argument(
        '--model_dir', type=str, default='model_bpr_recall20', help='model dir')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether use cuda')
    parser.add_argument(
        '--batch_size', type=int, default=5, help='batch_size')
    parser.add_argument(
        '--hid_size', type=int, default=100, help='hidden size')
    parser.add_argument(
        '--vocab_path',
        type=str,
        default='vocab.txt',
        help='vocab file address')
    args = parser.parse_args()
    return args


def infer(args, vocab_size, test_reader, use_cuda):
    """ inference function """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    hid_size = args.hid_size
    batch_size = args.batch_size
    with fluid.scope_guard(fluid.core.Scope()):
        main_program = fluid.Program()
        with fluid.program_guard(main_program):
            acc = net.infer_network(vocab_size, batch_size, hid_size)
            for epoch in range(start_index, last_index + 1):
                copy_program = main_program.clone()
                model_path = model_dir + "/epoch_" + str(epoch)
                fluid.io.load_params(
                    executor=exe, dirname=model_path, main_program=copy_program)
                accum_num_recall = 0.0
                accum_num_sum = 0.0
                t0 = time.time()
                step_id = 0
                for data in test_reader():
                    step_id += 1
                    label_data = [dat[1] for dat in data]
                    # Build LoD tensors for the source sequences and the
                    # positive (ground-truth) next items.
                    ls, lp = utils.to_lodtensor_bpr_test(data, vocab_size,
                                                         place)
                    para = exe.run(
                        copy_program,
                        feed={
                            "src": ls,
                            # Score every word in the vocabulary as a candidate.
                            "all_label":
                            np.arange(vocab_size).reshape(vocab_size, 1),
                            "pos_label": lp
                        },
                        fetch_list=[acc.name],
                        return_numpy=False)
                    acc_ = np.array(para[0])[0]
                    data_length = len(
                        np.concatenate(
                            label_data, axis=0).astype("int64"))
                    accum_num_sum += (data_length)
                    accum_num_recall += (data_length * acc_)
                    # Print the running recall at every step.
                    if step_id % 1 == 0:
                        print("step:%d " % (step_id),
                              accum_num_recall / accum_num_sum)
                t1 = time.time()
                print("model:%s recall@20:%.4f time_cost(s):%.2f" %
                      (model_path, accum_num_recall / accum_num_sum, t1 - t0))


if __name__ == "__main__":
    args = parse_args()
    start_index = args.start_index
    last_index = args.last_index
    test_dir = args.test_dir
    model_dir = args.model_dir
    batch_size = args.batch_size
    vocab_path = args.vocab_path
    use_cuda = True if args.use_cuda else False
    print("start index: ", start_index, " last_index:", last_index)
    vocab_size, test_reader = utils.prepare_data(
        test_dir,
        vocab_path,
        batch_size=batch_size,
        buffer_size=1000,
        word_freq_threshold=0,
        is_train=False)
    infer(args, vocab_size, test_reader=test_reader, use_cuda=use_cuda)
# net_bpr.py: GRU4Rec network structure with BPR loss.
import paddle.fluid as fluid


def train_network(vocab_size, neg_size, hid_size, drop_out=0.2):
    """ network definition """
    emb_lr_x = 1.0
    gru_lr_x = 1.0
    fc_lr_x = 1.0
    # Input data
    src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
    pos_label = fluid.layers.data(
        name="pos_label", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(
        name="label", shape=[neg_size + 1], dtype="int64", lod_level=1)

    emb_src = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.XavierInitializer(),
            learning_rate=emb_lr_x))
    emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out)

    fc0 = fluid.layers.fc(input=emb_src_drop,
                          size=hid_size * 3,
                          param_attr=fluid.ParamAttr(
                              name="gru_fc",
                              initializer=fluid.initializer.XavierInitializer(),
                              learning_rate=gru_lr_x),
                          bias_attr=False)
    gru_h0 = fluid.layers.dynamic_gru(
        input=fc0,
        size=hid_size,
        param_attr=fluid.ParamAttr(
            name="dy_gru.param",
            initializer=fluid.initializer.XavierInitializer(),
            learning_rate=gru_lr_x),
        bias_attr="dy_gru.bias")
    gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out)

    # Embed the (neg_size + 1) candidate labels per step, sharing the "emb"
    # parameter with the source embedding.
    label_re = fluid.layers.sequence_reshape(input=label, new_dim=1)
    emb_label = fluid.layers.embedding(
        input=label_re,
        size=[vocab_size, hid_size],
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.XavierInitializer(),
            learning_rate=emb_lr_x))
    emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out)

    # Replicate each hidden state (neg_size + 1) times so it can be dotted
    # with every candidate embedding, then reduce to one score per candidate.
    gru_exp = fluid.layers.expand(
        x=gru_h0_drop, expand_times=[1, (neg_size + 1)])
    gru = fluid.layers.sequence_reshape(input=gru_exp, new_dim=hid_size)
    ele_mul = fluid.layers.elementwise_mul(emb_label_drop, gru)
    red_sum = fluid.layers.reduce_sum(input=ele_mul, dim=1, keep_dim=True)
    pre = fluid.layers.sequence_reshape(input=red_sum, new_dim=(neg_size + 1))

    cost = fluid.layers.bpr_loss(input=pre, label=pos_label)
    cost_sum = fluid.layers.reduce_sum(input=cost)
    return src, pos_label, label, cost_sum


def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
    src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
    emb_src = fluid.layers.embedding(
        input=src, size=[vocab_size, hid_size], param_attr="emb")
    emb_src_drop = fluid.layers.dropout(
        emb_src, dropout_prob=dropout, is_test=True)

    fc0 = fluid.layers.fc(input=emb_src_drop,
                          size=hid_size * 3,
                          param_attr="gru_fc",
                          bias_attr=False)
    gru_h0 = fluid.layers.dynamic_gru(
        input=fc0,
        size=hid_size,
        param_attr="dy_gru.param",
        bias_attr="dy_gru.bias")
    gru_h0_drop = fluid.layers.dropout(
        gru_h0, dropout_prob=dropout, is_test=True)

    # At inference time, score every vocabulary item and measure recall@20.
    all_label = fluid.layers.data(
        name="all_label",
        shape=[vocab_size, 1],
        dtype="int64",
        append_batch_size=False)
    emb_all_label = fluid.layers.embedding(
        input=all_label, size=[vocab_size, hid_size], param_attr="emb")
    emb_all_label_drop = fluid.layers.dropout(
        emb_all_label, dropout_prob=dropout, is_test=True)
    all_pre = fluid.layers.matmul(
        gru_h0_drop, emb_all_label_drop, transpose_y=True)
    pos_label = fluid.layers.data(
        name="pos_label", shape=[1], dtype="int64", lod_level=1)
    acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20)
    return acc
# train_bpr.py: training script for the BPR-loss model.
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import utils
import net_bpr as net

SEED = 102


def parse_args():
    parser = argparse.ArgumentParser("gru4rec benchmark.")
    parser.add_argument(
        '--train_dir',
        type=str,
        default='train_data',
        help='train file address')
    parser.add_argument(
        '--vocab_path',
        type=str,
        default='vocab.txt',
        help='vocab file address')
    parser.add_argument('--is_local', type=int, default=1, help='whether local')
    parser.add_argument('--hid_size', type=int, default=100, help='hid size')
    parser.add_argument('--neg_size', type=int, default=10, help='neg size')
    parser.add_argument(
        '--model_dir', type=str, default='model_bpr_recall20', help='model dir')
    parser.add_argument(
        '--batch_size', type=int, default=5, help='num of batch size')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='num of print batch')
    parser.add_argument('--pass_num', type=int, default=10, help='num of epoch')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether use gpu')
    parser.add_argument(
        '--parallel', type=int, default=0, help='whether parallel')
    parser.add_argument(
        '--base_lr', type=float, default=0.01, help='learning rate')
    parser.add_argument(
        '--num_devices', type=int, default=1, help='Number of GPU devices')
    args = parser.parse_args()
    return args


def get_cards(args):
    return args.num_devices


def train():
    """ do training """
    args = parse_args()
    hid_size = args.hid_size
    train_dir = args.train_dir
    vocab_path = args.vocab_path
    use_cuda = True if args.use_cuda else False
    parallel = True if args.parallel else False
    print("use_cuda:", use_cuda, "parallel:", parallel)
    batch_size = args.batch_size
    vocab_size, train_reader = utils.prepare_data(
        train_dir, vocab_path, batch_size=batch_size * get_cards(args),
        buffer_size=1000, word_freq_threshold=0, is_train=True)

    # Train program
    src, pos_label, label, avg_cost = net.train_network(
        neg_size=args.neg_size, vocab_size=vocab_size, hid_size=hid_size)

    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    pass_num = args.pass_num
    model_dir = args.model_dir
    fetch_list = [avg_cost.name]
    total_time = 0.0
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            # Build LoD tensors: source sequence, positive labels, and
            # (neg_size + 1)-way candidate labels with sampled negatives.
            ls, lp, ll = utils.to_lodtensor_bpr(data, args.neg_size, vocab_size,
                                                place)
            ret_avg_cost = train_exe.run(
                feed={"src": ls,
                      "label": ll,
                      "pos_label": lp},
                fetch_list=fetch_list)
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % args.print_batch == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        fluid.io.save_params(executor=exe, dirname=save_dir)
        print("model saved in %s" % save_dir)
    print("finish training")


if __name__ == "__main__":
    train()
utils.py
@@ -7,6 +7,7 @@ import paddle.fluid as fluid
import paddle
import os


def to_lodtensor(data, place):
    """ convert to LODtensor """
    seq_lens = [len(seq) for seq in data]
@@ -22,11 +23,74 @@ def to_lodtensor(data, place):
    res.set_lod([lod])
    return res
def to_lodtensor_bpr(raw_data, neg_size, vocab_size, place):
    """ convert to LoDTensor, with sampled negative labels for BPR loss """
    data = [dat[0] for dat in raw_data]
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])

    data = [dat[1] for dat in raw_data]
    pos_data = np.concatenate(data, axis=0).astype("int64")
    length = np.size(pos_data)
    # Sample negatives by tiling and shuffling the positive labels.
    neg_data = np.tile(pos_data, neg_size)
    np.random.shuffle(neg_data)
    for ii in range(length * neg_size):
        # Use integer division: `ii / neg_size` is a float in Python 3
        # and cannot be used as an index.
        if neg_data[ii] == pos_data[ii // neg_size]:
            neg_data[ii] = pos_data[length - 1 - ii // neg_size]
    # Column 0 holds the positive label, columns 1..neg_size the negatives.
    label_data = np.column_stack(
        (pos_data.reshape(length, 1), neg_data.reshape(length, neg_size)))
    res_label = fluid.LoDTensor()
    res_label.set(label_data, place)
    res_label.set_lod([lod])
    # The positive item always sits in column 0, so pos_label is all zeros.
    res_pos = fluid.LoDTensor()
    res_pos.set(np.zeros([len(flattened_data), 1]).astype("int64"), place)
    res_pos.set_lod([lod])
    return res, res_pos, res_label
def to_lodtensor_bpr_test(raw_data, vocab_size, place):
    """ convert to LODtensor """
    data = [dat[0] for dat in raw_data]
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])

    data = [dat[1] for dat in raw_data]
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res_pos = fluid.LoDTensor()
    res_pos.set(flattened_data, place)
    res_pos.set_lod([lod])
    return res, res_pos
def get_vocab_size(vocab_path):
    with open(vocab_path, "r") as rf:
        line = rf.readline()
        return int(line.strip())


def prepare_data(file_dir,
                 vocab_path,
                 batch_size,
@@ -45,12 +109,10 @@ def prepare_data(file_dir,
            batch_size,
            batch_size * 20)
    else:
        vocab_size = get_vocab_size(vocab_path)
        reader = paddle.batch(
            test(
                file_dir, buffer_size, data_type=DataType.SEQ), batch_size)
    return vocab_size, reader
@@ -103,6 +165,7 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
class DataType(object):
    SEQ = 2


def reader_creator(file_dir, n, data_type):
    def reader():
        files = os.listdir(file_dir)
@@ -118,10 +181,13 @@ def reader_creator(file_dir, n, data_type):
                        yield src_seq, trg_seq
                    else:
                        assert False, 'error data type'

    return reader


def train(train_dir, n, data_type=DataType.SEQ):
    return reader_creator(train_dir, n, data_type)


def test(test_dir, n, data_type=DataType.SEQ):
    return reader_creator(test_dir, n, data_type)
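
The negative sampling in to_lodtensor_bpr above packs, for every time step, the positive label into column 0 and neg_size shuffled negatives into the remaining columns. A minimal numpy-only sketch of that layout (the toy values here are invented for illustration):

```
import numpy as np

# Toy positive labels for a flattened batch of 4 time steps.
pos_data = np.array([3, 1, 4, 2], dtype="int64")
neg_size = 2
length = pos_data.size

# Sample negatives by tiling and shuffling the positives,
# mirroring to_lodtensor_bpr.
neg_data = np.tile(pos_data, neg_size)
np.random.shuffle(neg_data)
for ii in range(length * neg_size):
    # Best-effort: avoid using a step's own positive label as its negative.
    if neg_data[ii] == pos_data[ii // neg_size]:
        neg_data[ii] = pos_data[length - 1 - ii // neg_size]

# Column 0 holds the positive, columns 1..neg_size the negatives, so the
# network can score all (neg_size + 1) candidates per step and pos_label
# can simply be all zeros.
label_data = np.column_stack(
    (pos_data.reshape(length, 1), neg_data.reshape(length, neg_size)))
print(label_data.shape)  # (4, 3)
```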
infer.py (TagSpace)
@@ -10,12 +10,16 @@ import paddle.fluid as fluid
import paddle
import utils


def parse_args():
    parser = argparse.ArgumentParser("tagspace benchmark.")
    parser.add_argument(
        '--test_dir', type=str, default='test_data', help='test file address')
    parser.add_argument(
        '--vocab_tag_path',
        type=str,
        default='vocab_tag.txt',
        help='vocab path')
    parser.add_argument(
        '--start_index', type=int, default=1, help='start index')
    parser.add_argument(
@@ -29,6 +33,7 @@ def parse_args():
    args = parser.parse_args()
    return args
def infer(test_reader, vocab_tag, use_cuda, model_path):
    """ inference function """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
@@ -39,7 +44,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
            model_path, exe)
        t0 = time.time()
        step_id = 0
        true_num = 0
        all_num = 0
        size = vocab_tag
        value = []
@@ -48,13 +53,11 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
            lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
            lod_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_pos_tag = utils.to_lodtensor([dat[2] for dat in data], place)
            para = exe.run(infer_program,
                           feed={"text": lod_text_seq,
                                 "pos_tag": lod_tag},
                           fetch_list=fetch_vars,
                           return_numpy=False)
            value.append(para[0]._get_float_element(0))
            if step_id % size == 0 and step_id > 1:
                all_num += 1
@@ -66,6 +69,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
                print(step_id, 1.0 * true_num / all_num)
    t1 = time.time()
if __name__ == "__main__":
    args = parse_args()
    start_index = args.start_index
@@ -75,11 +79,20 @@ if __name__ == "__main__":
    batch_size = args.batch_size
    vocab_tag_path = args.vocab_tag_path
    use_cuda = True if args.use_cuda else False
    print("start index: ", start_index, " last_index:", last_index)
    vocab_text, vocab_tag, test_reader = utils.prepare_data(
        test_dir,
        "",
        vocab_tag_path,
        batch_size=1,
        neg_size=0,
        buffer_size=1000,
        is_train=False)
    for epoch in range(start_index, last_index + 1):
        epoch_path = model_dir + "/epoch_" + str(epoch)
        infer(
            test_reader=test_reader,
            vocab_tag=vocab_tag,
            use_cuda=False,
            model_path=epoch_path)
train.py (TagSpace)
@@ -13,24 +13,32 @@ import net
SEED = 102


def parse_args():
    parser = argparse.ArgumentParser("TagSpace benchmark.")
    parser.add_argument(
        '--neg_size', type=int, default=3, help='neg/pos ratio')
    parser.add_argument(
        '--train_dir',
        type=str,
        default='train_data',
        help='train file address')
    parser.add_argument(
        '--vocab_text_path',
        type=str,
        default='vocab_text.txt',
        help='vocab_text file address')
    parser.add_argument(
        '--vocab_tag_path',
        type=str,
        default='vocab_tag.txt',
        help='vocab_tag file address')
    parser.add_argument(
        '--model_dir', type=str, default='model_', help='model dir')
    parser.add_argument(
        '--batch_size', type=int, default=5, help='num of batch size')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='num of print batch')
    parser.add_argument('--pass_num', type=int, default=10, help='num of epoch')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether use gpu')
    parser.add_argument(
@@ -42,9 +50,11 @@ def parse_args():
    args = parser.parse_args()
    return args
def get_cards(args):
    return args.num_devices
def train():
    """ do training """
    args = parse_args()
@@ -56,15 +66,19 @@ def train():
    batch_size = args.batch_size
    neg_size = args.neg_size
    print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} "
          .format(use_cuda, parallel, batch_size, neg_size))
    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
        file_dir=train_dir,
        vocab_text_path=vocab_text_path,
        vocab_tag_path=vocab_tag_path,
        neg_size=neg_size,
        batch_size=batch_size * get_cards(args),
        buffer_size=batch_size * 100,
        is_train=True)
    """ train network """
    # Train program
    avg_cost, correct, cos_pos = net.network(
        vocab_text_size, vocab_tag_size, neg_size=neg_size)

    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
@@ -76,11 +90,10 @@ def train():
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    pass_num = args.pass_num
    model_dir = args.model_dir
    fetch_list = [avg_cost.name]
@@ -94,15 +107,18 @@ def train():
            lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
            loss_val, correct_val = train_exe.run(
                feed={
                    "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag
                },
                fetch_list=[avg_cost.name, correct.name])
            if batch_id % args.print_batch == 0:
                print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
                      .format(pass_idx, (batch_id + 10) * batch_size,
                              np.mean(loss_val),
                              float(np.sum(correct_val)) / (args.num_devices *
                                                            batch_size)))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
@@ -110,8 +126,10 @@ def train():
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["text", "pos_tag"]
        fetch_vars = [cos_pos]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
                                      train_exe)
    print("finish training")


if __name__ == "__main__":
    train()