未验证 提交 2de73266 编写于 作者: Z zhang wenhui 提交者: GitHub

Merge pull request #1639 from frankwhzhang/fix_bug

add gru4rec bpr loss and fix style
......@@ -5,8 +5,10 @@
```text
.
├── README.md # 文档
├── train.py # 训练脚本
├── infer.py # 预测脚本
├── train.py # 训练脚本 全词表 cross-entropy
├── train_sample_neg.py # 训练脚本 sample负例 包含bpr loss 和cross-entropy
├── infer.py # 预测脚本 全词表
├── infer_sample_neg.py # 预测脚本 sample负例
├── net.py # 网络结构
├── text2paddle.py # 文本数据转paddle数据
├── cluster_train.py # 多机训练
......@@ -30,6 +32,17 @@ GRU4REC模型的介绍可以参阅论文[Session-based Recommendations with Recu
session-based推荐应用场景非常广泛,比如用户的商品浏览、新闻点击、地点签到等序列数据。
支持三种形式的损失函数, 分别是全词表的cross-entropy, 负采样的Bayesian Pairwise Ranking和负采样的Cross-entropy.
我们基本复现了论文效果,recall@20的效果分别为
全词表 cross entropy : 0.67
负采样 bpr : 0.606
负采样 cross entropy : 0.605
运行样例程序可跳过'RSC15 数据下载及预处理'部分
## RSC15 数据下载及预处理
......@@ -108,25 +121,42 @@ python text2paddle.py raw_train_data/ raw_test_data/ train_data test_data vocab.
```
## 训练
'--use_cuda 1' 表示使用gpu, 缺省表示使用cpu '--parallel 1' 表示使用多卡,缺省表示使用单卡
具体的参数配置可运行
具体的参数配置可运行
```
python train.py -h
```
全词表cross entropy 训练代码
gpu 单机单卡训练
``` bash
CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data --use_cuda 1 --batch_size 50 --model_dir model_output
```
GPU 环境
运行命令开始训练模型。
cpu 单机训练
``` bash
python train.py --train_dir train_data --use_cuda 0 --batch_size 50 --model_dir model_output
```
CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data/ --use_cuda 1
gpu 单机多卡训练
``` bash
CUDA_VISIBLE_DEVICES=0,1 python train.py --train_dir train_data --use_cuda 1 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 2
```
CPU 环境
运行命令开始训练模型。
cpu 单机多卡训练
``` bash
CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
```
python train.py --train_dir train_data/
负采样 bayesian pairwise ranking loss(bpr loss) 训练
```
CUDA_VISIBLE_DEVICES=0 python train_sample_neg.py --loss bpr --use_cuda 1
```
请注意CPU环境下运行单机多卡任务(--parallel 1)时,batch_size应大于cpu核数。
负采样 cross entropy 训练
```
CUDA_VISIBLE_DEVICES=0 python train_sample_neg.py --loss ce --use_cuda 1
```
## 自定义网络结构
......
import argparse
import sys
import time
import math
import unittest
import contextlib
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import net
import utils
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--test_dir', type=str, default='test_data', help='test file address')
parser.add_argument(
'--start_index', type=int, default='1', help='start index')
parser.add_argument(
'--last_index', type=int, default='3', help='last index')
parser.add_argument(
'--model_dir', type=str, default='model_bpr_recall20', help='model dir')
parser.add_argument(
'--use_cuda', type=int, default='0', help='whether use cuda')
parser.add_argument(
'--batch_size', type=int, default='5', help='batch_size')
parser.add_argument(
'--hid_size', type=int, default='100', help='batch_size')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
args = parser.parse_args()
return args
def infer(args, vocab_size, test_reader, use_cuda):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
hid_size = args.hid_size
batch_size = args.batch_size
with fluid.scope_guard(fluid.core.Scope()):
main_program = fluid.Program()
with fluid.program_guard(main_program):
acc = net.infer_network(vocab_size, batch_size, hid_size)
for epoch in range(start_index, last_index + 1):
copy_program = main_program.clone()
model_path = model_dir + "/epoch_" + str(epoch)
fluid.io.load_params(
executor=exe, dirname=model_path, main_program=copy_program)
accum_num_recall = 0.0
accum_num_sum = 0.0
t0 = time.time()
step_id = 0
for data in test_reader():
step_id += 1
label_data = [dat[1] for dat in data]
ls, lp = utils.to_lodtensor_bpr_test(data, vocab_size,
place)
para = exe.run(
copy_program,
feed={
"src": ls,
"all_label":
np.arange(vocab_size).reshape(vocab_size, 1),
"pos_label": lp
},
fetch_list=[acc.name],
return_numpy=False)
acc_ = np.array(para[0])[0]
data_length = len(
np.concatenate(
label_data, axis=0).astype("int64"))
accum_num_sum += (data_length)
accum_num_recall += (data_length * acc_)
if step_id % 1 == 0:
print("step:%d " % (step_id),
accum_num_recall / accum_num_sum)
t1 = time.time()
print("model:%s recall@20:%.4f time_cost(s):%.2f" %
(model_path, accum_num_recall / accum_num_sum, t1 - t0))
if __name__ == "__main__":
args = parse_args()
start_index = args.start_index
last_index = args.last_index
test_dir = args.test_dir
model_dir = args.model_dir
batch_size = args.batch_size
vocab_path = args.vocab_path
use_cuda = True if args.use_cuda else False
print("start index: ", start_index, " last_index:", last_index)
vocab_size, test_reader = utils.prepare_data(
test_dir,
vocab_path,
batch_size=batch_size,
buffer_size=1000,
word_freq_threshold=0,
is_train=False)
infer(args, vocab_size, test_reader=test_reader, use_cuda=use_cuda)
import paddle.fluid as fluid
def network(vocab_size,
hid_size=100,
init_low_bound=-0.04,
init_high_bound=0.04):
def all_vocab_network(vocab_size,
hid_size=100,
init_low_bound=-0.04,
init_high_bound=0.04):
""" network definition """
emb_lr_x = 10.0
gru_lr_x = 1.0
......@@ -43,8 +44,173 @@ def network(vocab_size,
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(input=fc, label=dst_wordseq)
acc = fluid.layers.accuracy(input=fc, label=dst_wordseq, k=20)
avg_cost = fluid.layers.mean(x=cost)
return src_wordseq, dst_wordseq, avg_cost, acc
def train_bpr_network(vocab_size, neg_size, hid_size, drop_out=0.2):
""" network definition """
emb_lr_x = 1.0
gru_lr_x = 1.0
fc_lr_x = 1.0
# Input data
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
pos_label = fluid.layers.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(
name="label", shape=[neg_size + 1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding(
input=src,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
name="emb",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x))
emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out)
fc0 = fluid.layers.fc(input=emb_src_drop,
size=hid_size * 3,
param_attr=fluid.ParamAttr(
name="gru_fc",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=gru_lr_x),
bias_attr=False)
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr=fluid.ParamAttr(
name="dy_gru.param",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=gru_lr_x),
bias_attr="dy_gru.bias")
gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out)
label_re = fluid.layers.sequence_reshape(input=label, new_dim=1)
emb_label = fluid.layers.embedding(
input=label_re,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
name="emb",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x))
emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out)
gru_exp = fluid.layers.expand(
x=gru_h0_drop, expand_times=[1, (neg_size + 1)])
gru = fluid.layers.sequence_reshape(input=gru_exp, new_dim=hid_size)
ele_mul = fluid.layers.elementwise_mul(emb_label_drop, gru)
red_sum = fluid.layers.reduce_sum(input=ele_mul, dim=1, keep_dim=True)
pre = fluid.layers.sequence_reshape(input=red_sum, new_dim=(neg_size + 1))
cost = fluid.layers.bpr_loss(input=pre, label=pos_label)
cost_sum = fluid.layers.reduce_sum(input=cost)
return src, pos_label, label, cost_sum
def train_cross_entropy_network(vocab_size, neg_size, hid_size, drop_out=0.2):
""" network definition """
emb_lr_x = 1.0
gru_lr_x = 1.0
fc_lr_x = 1.0
# Input data
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
pos_label = fluid.layers.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(
name="label", shape=[neg_size + 1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding(
input=src,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
name="emb",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x))
emb_src_drop = fluid.layers.dropout(emb_src, dropout_prob=drop_out)
fc0 = fluid.layers.fc(input=emb_src_drop,
size=hid_size * 3,
param_attr=fluid.ParamAttr(
name="gru_fc",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=gru_lr_x),
bias_attr=False)
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr=fluid.ParamAttr(
name="dy_gru.param",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=gru_lr_x),
bias_attr="dy_gru.bias")
gru_h0_drop = fluid.layers.dropout(gru_h0, dropout_prob=drop_out)
label_re = fluid.layers.sequence_reshape(input=label, new_dim=1)
emb_label = fluid.layers.embedding(
input=label_re,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
name="emb",
initializer=fluid.initializer.XavierInitializer(),
learning_rate=emb_lr_x))
emb_label_drop = fluid.layers.dropout(emb_label, dropout_prob=drop_out)
gru_exp = fluid.layers.expand(
x=gru_h0_drop, expand_times=[1, (neg_size + 1)])
gru = fluid.layers.sequence_reshape(input=gru_exp, new_dim=hid_size)
ele_mul = fluid.layers.elementwise_mul(emb_label_drop, gru)
red_sum = fluid.layers.reduce_sum(input=ele_mul, dim=1, keep_dim=True)
pre = fluid.layers.sequence_reshape(input=red_sum, new_dim=(neg_size + 1))
cost = fluid.layers.cross_entropy(input=pre, label=pos_label)
cost_sum = fluid.layers.reduce_sum(input=cost)
return src, pos_label, label, cost_sum
def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
src = fluid.layers.data(name="src", shape=[1], dtype="int64", lod_level=1)
emb_src = fluid.layers.embedding(
input=src, size=[vocab_size, hid_size], param_attr="emb")
emb_src_drop = fluid.layers.dropout(
emb_src, dropout_prob=dropout, is_test=True)
fc0 = fluid.layers.fc(input=emb_src_drop,
size=hid_size * 3,
param_attr="gru_fc",
bias_attr=False)
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr="dy_gru.param",
bias_attr="dy_gru.bias")
gru_h0_drop = fluid.layers.dropout(
gru_h0, dropout_prob=dropout, is_test=True)
all_label = fluid.layers.data(
name="all_label",
shape=[vocab_size, 1],
dtype="int64",
append_batch_size=False)
emb_all_label = fluid.layers.embedding(
input=all_label, size=[vocab_size, hid_size], param_attr="emb")
emb_all_label_drop = fluid.layers.dropout(
emb_all_label, dropout_prob=dropout, is_test=True)
all_pre = fluid.layers.matmul(
gru_h0_drop, emb_all_label_drop, transpose_y=True)
pos_label = fluid.layers.data(
name="pos_label", shape=[1], dtype="int64", lod_level=1)
acc = fluid.layers.accuracy(input=all_pre, label=pos_label, k=20)
return acc
......@@ -17,13 +17,13 @@ SEED = 102
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
'--train_dir', type=str, default='train_data', help='train file')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file address')
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
parser.add_argument(
'--is_local', type=int, default=1, help='whether local')
'--is_local', type=int, default=1, help='whether is local')
parser.add_argument(
'--hid_size', type=int, default=100, help='hid size')
'--hid_size', type=int, default=100, help='hidden-dim size')
parser.add_argument(
'--model_dir', type=str, default='model_recall20', help='model dir')
parser.add_argument(
......@@ -31,7 +31,7 @@ def parse_args():
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
'--pass_num', type=int, default=10, help='number of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
......@@ -43,9 +43,11 @@ def parse_args():
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
......@@ -61,23 +63,23 @@ def train():
buffer_size=1000, word_freq_threshold=0, is_train=True)
# Train program
src_wordseq, dst_wordseq, avg_cost, acc = net.network(vocab_size=vocab_size, hid_size=hid_size)
src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
vocab_size=vocab_size, hid_size=hid_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=avg_cost.name)
use_cuda=use_cuda, loss_name=avg_cost.name)
else:
train_exe = exe
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
......@@ -96,10 +98,11 @@ def train():
place)
lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
place)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq},
fetch_list=fetch_list)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % args.print_batch == 0:
......@@ -114,7 +117,6 @@ def train():
fetch_vars = [avg_cost, acc]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
#exe.close()
print("finish training")
......
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import time
import utils
import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
parser.add_argument(
'--is_local', type=int, default=1, help='whether is local')
parser.add_argument(
'--hid_size', type=int, default=100, help='hidden-dim size')
parser.add_argument(
'--neg_size', type=int, default=10, help='neg item size')
parser.add_argument(
'--loss', type=str, default="bpr", help='loss: bpr/cross_entropy')
parser.add_argument(
'--model_dir', type=str, default='model_bpr_recall20', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='number of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--parallel', type=int, default=0, help='whether parallel')
parser.add_argument(
'--base_lr', type=float, default=0.01, help='learning rate')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
hid_size = args.hid_size
train_dir = args.train_dir
vocab_path = args.vocab_path
use_cuda = True if args.use_cuda else False
parallel = True if args.parallel else False
print("use_cuda:", use_cuda, "parallel:", parallel)
batch_size = args.batch_size
vocab_size, train_reader = utils.prepare_data(
train_dir, vocab_path, batch_size=batch_size * get_cards(args),\
buffer_size=1000, word_freq_threshold=0, is_train=True)
# Train program
if args.loss == 'bpr':
src, pos_label, label, avg_cost = net.train_bpr_network(
neg_size=args.neg_size, vocab_size=vocab_size, hid_size=hid_size)
else:
src, pos_label, label, avg_cost = net.train_cross_entropy_network(
neg_size=args.neg_size, vocab_size=vocab_size, hid_size=hid_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=avg_cost.name)
else:
train_exe = exe
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
total_time = 0.0
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
ls, lp, ll = utils.to_lodtensor_bpr(data, args.neg_size, vocab_size,
place)
ret_avg_cost = train_exe.run(
feed={"src": ls,
"label": ll,
"pos_label": lp},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % args.print_batch == 0:
print("step:%d ppl:%.3f" % (i, newest_ppl))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
fluid.io.save_params(executor=exe, dirname=save_dir)
print("model saved in %s" % save_dir)
print("finish training")
if __name__ == "__main__":
train()
......@@ -7,6 +7,7 @@ import paddle.fluid as fluid
import paddle
import os
def to_lodtensor(data, place):
""" convert to LODtensor """
seq_lens = [len(seq) for seq in data]
......@@ -22,11 +23,74 @@ def to_lodtensor(data, place):
res.set_lod([lod])
return res
def to_lodtensor_bpr(raw_data, neg_size, vocab_size, place):
""" convert to LODtensor """
data = [dat[0] for dat in raw_data]
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
data = [dat[1] for dat in raw_data]
pos_data = np.concatenate(data, axis=0).astype("int64")
length = np.size(pos_data)
neg_data = np.tile(pos_data, neg_size)
np.random.shuffle(neg_data)
for ii in range(length * neg_size):
if neg_data[ii] == pos_data[ii / neg_size]:
neg_data[ii] = pos_data[length - 1 - ii / neg_size]
label_data = np.column_stack(
(pos_data.reshape(length, 1), neg_data.reshape(length, neg_size)))
res_label = fluid.LoDTensor()
res_label.set(label_data, place)
res_label.set_lod([lod])
res_pos = fluid.LoDTensor()
res_pos.set(np.zeros([len(flattened_data), 1]).astype("int64"), place)
res_pos.set_lod([lod])
return res, res_pos, res_label
def to_lodtensor_bpr_test(raw_data, vocab_size, place):
""" convert to LODtensor """
data = [dat[0] for dat in raw_data]
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
data = [dat[1] for dat in raw_data]
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res_pos = fluid.LoDTensor()
res_pos.set(flattened_data, place)
res_pos.set_lod([lod])
return res, res_pos
def get_vocab_size(vocab_path):
with open(vocab_path, "r") as rf:
line = rf.readline()
return int(line.strip())
def prepare_data(file_dir,
vocab_path,
batch_size,
......@@ -45,12 +109,10 @@ def prepare_data(file_dir,
batch_size,
batch_size * 20)
else:
reader = sort_batch(
vocab_size = get_vocab_size(vocab_path)
reader = paddle.batch(
test(
file_dir, buffer_size, data_type=DataType.SEQ),
batch_size,
batch_size * 20)
vocab_size = 0
file_dir, buffer_size, data_type=DataType.SEQ), batch_size)
return vocab_size, reader
......@@ -103,6 +165,7 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False):
class DataType(object):
SEQ = 2
def reader_creator(file_dir, n, data_type):
def reader():
files = os.listdir(file_dir)
......@@ -118,10 +181,13 @@ def reader_creator(file_dir, n, data_type):
yield src_seq, trg_seq
else:
assert False, 'error data type'
return reader
def train(train_dir, n, data_type=DataType.SEQ):
return reader_creator(train_dir, n, data_type)
def test(test_dir, n, data_type=DataType.SEQ):
return reader_creator(test_dir, n, data_type)
......@@ -3,31 +3,47 @@
## Introduction
In news recommendation scenarios, different from traditional systems that recommend entertainment items such as movies or music, there are several new problems to solve.
- Very sparse user profile features exist that a user may login a news recommendation app anonymously and a user is likely to read a fresh news item.
- News are generated or disappeared very fast compare with movies or musics. Usually, there will be thousands of news generated in a news recommendation app. The Consumption of news is also fast since users care about newly happened things.
- News are generated or disappeared very fast compare with movies or musics. Usually, there will be thousands of news generated in a news recommendation app. The Consumption of news is also fast since users care about newly happened things.
- User interests may change frequently in the news recommendation setting. The content of news will affect users' reading behaviors a lot even the category of the news does not belong to users' long-term interest. In news recommendation, reading behaviors are determined by both short-term interest and long-term interest of users.
[GRU4Rec](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec) models a user's short-term and long-term interest by applying a gated-recurrent-unit on the user's reading history. The generalization ability of recurrent neural network captures users' similarity of reading sequences that alleviates the user profile sparsity problem. However, the paper of GRU4Rec operates on close domain of items that the model predicts which item a user will be interested in through classification method. In news recommendation, news items are dynamic through time that GRU4Rec model can not predict items that do not exist in training dataset.
Sequence Semantic Retrieval(SSR) Model shares the similar idea with Multi-Rate Deep Learning for Temporal Recommendation, SIGIR 2016. Sequence Semantic Retrieval Model has two components, one is the matching model part, the other one is the retrieval part.
- The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset.
- The idea of SSR is to model a user's personalized interest of an item through matching model structure, and the representation of a news item can be computed online even the news item does not exist in training dataset.
- With the representation of news items, we are able to build an vector indexing service online for news prediction and this is the retrieval part of SSR.
## Dataset
Dataset preprocessing follows the method of [GRU4Rec Project](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec). Note that you should reuse scripts from GRU4Rec project for data preprocessing.
## Training
Before training, you should set PYTHONPATH environment
The command line options for training can be listed by `python train.py -h`
gpu 单机单卡训练
``` bash
CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data --use_cuda 1 --batch_size 50 --model_dir model_output
```
export PYTHONPATH=./models/fluid:$PYTHONPATH
cpu 单机训练
``` bash
python train.py --train_dir train_data --use_cuda 0 --batch_size 50 --model_dir model_output
```
The command line options for training can be listed by `python train.py -h`
gpu 单机多卡训练
``` bash
python train.py --train_file rsc15_train_tr_paddle.txt
CUDA_VISIBLE_DEVICES=0,1 python train.py --train_dir train_data --use_cuda 1 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 2
```
## Build Index
TBA
cpu 单机多卡训练
``` bash
CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
```
多机训练 参考fluid/PaddleRec/gru4rec下的配置
## Retrieval
TBA
## Inference
gpu 预测
``` bash
CUDA_VISIBLE_DEVICES=0 python infer.py --test_dir test_data --use_cuda 1 --batch_size 50 --model_dir model_output
```
import sys
import argparse
import time
import math
import unittest
import contextlib
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import utils
import nets as net
def parse_args():
parser = argparse.ArgumentParser("ssr benchmark.")
parser.add_argument(
'--test_dir', type=str, default='test_data', help='test file address')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab path')
parser.add_argument(
'--start_index', type=int, default='1', help='start index')
parser.add_argument(
'--last_index', type=int, default='10', help='end index')
parser.add_argument(
'--model_dir', type=str, default='model_output', help='model dir')
parser.add_argument(
'--use_cuda', type=int, default='0', help='whether use cuda')
parser.add_argument(
'--batch_size', type=int, default='50', help='batch_size')
parser.add_argument(
'--hid_size', type=int, default='128', help='hidden size')
parser.add_argument(
'--emb_size', type=int, default='128', help='embedding size')
args = parser.parse_args()
return args
def model(vocab_size, emb_size, hidden_size):
user_data = fluid.layers.data(
name="user", shape=[1], dtype="int64", lod_level=1)
all_item_data = fluid.layers.data(
name="all_item", shape=[vocab_size, 1], dtype="int64")
user_emb = fluid.layers.embedding(
input=user_data, size=[vocab_size, emb_size], param_attr="emb.item")
all_item_emb = fluid.layers.embedding(
input=all_item_data, size=[vocab_size, emb_size], param_attr="emb.item")
all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_size])
user_encoder = net.GrnnEncoder(hidden_size=hidden_size)
user_enc = user_encoder.forward(user_emb)
user_hid = fluid.layers.fc(input=user_enc,
size=hidden_size,
param_attr='user.w',
bias_attr="user.b")
user_exp = fluid.layers.expand(x=user_hid, expand_times=[1, vocab_size])
user_re = fluid.layers.reshape(x=user_exp, shape=[-1, hidden_size])
all_item_hid = fluid.layers.fc(input=all_item_emb_re,
size=hidden_size,
param_attr='item.w',
bias_attr="item.b")
cos_item = fluid.layers.cos_sim(X=all_item_hid, Y=user_re)
all_pre_ = fluid.layers.reshape(x=cos_item, shape=[-1, vocab_size])
pos_label = fluid.layers.data(name="pos_label", shape=[1], dtype="int64")
acc = fluid.layers.accuracy(input=all_pre_, label=pos_label, k=20)
return acc
def infer(args, vocab_size, test_reader):
""" inference function """
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
emb_size = args.emb_size
hid_size = args.hid_size
batch_size = args.batch_size
model_path = args.model_dir
with fluid.scope_guard(fluid.core.Scope()):
main_program = fluid.Program()
start_up_program = fluid.Program()
with fluid.program_guard(main_program, start_up_program):
acc = model(vocab_size, emb_size, hid_size)
for epoch in xrange(start_index, last_index + 1):
copy_program = main_program.clone()
model_path = model_dir + "/epoch_" + str(epoch)
fluid.io.load_params(
executor=exe, dirname=model_path, main_program=copy_program)
accum_num_recall = 0.0
accum_num_sum = 0.0
t0 = time.time()
step_id = 0
for data in test_reader():
step_id += 1
user_data, pos_label = utils.infer_data(data, place)
all_item_numpy = np.tile(
np.arange(vocab_size), len(pos_label)).reshape(
len(pos_label), vocab_size, 1)
para = exe.run(copy_program,
feed={
"user": user_data,
"all_item": all_item_numpy,
"pos_label": pos_label
},
fetch_list=[acc.name],
return_numpy=False)
acc_ = para[0]._get_float_element(0)
data_length = len(
np.concatenate(
pos_label, axis=0).astype("int64"))
accum_num_sum += (data_length)
accum_num_recall += (data_length * acc_)
if step_id % 1 == 0:
print("step:%d " % (step_id),
accum_num_recall / accum_num_sum)
t1 = time.time()
print("model:%s recall@20:%.3f time_cost(s):%.2f" %
(model_path, accum_num_recall / accum_num_sum, t1 - t0))
if __name__ == "__main__":
args = parse_args()
start_index = args.start_index
last_index = args.last_index
test_dir = args.test_dir
model_dir = args.model_dir
batch_size = args.batch_size
vocab_path = args.vocab_path
use_cuda = True if args.use_cuda else False
print("start index: ", start_index, " last_index:", last_index)
test_reader, vocab_size = utils.construct_test_data(
test_dir, vocab_path, batch_size=args.batch_size)
infer(args, vocab_size, test_reader=test_reader)
......@@ -17,35 +17,60 @@ import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
import paddle.fluid.layers.io as io
from PaddleRec.multiview_simnet.nets import BowEncoder
from PaddleRec.multiview_simnet.nets import GrnnEncoder
class BowEncoder(object):
""" bow-encoder """
def __init__(self):
self.param_name = ""
def forward(self, emb):
return nn.sequence_pool(input=emb, pool_type='sum')
class GrnnEncoder(object):
""" grnn-encoder """
def __init__(self, param_name="grnn", hidden_size=128):
self.param_name = param_name
self.hidden_size = hidden_size
def forward(self, emb):
fc0 = nn.fc(input=emb,
size=self.hidden_size * 3,
param_attr=self.param_name + "_fc.w",
bias_attr=False)
gru_h = nn.dynamic_gru(
input=fc0,
size=self.hidden_size,
is_reverse=False,
param_attr=self.param_name + ".param",
bias_attr=self.param_name + ".bias")
return nn.sequence_pool(input=gru_h, pool_type='max')
class PairwiseHingeLoss(object):
def __init__(self, margin=0.8):
self.margin = margin
def forward(self, pos, neg):
loss_part1 = nn.elementwise_sub(
tensor.fill_constant_batch_size_like(
input=pos,
shape=[-1, 1],
value=self.margin,
dtype='float32'),
input=pos, shape=[-1, 1], value=self.margin, dtype='float32'),
pos)
loss_part2 = nn.elementwise_add(loss_part1, neg)
loss_part3 = nn.elementwise_max(
tensor.fill_constant_batch_size_like(
input=loss_part2,
shape=[-1, 1],
value=0.0,
dtype='float32'),
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
return loss_part3
class SequenceSemanticRetrieval(object):
""" sequence semantic retrieval model """
def __init__(self, embedding_size, embedding_dim, hidden_size):
self.embedding_size = embedding_size
self.embedding_dim = embedding_dim
......@@ -54,48 +79,44 @@ class SequenceSemanticRetrieval(object):
self.user_encoder = GrnnEncoder(hidden_size=hidden_size)
self.item_encoder = BowEncoder()
self.pairwise_hinge_loss = PairwiseHingeLoss()
def get_correct(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
correct = nn.reduce_sum(less)
return correct
def train(self):
user_data = io.data(
name="user", shape=[1], dtype="int64", lod_level=1
)
user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1)
pos_item_data = io.data(
name="p_item", shape=[1], dtype="int64", lod_level=1
)
name="p_item", shape=[1], dtype="int64", lod_level=1)
neg_item_data = io.data(
name="n_item", shape=[1], dtype="int64", lod_level=1
)
name="n_item", shape=[1], dtype="int64", lod_level=1)
user_emb = nn.embedding(
input=user_data, size=self.emb_shape, param_attr="emb.item"
)
input=user_data, size=self.emb_shape, param_attr="emb.item")
pos_item_emb = nn.embedding(
input=pos_item_data, size=self.emb_shape, param_attr="emb.item"
)
input=pos_item_data, size=self.emb_shape, param_attr="emb.item")
neg_item_emb = nn.embedding(
input=neg_item_data, size=self.emb_shape, param_attr="emb.item"
)
input=neg_item_data, size=self.emb_shape, param_attr="emb.item")
user_enc = self.user_encoder.forward(user_emb)
pos_item_enc = self.item_encoder.forward(pos_item_emb)
neg_item_enc = self.item_encoder.forward(neg_item_emb)
user_hid = nn.fc(
input=user_enc, size=self.hidden_size, param_attr='user.w', bias_attr="user.b"
)
pos_item_hid = nn.fc(
input=pos_item_enc, size=self.hidden_size, param_attr='item.w', bias_attr="item.b"
)
neg_item_hid = nn.fc(
input=neg_item_enc, size=self.hidden_size, param_attr='item.w', bias_attr="item.b"
)
user_hid = nn.fc(input=user_enc,
size=self.hidden_size,
param_attr='user.w',
bias_attr="user.b")
pos_item_hid = nn.fc(input=pos_item_enc,
size=self.hidden_size,
param_attr='item.w',
bias_attr="item.b")
neg_item_hid = nn.fc(input=neg_item_enc,
size=self.hidden_size,
param_attr='item.w',
bias_attr="item.b")
cos_pos = nn.cos_sim(user_hid, pos_item_hid)
cos_neg = nn.cos_sim(user_hid, neg_item_hid)
hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg)
avg_cost = nn.mean(hinge_loss)
correct = self.get_correct(cos_neg, cos_pos)
return [user_data, pos_item_data, neg_item_data], \
pos_item_hid, neg_item_hid, avg_cost, correct
return [user_data, pos_item_data,
neg_item_data], cos_pos, avg_cost, correct
......@@ -14,19 +14,22 @@
import random
class Dataset:
def __init__(self):
pass
class Vocab:
def __init__(self):
pass
class YoochooseVocab(Vocab):
def __init__(self):
self.vocab = {}
self.word_array = []
def load(self, filelist):
idx = 0
for f in filelist:
......@@ -47,21 +50,16 @@ class YoochooseVocab(Vocab):
def _get_word_array(self):
return self.word_array
class YoochooseDataset(Dataset):
def __init__(self, y_vocab):
self.vocab_size = len(y_vocab.get_vocab())
self.word_array = y_vocab._get_word_array()
self.vocab = y_vocab.get_vocab()
def __init__(self, vocab_size):
self.vocab_size = vocab_size
def sample_neg(self):
return random.randint(0, self.vocab_size - 1)
def sample_neg_from_seq(self, seq):
return seq[random.randint(0, len(seq) - 1)]
# TODO(guru4elephant): wait memory, should be improved
def sample_from_word_freq(self):
return self.word_array[random.randint(0, len(self.word_array) - 1)]
def _reader_creator(self, filelist, is_train):
def reader():
......@@ -72,23 +70,20 @@ class YoochooseDataset(Dataset):
ids = line.strip().split()
if len(ids) <= 1:
continue
conv_ids = [self.vocab[i] if i in self.vocab else 0 for i in ids]
# random select an index as boundary
# make ids before boundary as sequence
# make id next to boundary right as target
boundary = random.randint(1, len(ids) - 1)
conv_ids = [i for i in ids]
boundary = len(ids) - 1
src = conv_ids[:boundary]
pos_tgt = [conv_ids[boundary]]
if is_train:
neg_tgt = [self.sample_from_word_freq()]
neg_tgt = [self.sample_neg()]
yield [src, pos_tgt, neg_tgt]
else:
yield [src, pos_tgt]
return reader
def train(self, file_list):
return self._reader_creator(file_list, True)
def test(self, file_list):
return self._reader_creator(file_list, False)
0 16
475 473 155
491 21
96 185 96
29 14 13
5 481 11 21 470
70 5 70 11
167 42 167 217
72 15 73 161 172
82 82
97 297 97
193 182 186 183 184 177 214
152 152
163 298 7
39 73 71
490 23 23 496 488 74 23 74 486 23 23 74
17 17
170 170 483 444 443 234
25 472
5 5 11 70 69
149 149 455
356 68 477 468 17 479 66
159 172 6 71 6 6 158 13 494 169
155 44 438 144 500
156 9 9
146 146
173 10 10 461
7 6 6
269 48 268
50 100
323 174 18
69 69 22 98
38 171
22 29 489 10
0 0
11 5
29 13 14 232 231 451 289 452 229
260 11 156
166 160 166 39
223 134 134 420
66 401 68 132 17 84 287 5
39 304
65 84 132
400 211
145 144
16 28 254 48 50 100 42 154 262 133 17
0 0
28 28
11 476 464
61 61 86 86
38 38
463 478
437 265
22 39 485 171 98
434 51 344
16 16
67 67 67 448
22 12 161
15 377 147 147 374
119 317 0
38 484
403 499
432 442
28 0 16 50 465 42
163 487 7 162
99 99 325 423 83 83
154 133
5 37 492 235 160 279
10 10 457 493 10 460
441 4 4 4 4 4 4 4
153 153
159 164 164
328 37
65 65 404 347 431 459
80 80 44 44
61 446
162 495 7 453
157 21 204 68 37 66 469 145
37 151 230 206 240 205 264 87 409 87 288 270 280 329 157 296 454 474
430 445 433
449 14
9 9 9 9
440 238 226
148 148
266 267 181
48 498
263 255 256
458 158 7
72 168 12 165 71 73 173 49
0 0
7 7 6
14 29 13 6 15 14 15 13
480 439 21
450 21 151
12 12 49 14 13 165 12 169 72 15 15
91 91
22 12 49 168
497 101 30 411 30 482 30 53 30 101 176 415 53 447
462 150 150
471 456 131 435 131 467 436 412 227 218 190 466 429 213 326
......@@ -13,87 +13,108 @@
# limitations under the License.
import os
import sys
import time
import argparse
import logging
import paddle.fluid as fluid
import paddle
import reader as reader
import utils
import numpy as np
from nets import SequenceSemanticRetrieval
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser("sequence semantic retrieval")
parser.add_argument("--train_file", type=str, help="Training file")
parser.add_argument("--valid_file", type=str, help="Validation file")
parser.add_argument(
"--epochs", type=int, default=10, help="Number of epochs for training")
"--train_dir", type=str, default='train_data', help="Training file")
parser.add_argument(
"--base_lr", type=float, default=0.01, help="learning rate")
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
parser.add_argument(
"--epochs", type=int, default=10, help="Number of epochs")
parser.add_argument(
'--parallel', type=int, default=0, help='whether parallel')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
"--model_output_dir",
type=str,
default='model_output',
help="Model output folder")
'--model_dir', type=str, default='model_output', help='model dir')
parser.add_argument(
"--sequence_encode_dim",
type=int,
default=128,
help="Dimension of sequence encoder output")
"--hidden_size", type=int, default=128, help="hidden size")
parser.add_argument(
"--matching_dim",
type=int,
default=128,
help="Dimension of hidden layer")
"--batch_size", type=int, default=50, help="number of batch")
parser.add_argument(
"--batch_size", type=int, default=128, help="Batch size for training")
"--embedding_dim", type=int, default=128, help="embedding dim")
parser.add_argument(
"--embedding_dim",
type=int,
default=128,
help="Default Dimension of Embedding")
'--num_devices', type=int, default=1, help='Number of GPU devices')
return parser.parse_args()
def start_train(args):
y_vocab = reader.YoochooseVocab()
y_vocab.load([args.train_file])
logger.info("Load yoochoose vocabulary size: {}".format(len(y_vocab.get_vocab())))
y_data = reader.YoochooseDataset(y_vocab)
train_reader = paddle.batch(
paddle.reader.shuffle(
y_data.train([args.train_file]), buf_size=args.batch_size * 100),
batch_size=args.batch_size)
place = fluid.CPUPlace()
ssr = SequenceSemanticRetrieval(
len(y_vocab.get_vocab()), args.embedding_dim, args.matching_dim
)
input_data, user_rep, item_rep, avg_cost, acc = ssr.train()
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
def get_cards(args):
return args.num_devices
def train(args):
use_cuda = True if args.use_cuda else False
parallel = True if args.parallel else False
print("use_cuda:", use_cuda, "parallel:", parallel)
train_reader, vocab_size = utils.construct_train_data(
args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
args.hidden_size)
# Train program
train_input_data, cos_pos, avg_cost, acc = ssr.train()
# Optimization to minimize lost
optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
optimizer.minimize(avg_cost)
startup_program = fluid.default_startup_program()
loop_program = fluid.default_main_program()
data_list = [var.name for var in input_data]
data_list = [var.name for var in train_input_data]
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
exe = fluid.Executor(place)
exe.run(startup_program)
exe.run(fluid.default_startup_program())
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=avg_cost.name)
else:
train_exe = exe
total_time = 0.0
for pass_id in range(args.epochs):
epoch_idx = pass_id + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
for batch_id, data in enumerate(train_reader()):
loss_val, correct_val = exe.run(loop_program,
feed=feeder.feed(data),
fetch_list=[avg_cost, acc])
logger.info("Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
format(pass_id, batch_id, loss_val,
float(correct_val) / args.batch_size))
fluid.io.save_inference_model(args.model_output_dir,
[var.name for val in input_data],
[user_rep, item_rep, avg_cost, acc], exe)
i += 1
loss_val, correct_val = train_exe.run(
feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
if i % args.print_batch == 0:
logger.info(
"Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
format(pass_id, batch_id,
np.mean(loss_val),
float(np.mean(correct_val)) / args.batch_size))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
fluid.io.save_params(executor=exe, dirname=save_dir)
print("model saved in %s" % save_dir)
def main():
args = parse_args()
start_train(args)
train(args)
if __name__ == "__main__":
main()
197 196 198 236
93 93 384 362 363 43
336 364 407
421 322
314 388
128 58
138 138
46 46 46
34 34 57 57 57 342 228 321 346 357 59 376
110 110
135 94 135
27 250 27
129 118
18 18 18
81 81 89 89
27 27
20 20 20 20 20 212
33 33 33 33
62 62 62 63 63 55 248 124 381 428 383 382 43 43 261 63
90 90 78 78
399 397 202 141 104 104 245 192 191 271
239 332 283 88
187 313
136 136 324
41 41
352 128
413 414
410 45 45 45 1 1 1 1 1 1 1 1 31 31 31 31
92 334 92
95 285
215 249
390 41
116 116
300 252
2 2 2 2 2
8 8 8 8 8 8
53 241 259
118 129 126 94 137 208 216 299
209 368 139 418 419
311 180
303 302 203 284
369 32 32 32 32 337
207 47 47 47
106 107
143 143
179 178
109 109
405 79 79 371 246
251 417 427
333 88 387 358 123 348 394 360 36 365
3 3 3 3 3
189 188
398 425
107 406
281 201 141
2 2 2
359 54
395 385 293
60 60 60 121 121 233 58 58
24 199 175 24 24 24 351 386 106
115 294
122 122 127 127
35 35
282 393
277 140 140 343 225 123 36 36 36 221 114 114 59 59 117 117 247 367 219 258 222 301 375 350 353 111 111
275 272 273 274 331 330 305 108 76 76 108
26 26 26 408 26
290 18 210 291
372 139 424 113
341 340 335
120 370
224 200
426 416
137 319
402 55
54 54
327 119
125 125
391 396 354 355 389
142 142
295 320
113 366
253 85 85
56 56 310 309 308 307 278 25 25 19 19 3 312 19 19 19 3 25
220 338
34 130
130 120 380 315
339 422
379 378
95 56 392 115
55 124
126 34
349 373 361
195 194
75 75
64 64 64
35 35
40 40 40 242 77 244 77 243
257 316
103 306 102 51 52 103 105 52 52 292 318 112 286 345 237 276 112 51 102 105
import numpy as np
import reader as reader
import os
import logging
import paddle.fluid as fluid
import paddle
def get_vocab_size(vocab_path):
with open(vocab_path, "r") as rf:
line = rf.readline()
return int(line.strip())
def construct_train_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size)
train_reader = paddle.batch(
paddle.reader.shuffle(
y_data.train(files), buf_size=batch_size * 100),
batch_size=batch_size)
return train_reader, vocab_size
def construct_test_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size)
test_reader = paddle.batch(y_data.test(files), batch_size=batch_size)
return test_reader, vocab_size
def infer_data(raw_data, place):
data = [dat[0] for dat in raw_data]
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
p_label = [dat[1] for dat in raw_data]
pos_label = np.array(p_label).astype("int64").reshape(len(p_label), 1)
return res, pos_label
......@@ -10,12 +10,16 @@ import paddle.fluid as fluid
import paddle
import utils
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser = argparse.ArgumentParser("tagspace benchmark.")
parser.add_argument(
'--test_dir', type=str, default='test_data', help='test file address')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab path')
'--vocab_tag_path',
type=str,
default='vocab_tag.txt',
help='vocab path')
parser.add_argument(
'--start_index', type=int, default='1', help='start index')
parser.add_argument(
......@@ -29,6 +33,7 @@ def parse_args():
args = parser.parse_args()
return args
def infer(test_reader, vocab_tag, use_cuda, model_path):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -39,7 +44,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
model_path, exe)
t0 = time.time()
step_id = 0
true_num = 0
true_num = 0
all_num = 0
size = vocab_tag
value = []
......@@ -48,13 +53,11 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
lod_tag = utils.to_lodtensor([dat[1] for dat in data], place)
lod_pos_tag = utils.to_lodtensor([dat[2] for dat in data], place)
para = exe.run(
infer_program,
feed={
"text": lod_text_seq,
"pos_tag": lod_tag},
fetch_list=fetch_vars,
return_numpy=False)
para = exe.run(infer_program,
feed={"text": lod_text_seq,
"pos_tag": lod_tag},
fetch_list=fetch_vars,
return_numpy=False)
value.append(para[0]._get_float_element(0))
if step_id % size == 0 and step_id > 1:
all_num += 1
......@@ -66,6 +69,7 @@ def infer(test_reader, vocab_tag, use_cuda, model_path):
print(step_id, 1.0 * true_num / all_num)
t1 = time.time()
if __name__ == "__main__":
args = parse_args()
start_index = args.start_index
......@@ -75,11 +79,20 @@ if __name__ == "__main__":
batch_size = args.batch_size
vocab_tag_path = args.vocab_tag_path
use_cuda = True if args.use_cuda else False
print("start index: ", start_index, " last_index:" ,last_index)
print("start index: ", start_index, " last_index:", last_index)
vocab_text, vocab_tag, test_reader = utils.prepare_data(
test_dir, "", vocab_tag_path, batch_size=1,
neg_size=0, buffer_size=1000, is_train=False)
test_dir,
"",
vocab_tag_path,
batch_size=1,
neg_size=0,
buffer_size=1000,
is_train=False)
for epoch in range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
infer(test_reader=test_reader, vocab_tag=vocab_tag, use_cuda=False, model_path=epoch_path)
infer(
test_reader=test_reader,
vocab_tag=vocab_tag,
use_cuda=False,
model_path=epoch_path)
......@@ -13,16 +13,17 @@ import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("TagSpace benchmark.")
parser.add_argument(
'--neg_size', type=int, default=3, help='neg/pos ratio')
'--neg_size', type=int, default=3, help='number of neg item')
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
'--train_dir', type=str, default='train_data', help='train file')
parser.add_argument(
'--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
'--vocab_text_path', type=str, default='vocab_text.txt', help='text')
parser.add_argument(
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_text file address')
'--vocab_tag_path', type=str, default='vocab_tag.txt', help='tag')
parser.add_argument(
'--model_dir', type=str, default='model_', help='model dir')
parser.add_argument(
......@@ -30,7 +31,7 @@ def parse_args():
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
'--pass_num', type=int, default=10, help='number of epoch')
parser.add_argument(
'--use_cuda', type=int, default=0, help='whether use gpu')
parser.add_argument(
......@@ -42,9 +43,11 @@ def parse_args():
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
......@@ -56,15 +59,19 @@ def train():
batch_size = args.batch_size
neg_size = args.neg_size
print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} "
.format(use_cuda, parallel, batch_size, neg_size))
.format(use_cuda, parallel, batch_size, neg_size))
vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
file_dir=train_dir, vocab_text_path=vocab_text_path,
vocab_tag_path=vocab_tag_path, neg_size=neg_size,
batch_size=batch_size * get_cards(args),
buffer_size=batch_size*100, is_train=True)
file_dir=train_dir,
vocab_text_path=vocab_text_path,
vocab_tag_path=vocab_tag_path,
neg_size=neg_size,
batch_size=batch_size * get_cards(args),
buffer_size=batch_size * 100,
is_train=True)
""" train network """
# Train program
avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
avg_cost, correct, cos_pos = net.network(
vocab_text_size, vocab_tag_size, neg_size=neg_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
......@@ -76,11 +83,10 @@ def train():
exe.run(fluid.default_startup_program())
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=avg_cost.name)
use_cuda=use_cuda, loss_name=avg_cost.name)
else:
train_exe = exe
pass_num = args.pass_num
model_dir = args.model_dir
fetch_list = [avg_cost.name]
......@@ -94,15 +100,18 @@ def train():
lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
loss_val, correct_val = train_exe.run(
feed={
"text": lod_text_seq,
"pos_tag": lod_pos_tag,
"neg_tag": lod_neg_tag},
fetch_list=[avg_cost.name, correct.name])
feed={
"text": lod_text_seq,
"pos_tag": lod_pos_tag,
"neg_tag": lod_neg_tag
},
fetch_list=[avg_cost.name, correct.name])
if batch_id % args.print_batch == 0:
print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
.format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
float(np.sum(correct_val)) / (args.num_devices*batch_size)))
.format(pass_idx, (batch_id + 10) * batch_size,
np.mean(loss_val),
float(np.sum(correct_val)) / (args.num_devices *
batch_size)))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
......@@ -110,8 +119,10 @@ def train():
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["text", "pos_tag"]
fetch_vars = [cos_pos]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
train_exe)
print("finish training")
if __name__ == "__main__":
train()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册