paddle v2 个性化推荐代码速度问题
Created by: xlhlhlx
背景
视频内容的个性化推荐
实现方式
仿照paddle的个性化推荐例子(http://book.paddlepaddle.org/05.recommender_system/index.cn.html),回归预测每个用户对video的阅读时长
模型代码如下:
def cluster_data_reader(file_dir, node_id):
def data_reader():
files = os.listdir(file_dir)
t1 = time.time()
print "\n*****begin to reader*****"
for fi in files:
with open(file_dir + '/' + fi, "r") as f:
for line in f:
user_id, time_period, user_location, app_id, content_id, cate_id, title, brief, quality_level, check_in_period, read_time = line.strip().split('\t')
user_id_code = feature_dict['user_id'].gen(user_id)
user_location_code = [feature_dict['user_location'].gen(ul) for ul in user_location.split('|')]
content_id_code = feature_dict['content_id'].gen(int(content_id))
cate_id_code = feature_dict['cate_id'].gen(cate_id)
title_code = [feature_dict['word'].gen(w.lower()) for w in title.split('^')]
brief_code = [feature_dict['word'].gen(w.lower()) for w in brief.split('^')]
check_in_period_code = feature_dict['check_in_period'].gen(int(check_in_period))
record = [user_id_code, int(time_period), user_location_code, content_id_code, cate_id_code, title_code, brief_code, check_in_period_code]
yield record + [[float(read_time)]]
print "cost time of reader is %s" % (time.time()-t1)
print "*****end reader*****"
return data_reader
def get_usr_combined_features():
print "user_id size_%d" % feature_dict['user_id'].size()
print "user_location_%d" % feature_dict['user_location'].size()
uid = paddle.layer.data(
name='user_id',
type=paddle.data_type.integer_value(
feature_dict['user_id'].size()))
usr_emb = paddle.layer.embedding(input=uid, size=16)
usr_fc = paddle.layer.fc(input=usr_emb, size=16)
time_period = paddle.layer.data(
name='time_period',
type=paddle.data_type.integer_value(24))
time_period_emb = paddle.layer.embedding(input=time_period, size=16)
time_period_fc = paddle.layer.fc(input=time_period_emb, size=16)
usr_location = paddle.layer.data(
name='user_location',
type=paddle.data_type.sparse_binary_vector(
feature_dict['user_location'].size()))
usr_location_fc = paddle.layer.fc(input=usr_location, size=32)
usr_combined_features = paddle.layer.fc(
input=[usr_fc, time_period_fc, usr_location_fc],
size=200,
act=paddle.activation.Tanh())
return usr_combined_features
def get_content_combined_features():
content_word_dict = feature_dict['word'].dic
print "content_id size_%d" % feature_dict['content_id'].size()
print "cate_id size_%d" % feature_dict['cate_id'].size()
print "content_word_dict length_%d" % len(content_word_dict)
print "check_in_period size_%d" % feature_dict['check_in_period'].size()
content_id = paddle.layer.data(
name='content_id',
type=paddle.data_type.integer_value(
feature_dict['content_id'].size()))
content_emb = paddle.layer.embedding(input=content_id, size=32)
content_fc = paddle.layer.fc(input=content_emb, size=32)
content_categories = paddle.layer.data(
name='cate_id',
type=paddle.data_type.integer_value(
feature_dict['cate_id'].size()))
content_categories_emb = paddle.layer.embedding(input=content_categories, size=16)
content_categories_fc = paddle.layer.fc(input=content_categories_emb, size=16)
content_title_id = paddle.layer.data(
name='title',
type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
content_title_emb = paddle.layer.embedding(input=content_title_id, size=128)
content_title_conv = paddle.networks.sequence_conv_pool(
input=content_title_emb, hidden_size=128, context_len=2)
content_brief_id = paddle.layer.data(
name='brief',
type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
content_brief_emb = paddle.layer.embedding(input=content_brief_id, size=128)
content_brief_conv = paddle.networks.sequence_conv_pool(
input=content_brief_emb, hidden_size=128, context_len=2)
check_in_period = paddle.layer.data(
name='check_in_period',
type=paddle.data_type.integer_value(
feature_dict['check_in_period'].size()))
check_in_period_emb = paddle.layer.embedding(input=check_in_period, size=32)
check_in_period_fc = paddle.layer.fc(input=check_in_period_emb, size=32)
content_combined_features = paddle.layer.fc(
input=[content_fc, content_categories_fc, content_title_conv, content_brief_conv, check_in_period_fc],
size=200,
act=paddle.activation.Tanh())
return content_combined_features
usr_combined_features = get_usr_combined_features()
content_combined_features = get_content_combined_features()
inference = paddle.layer.cos_sim(
a=usr_combined_features, b=content_combined_features, size=1)
cost = paddle.layer.square_error_cost(
input=inference,
label=paddle.layer.data(
name='read_time', type=paddle.data_type.dense_vector(1)))
def train_model(num_pass):
print("Begin to train model...")
train_reader = paddle.batch(
paddle.reader.shuffle(
cluster_data_reader(cluster_train_dir, node_id), buf_size=8192),
batch_size=100)
test_reader = paddle.batch(
paddle.reader.shuffle(
cluster_data_reader(cluster_test_dir, node_id), buf_size=8192),
batch_size=100)
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=1e-3))
global t
t = time.time()
def event_handler(event):
if isinstance(event, paddle.event.EndPass):
if not os.path.exists("model_params"):
os.makedirs("model_params")
with gzip.open("model_params/video_recomm_pass_%03d.tar.gz" % event.pass_id, 'w') as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
global t
print "Pass %d Batch %d Cost %.5f Time %s" % (
event.pass_id, event.batch_id, event.cost, time.time()-t)
t = time.time()
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_pass)
print("Training finished!")
耗时数据
测试数据:500条,batch_size=100,pass_num=20,将每个batch的耗时打印出来,具体如下: begin to reader cost time of reader is 0.0190041065216 end reader Pass 0 Batch 0 Cost 0.00802 Time 0.0562610626221 Pass 0 Batch 1 Cost 0.01444 Time 2.72469496727 Pass 0 Batch 2 Cost 0.00655 Time 2.34903788567 Pass 0 Batch 3 Cost 0.00880 Time 2.33675003052 Pass 0 Batch 4 Cost 0.00874 Time 2.34233188629
begin to reader cost time of reader is 0.0191640853882 end reader Pass 1 Batch 0 Cost 0.00708 Time 70.6922171116 Pass 1 Batch 1 Cost 0.00345 Time 2.46361804008 Pass 1 Batch 2 Cost 0.00581 Time 2.33501601219 Pass 1 Batch 3 Cost 0.00240 Time 2.33918118477 Pass 1 Batch 4 Cost 0.00394 Time 2.33651804924
begin to reader cost time of reader is 0.0201029777527 end reader Pass 2 Batch 0 Cost 0.00428 Time 64.1795651913 Pass 2 Batch 1 Cost 0.00132 Time 2.34728193283 Pass 2 Batch 2 Cost 0.00151 Time 2.3494591713 Pass 2 Batch 3 Cost 0.00295 Time 2.35163593292 Pass 2 Batch 4 Cost 0.00090 Time 2.34288620949
begin to reader cost time of reader is 0.203039884567 end reader Pass 3 Batch 0 Cost 0.00317 Time 66.1391232014 Pass 3 Batch 1 Cost 0.00065 Time 2.39867901802 Pass 3 Batch 2 Cost 0.00048 Time 2.33365297318 Pass 3 Batch 3 Cost 0.00172 Time 2.33354210854 Pass 3 Batch 4 Cost 0.00055 Time 2.33912920952
问题
1、为什么第一个pass的第一个batch耗时很短(0.056s),后面每个pass的第一个batch耗时都非常长(60+s),核心问题点在哪里呢?是哪个地方的代码写的不好呢?