单机试验个性化推荐报错 TypeError: 'generator' object is not callable
Created by: xlhlhlx
我自己定义了一个 data reader,单独运行该 data reader 时可以正常生成样本,但在运行训练(train)时报错,具体的报错日志如下:
自己写的data reader如下:
#!/usr/bin/python
#encoding=utf8
import sys
import os
import random
class CategoryFeatureGenerator(object):
    """Map categorical keys to dense integer ids.

    Id 0 is reserved for the ``'unk'`` key and is returned for every key that
    was never registered. Newly registered keys receive consecutive ids
    starting at 1, in registration order.
    """

    def __init__(self):
        # 'unk' always owns id 0, so real keys start counting at 1.
        self.dic = {'unk': 0}
        self.counter = 1

    def register(self, key):
        """Assign the next free id to ``key`` unless it already has one."""
        if key not in self.dic:
            self.dic[key] = self.counter
            self.counter += 1

    def size(self):
        """Return the number of known keys, including ``'unk'``."""
        return len(self.dic)

    def gen(self, key):
        """Return the integer id of ``key``.

        Keys that were never registered map to the ``'unk'`` id. Note that
        this is an index, not a one-hot vector.
        """
        return self.dic.get(key, self.dic['unk'])

    def __repr__(self):
        return '<CategoryFeatureGenerator %d>' % len(self.dic)
# Names of the categorical features that each get their own id vocabulary.
feature_fields = [
    'user_id',
    'user_location',
    'content_id',
    'cate_id',
    'word',
    'check_in_period',
]
# Module-level vocabularies, one CategoryFeatureGenerator per feature name.
feature_dict = {name: CategoryFeatureGenerator() for name in feature_fields}
def __init_dataset__(path):
    """Scan the data file at ``path`` and register every feature value.

    Each line is stripped and split on tabs into exactly 11 columns; a line
    with a different column count raises ``ValueError`` from the unpacking.
    The values are registered in the module-level ``feature_dict``
    vocabularies. ``content_id`` and ``check_in_period`` are registered as
    ints, words are lower-cased before registration.
    """
    vocab = feature_dict
    with open(path, "r") as f:
        for raw in f:
            columns = raw.strip().split('\t')
            (user_id, time_period, user_location, app_id, content_id,
             cate_id, title, brief, quality_level, check_in_period,
             read_time) = columns

            vocab['user_id'].register(user_id)
            vocab['content_id'].register(int(content_id))
            # A user may carry several '|'-separated locations.
            for place in user_location.split('|'):
                vocab['user_location'].register(place)
            vocab['cate_id'].register(cate_id)
            # Title and brief share one word vocabulary; title words are
            # registered first so id assignment order is unchanged.
            for text in (title, brief):
                for token in text.split(' '):
                    vocab['word'].register(token.lower())
            vocab['check_in_period'].register(int(check_in_period))
class ReaderData(object):
    """Dataset wrapper that builds the vocabularies and creates sample readers.

    Constructing an instance scans ``data_path`` once to fill the
    module-level ``feature_dict`` vocabularies.

    Args:
        data_path: path of the tab-separated data file.
        test_ratio: fraction of lines assigned to the test split.
        is_test: when True the reader yields the test split, otherwise the
            train split.
        rand_seed: seed of the random generator that assigns lines to a
            split. A fixed seed keeps the split identical on every pass and
            makes the ``is_test=True`` and ``is_test=False`` readers
            complementary.
    """

    def __init__(self, data_path, test_ratio, is_test, rand_seed=0):
        __init_dataset__(data_path)
        self.data_path = data_path
        self.test_ratio = test_ratio
        self.is_test = is_test
        self.rand_seed = rand_seed

    def reader_creator(self):
        """Return a reader function; calling it yields one sample per line.

        Each sample is the list ``[user_id, time_period, user_location ids,
        content_id, cate_id, title word ids, brief word ids,
        check_in_period, [read_time]]``.
        """

        def reader():
            # Re-seed on every call so each pass selects the same lines.
            rand = random.Random(self.rand_seed)
            path = self.data_path
            test_ratio = self.test_ratio
            is_test = self.is_test
            with open(path, "r") as f:
                for line in f:
                    # Draw once per line; keep the line only if it falls
                    # into the requested split.
                    if (rand.random() < test_ratio) != is_test:
                        continue
                    (user_id, time_period, user_location, app_id,
                     content_id, cate_id, title, brief, quality_level,
                     check_in_period, read_time) = line.strip().split('\t')
                    user_id_code = feature_dict['user_id'].gen(user_id)
                    user_location_code = [
                        feature_dict['user_location'].gen(ul)
                        for ul in user_location.split('|')
                    ]
                    content_id_code = feature_dict['content_id'].gen(
                        int(content_id))
                    cate_id_code = feature_dict['cate_id'].gen(cate_id)
                    title_code = [
                        feature_dict['word'].gen(w.lower())
                        for w in title.split(' ')
                    ]
                    brief_code = [
                        feature_dict['word'].gen(w.lower())
                        for w in brief.split(' ')
                    ]
                    check_in_period_code = feature_dict[
                        'check_in_period'].gen(int(check_in_period))
                    record = [
                        user_id_code, int(time_period), user_location_code,
                        content_id_code, cate_id_code, title_code,
                        brief_code, check_in_period_code
                    ]
                    # The label goes last, wrapped as a 1-element vector.
                    yield record + [[float(read_time)]]

        return reader

    def get_content_word_dict(self):
        """Return the word -> id mapping shared by title and brief."""
        return feature_dict['word'].dic

    def user_id_len(self):
        """Return the size of the user id vocabulary (including 'unk')."""
        return feature_dict['user_id'].size()

    def get_user_location_dict(self):
        """Return the user location -> id mapping."""
        return feature_dict['user_location'].dic

    def content_id_len(self):
        """Return the size of the content id vocabulary (including 'unk')."""
        return feature_dict['content_id'].size()

    def category_id_len(self):
        """Return the size of the category id vocabulary (including 'unk')."""
        return feature_dict['cate_id'].size()

    def check_in_period_len(self):
        """Return the size of the check-in period vocabulary (including 'unk')."""
        return feature_dict['check_in_period'].size()
if __name__ == '__main__':
    # Smoke test: build the vocabularies and print the first few samples.
    path = "./videoSample"
    test_ratio = 0.1
    is_test = False
    trainer = ReaderData(path, test_ratio, is_test)
    print(trainer.user_id_len())
    # ReaderData has no read() method. reader_creator() returns the reader
    # function, which must itself be called to obtain the sample generator.
    for no, rcd in enumerate(trainer.reader_creator()()):
        print("%d %s" % (no, rcd))
        if no > 10:
            break
训练模型的代码如下:
#!/usr/bin/python
#encoding=utf8
import paddle.v2 as paddle
import cPickle
import copy
from paddle.v2.dataset.video import feature_dict, ReaderData
# Training split of the dataset; constructing it also builds the vocabularies.
dataset_train = ReaderData(
    data_path="./videoSample", test_ratio=0.1, is_test=False)
def get_usr_combined_features():
    """Build the user-side sub-network and return its fused output layer.

    The user id and the time period each go through an embedding followed by
    a fully connected layer; the user locations go through a fully connected
    layer directly. The three results are fused by a 200-unit tanh layer.
    """
    user_vocab_size = dataset_train.user_id_len()
    uid = paddle.layer.data(
        name='user_id',
        type=paddle.data_type.integer_value(user_vocab_size))
    uid_embedding = paddle.layer.embedding(input=uid, size=32)
    uid_hidden = paddle.layer.fc(input=uid_embedding, size=32)

    period = paddle.layer.data(
        name='time_period', type=paddle.data_type.integer_value(24))
    period_embedding = paddle.layer.embedding(input=period, size=16)
    period_hidden = paddle.layer.fc(input=period_embedding, size=16)

    location_vocab_size = len(dataset_train.get_user_location_dict())
    location = paddle.layer.data(
        name='user_location',
        type=paddle.data_type.sparse_binary_vector(location_vocab_size))
    location_hidden = paddle.layer.fc(input=location, size=32)

    return paddle.layer.fc(
        input=[uid_hidden, period_hidden, location_hidden],
        size=200,
        act=paddle.activation.Tanh())
def get_content_combined_features():
    """Build the content-side sub-network and return its fused output layer.

    Content id, category id and check-in period each go through an embedding
    followed by a fully connected layer. Title and brief are word-id
    sequences that go through an embedding and a sequence conv-pool. All
    five results are fused by a 200-unit tanh layer.
    """
    content_word_dict = dataset_train.get_content_word_dict()

    content_id = paddle.layer.data(
        name='content_id',
        type=paddle.data_type.integer_value(
            dataset_train.content_id_len()))
    content_emb = paddle.layer.embedding(input=content_id, size=32)
    content_fc = paddle.layer.fc(input=content_emb, size=32)

    content_categories = paddle.layer.data(
        name='category_id',
        type=paddle.data_type.integer_value(
            dataset_train.category_id_len()))
    content_categories_emb = paddle.layer.embedding(
        input=content_categories, size=16)
    # Feed the embedding (not the raw integer id layer) into the fc layer,
    # matching how content_id is handled above.
    content_categories_fc = paddle.layer.fc(
        input=content_categories_emb, size=16)

    content_title_id = paddle.layer.data(
        name='title',
        type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
    content_title_emb = paddle.layer.embedding(
        input=content_title_id, size=32)
    content_title_conv = paddle.networks.sequence_conv_pool(
        input=content_title_emb, hidden_size=32, context_len=2)

    content_brief_id = paddle.layer.data(
        name='brief',
        type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
    content_brief_emb = paddle.layer.embedding(
        input=content_brief_id, size=32)
    content_brief_conv = paddle.networks.sequence_conv_pool(
        input=content_brief_emb, hidden_size=32, context_len=2)

    check_in_period = paddle.layer.data(
        name='check_in_period',
        type=paddle.data_type.integer_value(
            dataset_train.check_in_period_len()))
    check_in_period_emb = paddle.layer.embedding(
        input=check_in_period, size=32)
    # Same fix as for the category: the fc layer consumes the embedding.
    check_in_period_fc = paddle.layer.fc(input=check_in_period_emb, size=32)

    content_combined_features = paddle.layer.fc(
        input=[
            content_fc, content_categories_fc, content_title_conv,
            content_brief_conv, check_in_period_fc
        ],
        size=200,
        act=paddle.activation.Tanh())
    return content_combined_features
def main():
    """Train the two-tower model for one pass, then score one hand-made sample.

    The user and content feature layers are compared with a scaled cosine
    similarity, which is regressed against ``read_time`` with an MSE cost.
    After training, a single example is encoded with the ``feature_dict``
    vocabularies and passed to ``paddle.infer``.
    """
    paddle.init(use_gpu=False)
    usr_combined_features = get_usr_combined_features()
    content_combined_features = get_content_combined_features()
    inference = paddle.layer.cos_sim(
        a=usr_combined_features, b=content_combined_features, size=1, scale=5)
    cost = paddle.layer.mse_cost(
        input=inference,
        label=paddle.layer.data(
            name='read_time', type=paddle.data_type.dense_vector(1)))
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Position of each data layer's value inside one reader sample.
    feeding = {
        'user_id': 0,
        'time_period': 1,
        'user_location': 2,
        'content_id': 3,
        'category_id': 4,
        'title': 5,
        'brief': 6,
        'check_in_period': 7,
        'read_time': 8
    }

    def event_handler(event):
        # Report the cost every 100 batches.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print("Pass %d Batch %d Cost %.2f" % (
                    event.pass_id, event.batch_id, event.cost))

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                dataset_train.reader_creator(), buf_size=8192),
            batch_size=256),
        event_handler=event_handler,
        feeding=feeding,
        num_passes=1)

    # Raw values of the sample to score.
    user_id = "123a"
    content_id = 20419555
    time_period = 16
    user_location = "上海市|上海市"
    cate_id = "1001"
    title = "白鹿原 白嘉轩 娶 的 第七任 老婆 仙草 洞房花烛 夜 白嘉轩 跑 了"
    brief = ""
    check_in_period = 3600

    # Encode the raw values exactly as the training reader does. Every code
    # gets its own name so the raw values stay intact for the debug print and
    # every entry of `feature` is defined.
    user_id_code = feature_dict['user_id'].gen(user_id)
    user_location_code = [
        feature_dict['user_location'].gen(ul)
        for ul in user_location.split('|')
    ]
    content_id_code = feature_dict['content_id'].gen(content_id)
    cate_id_code = feature_dict['cate_id'].gen(cate_id)
    title_code = [
        feature_dict['word'].gen(w.lower()) for w in title.split(' ')
    ]
    brief_code = [
        feature_dict['word'].gen(w.lower()) for w in brief.split(' ')
    ]
    check_in_period_code = feature_dict['check_in_period'].gen(
        int(check_in_period))

    print([
        user_id, content_id, time_period, user_location, cate_id, title,
        brief, check_in_period
    ])
    feature = [
        user_id_code, int(time_period), user_location_code, content_id_code,
        cate_id_code, title_code, brief_code, check_in_period_code
    ]
    print(feature)

    # Inference has no label, so drop 'read_time' from the feeding map.
    infer_dict = copy.copy(feeding)
    del infer_dict['read_time']
    prediction = paddle.infer(
        output_layer=inference,
        parameters=parameters,
        input=[feature],
        feeding=infer_dict)
    print(prediction)
# Run training and the sample inference only when executed as a script.
if __name__ == '__main__':
    main()