deep_fm调用自己的数据时报错
Created by: Ericzhuu
我们的数据已通过 /deep_fm/preprocess.py 进行预处理，但在调用 train 函数时报错，错误日志如下图所示。
源代码如下：
import paddle.v2 as paddle
# Hyper-parameters shared by the network definition and the training loop.
factor_size = 5  # passed to DeepFM() as the factor / embedding width
batch_size = 128
num_passes = 10
dense_feature_dim = 30  # width of the "dense_input" data layer
sparse_feature_dim = 269  # width of "sparse_input" and value range of C1..C29

# Maps every data-layer name to the position of its value inside one reader
# sample, laid out as [dense_input, sparse_input, C1, ..., C29, label].
feeding = {'dense_input': 0, 'sparse_input': 1}
feeding.update(('C%d' % slot, slot + 1) for slot in range(1, 30))
feeding['label'] = 31
class Dataset(object):
    """Reader factory for tab-separated DeepFM sample files.

    Every non-blank line is expected to hold tab-separated columns:
    column 0 is a comma-separated list of floats (dense features), column 1
    is a comma-separated list of ints (sparse feature ids) and, for labelled
    data, column 2 is an integer label.
    """

    def _reader_creator(self, path, is_infer):
        """Return a zero-argument generator function over the file at `path`.

        Each yielded sample is
        ``[dense, sparse] + sparse + [[label]]`` when `is_infer` is false and
        ``[dense, sparse] + sparse`` otherwise, i.e. the sparse ids appear
        both as one list and as individual trailing fields.
        """

        def reader():
            with open(path, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    # A blank (e.g. trailing) line would otherwise crash in
                    # float('') below.
                    if not line.strip():
                        continue
                    features = line.split('\t')
                    # Real lists are required: the sample is built by list
                    # concatenation, which fails on lazy map() objects.
                    dense_feature = [float(v) for v in features[0].split(',')]
                    sparse_feature = [int(v) for v in features[1].split(',')]
                    sample = [dense_feature, sparse_feature] + sparse_feature
                    if not is_infer:
                        sample.append([int(features[2])])
                    yield sample

        return reader

    def train(self, path):
        """Return a reader over labelled training samples."""
        return self._reader_creator(path, False)

    def test(self, path):
        """Return a reader over labelled evaluation samples."""
        # Evaluation data still carries the label column, hence is_infer=False.
        return self._reader_creator(path, False)
def fm_layer(input, factor_size, fm_param_attr):
    """Build one factorization-machine branch on top of `input`.

    Creates a size-1 linear fc layer and a factorization_machine layer over
    the same input and returns their sum (addto layer, linear activation,
    no bias).

    :param input: layer feeding both terms.
    :param factor_size: forwarded to paddle.layer.factorization_machine.
    :param fm_param_attr: parameter attribute of the factorization_machine
        layer.
    :return: the addto layer combining both terms.
    """
    linear = paddle.activation.Linear

    # The fc layer is created before the factorization_machine layer, same
    # as in the original definition order.
    linear_term = paddle.layer.fc(input=input, size=1, act=linear())
    pairwise_term = paddle.layer.factorization_machine(
        input=input,
        factor_size=factor_size,
        act=linear(),
        param_attr=fm_param_attr)

    return paddle.layer.addto(
        input=[linear_term, pairwise_term], act=linear(), bias_attr=False)
def DeepFM(factor_size, infer=False):
    """Assemble the DeepFM network.

    The network consists of an FM branch over the dense input, an FM branch
    over the sparse binary input, and a three-layer (400-unit, ReLU) MLP fed
    by the concatenated per-field embeddings plus the dense input. The three
    branches are merged by a size-1 sigmoid fc layer.

    :param factor_size: factor width of the FM layers and embedding size.
    :param infer: when true, return the prediction layer; otherwise return
        the training cost layer (evaluators are registered as a side effect).
    :return: cost layer (training) or prediction layer (inference).
    """
    dense_input = paddle.layer.data(
        name="dense_input",
        type=paddle.data_type.dense_vector(dense_feature_dim))

    sparse_input = paddle.layer.data(
        name="sparse_input",
        type=paddle.data_type.sparse_binary_vector(sparse_feature_dim))

    # One integer data layer per categorical field, named C1 .. C29.
    sparse_input_ids = [
        paddle.layer.data(
            name="C" + str(i),
            type=paddle.data_type.integer_value(sparse_feature_dim))
        for i in range(1, 30)
    ]

    dense_fm = fm_layer(
        dense_input,
        factor_size,
        fm_param_attr=paddle.attr.Param(name="DenseFeatFactors"))
    sparse_fm = fm_layer(
        sparse_input,
        factor_size,
        fm_param_attr=paddle.attr.Param(name="SparseFeatFactors"))

    def embedding_layer(input):
        # Uses the same parameter name as the sparse FM branch
        # ("SparseFeatFactors").
        return paddle.layer.embedding(
            input=input,
            size=factor_size,
            param_attr=paddle.attr.Param(name="SparseFeatFactors"))

    # Build a real list (not a lazy map object) for concat.
    sparse_embed_seq = [embedding_layer(ids) for ids in sparse_input_ids]
    sparse_embed = paddle.layer.concat(sparse_embed_seq)

    fc1 = paddle.layer.fc(input=[sparse_embed, dense_input],
                          size=400,
                          act=paddle.activation.Relu())
    fc2 = paddle.layer.fc(input=fc1, size=400, act=paddle.activation.Relu())
    fc3 = paddle.layer.fc(input=fc2, size=400, act=paddle.activation.Relu())

    predict = paddle.layer.fc(input=[dense_fm, sparse_fm, fc3],
                              size=1,
                              act=paddle.activation.Sigmoid())

    if infer:
        return predict

    # The reader in this file yields the label as a one-element list
    # ([label]) holding 0 or 1. That is a 1-dimensional dense vector, not a
    # bare integer; integer_value(1) would additionally restrict the value
    # range to a single id. Declaring dense_vector(1) makes the slot type
    # match the fed data.
    label = paddle.layer.data(
        name="label", type=paddle.data_type.dense_vector(1))

    cost = paddle.layer.multi_binary_label_cross_entropy_cost(
        input=predict, label=label)
    paddle.evaluator.classification_error(
        name="classification_error", input=predict, label=label)
    paddle.evaluator.auc(name="auc", input=predict, label=label)

    return cost
def train():
    """Train the DeepFM model on data/train.txt with Adam on CPU.

    Prints the training cost and metrics every 10 batches and evaluates on
    data/valid.txt every 100 batches. Runs for `num_passes` passes.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    model = DeepFM(factor_size)
    params = paddle.parameters.create(model)
    trainer = paddle.trainer.SGD(
        cost=model, parameters=params, update_equation=optimizer)
    dataset = Dataset()

    def __event_handler__(event):
        # Only end-of-batch events are of interest.
        if not isinstance(event, paddle.event.EndIteration):
            return

        num_samples = event.batch_id * batch_size

        if event.batch_id % 10 == 0:
            print("Pass %d, Batch %d, Samples %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, num_samples, event.cost,
                event.metrics))

        if event.batch_id % 100 == 0:
            valid_reader = paddle.batch(
                dataset.test('data/valid.txt'), batch_size=batch_size)
            result = trainer.test(reader=valid_reader, feeding=feeding)
            print("Test %d-%d, Cost %f, %s" % (
                event.pass_id, event.batch_id, result.cost, result.metrics))
            # Checkpoint saving is currently disabled:
            #path = "{}/model-pass-{}-batch-{}.tar.gz".format(
            #    model_output_dir, event.pass_id, event.batch_id)
            #with gzip.open(path, 'w') as f:
            #    trainer.save_parameter_to_tar(f)

    shuffled_reader = paddle.reader.shuffle(
        dataset.train('data/train.txt'), buf_size=batch_size * 10000)
    train_reader = paddle.batch(shuffled_reader, batch_size=batch_size)

    trainer.train(
        reader=train_reader,
        feeding=feeding,
        event_handler=__event_handler__,
        num_passes=num_passes)
# Start training only when this file is executed as a script.
if __name__ == "__main__":
    train()