Segfault when running a modified deep_fm
Created by: xlhlhlx
-Problem
I modified the deep_fm demo to support multi-value features. When running it as an MPI job (job link: http://yq01-hpc-bdl-master02.yq01.baidu.com:8090/job/i-338136/ ), it crashes with a segmentation fault on the second batch of the first pass. The relevant part of the error output is:
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/opt/compiler/gcc-4.8.2/lib/libc.so.6(+0x7354f)[0x7fdf7ee8254f]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/opt/compiler/gcc-4.8.2/lib/libc.so.6(+0x78dbe)[0x7fdf7ee87dbe]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/python2.7/site-packages/py_paddle/_swig_paddle.so(_ZN6paddle25FactorizationMachineLayer8backwardERKSt8functionIFvPNS_9ParameterEEE+0x6bf)[0x7fdf1f9110cf]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/python2.7/site-packages/py_paddle/_swig_paddle.so(_ZN6paddle13NeuralNetwork8backwardERKSt8functionIFvPNS_9ParameterEEE+0x162)[0x7fdf1fa2a852]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/python2.7/site-packages/py_paddle/_swig_paddle.so(_ZN6paddle13TrainerThread8backwardEv+0x42)[0x7fdf1fa4ca62]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/python2.7/site-packages/py_paddle/_swig_paddle.so(_ZN6paddle13TrainerThread13computeThreadEv+0xed)[0x7fdf1fa4cbed]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/opt/compiler/gcc-4.8.2/lib/libstdc++.so.6(+0xb08a0)[0x7fdf709558a0]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/opt/compiler/gcc-4.8.2/lib/libpthread.so.0(+0x81c3)[0x7fdf7f8ce1c3]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:/opt/compiler/gcc-4.8.2/lib/libc.so.6(clone+0x6d)[0x7fdf7eef612d]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:======= Memory map: ========
Mon Jan 8 21:13:05 2018[1,22]<stderr>:00400000-005a0000 r-xp 00000000 08:11 1136281 /home/disk1/normandy/maybach/338136/workspace/python27-gcc482/bin/python2.7
Mon Jan 8 21:13:05 2018[1,22]<stderr>:007a0000-007dc000 rw-p 001a0000 08:11 1136281 /home/disk1/normandy/maybach/338136/workspace/python27-gcc482/bin/python2.7
Mon Jan 8 21:13:05 2018[1,22]<stderr>:007dc000-52701000 rw-p 00000000 00:00 0 [heap]
Mon Jan 8 21:13:05 2018[1,22]<stderr>:3193e00000-3193e8c000 r-xp 00000000 08:11 1136169 /home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/libsqlite3.so.0
Mon Jan 8 21:13:05 2018[1,22]<stderr>:3193e8c000-319408b000 ---p 0008c000 08:11 1136169 /home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/libsqlite3.so.0
Mon Jan 8 21:13:05 2018[1,22]<stderr>:319408b000-319408e000 rw-p 0008b000 08:11 1136169 /home/disk1/normandy/maybach/338136/workspace/python27-gcc482/lib/libsqlite3.so.0
Mon Jan 8 21:13:05 2018[1,22]<stderr>:319408e000-319408f000 rw-p 00000000 00:00 0
Mon Jan 8 21:13:05 2018[1,22]<stderr>:3f0df00000-3f0df87000 r-xp 00000000 08:02 235078 /usr/lib64/libglib-2.0.so.0.400.7
Mon Jan 8 21:13:05 2018[1,22]<stderr>:3f0df87000-3f0e086000 ---p 00
Could this be a problem with my network configuration?
-Code
1. reader
import os
import sys
import cPickle

feature_id_list = {2, 3, 4, 9, 11, 14, 15, 18, 20, 30, 31, 32, 33, 34, 37,
                   38, 39, 40, 43, 44, 47, 48, 51, 54, 55, 56, 57, 58, 59}


class Dataset:
    def _reader_creator(self, dicts, path, is_infer):
        def reader():
            slot_feature_dict = dicts['slot_feature_dict']
            files = os.listdir(path)
            for fi in files:
                with open(path + '/' + fi, "r") as f:
                    UNK = '<unk>'
                    for line in f:
                        line_split = line.rstrip('\n').split('\t')
                        if len(line_split) < 60:
                            continue
                        label = [float(line_split[1])]
                        sparse_feature = []
                        for i in range(2, len(line_split)):
                            if i not in feature_id_list:
                                continue
                            slot_id = i - 2
                            if line_split[i] == '':
                                continue
                            # multi-value field: values are joined with '-'
                            mini_seg_array = line_split[i].split('-')
                            count = 0
                            for mini_seg in mini_seg_array:
                                count = count + 1
                                # item_tag fields: keep only the top 6 values
                                if i == 30 or i == 48:
                                    if count > 6:
                                        continue
                                key = str(slot_id) + ":" + mini_seg
                                if slot_feature_dict.has_key(key):
                                    sparse_feature.append(slot_feature_dict.get(key))
                                else:
                                    # fall back to the <unk> entry of this slot
                                    key = str(slot_id) + ":" + UNK
                                    if slot_feature_dict.has_key(key):
                                        sparse_feature.append(slot_feature_dict.get(key))
                                    else:
                                        continue
                        if not is_infer:
                            yield [sparse_feature] + [sparse_feature] + [label]
                        else:
                            yield [sparse_feature] + [sparse_feature]
        return reader

    def train(self, dicts, path):
        return self._reader_creator(dicts, path, False)

    def test(self, dicts, path):
        return self._reader_creator(dicts, path, False)

    def infer(self, dicts, path):
        return self._reader_creator(dicts, path, True)


if __name__ == "__main__":
    dict_path = sys.argv[1]
    train_path = sys.argv[2]
    slot_feature_dict_path = dict_path + "/" + "slot_feature_dict.pkl"
    with open(slot_feature_dict_path) as f:
        slot_feature_dict = cPickle.load(f)
    dicts = dict()
    dicts['slot_feature_dict'] = slot_feature_dict
    dataset = Dataset()
    for dat in dataset.train(dicts, train_path)():
        print dat

feeding = {
    'sparse_input': 0,
    'deep_input': 1,
    'label': 2
}
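For what it's worth, both data layers in the network below are declared as sparse_binary_vector(sparse_feature_dim), so one thing that can be ruled out on the data side is malformed reader output: empty rows, duplicate indices, or indices outside [0, sparse_feature_dim). Below is a small debugging sketch along those lines; it is not part of the original demo, and the names check_reader and max_samples are made up here. It is only a sanity check, not a claim about the cause of the crash.

# Debugging sketch (not part of the demo): scan the reader output for rows
# that would be malformed as sparse_binary_vector input.
def check_reader(reader_creator, sparse_feature_dim, max_samples=10000):
    for sample_id, dat in enumerate(reader_creator()):
        if sample_id >= max_samples:
            break
        sparse_feature = dat[0]
        if len(sparse_feature) == 0:
            print "sample %d: empty sparse_feature" % sample_id
        if len(set(sparse_feature)) != len(sparse_feature):
            print "sample %d: duplicate indices %s" % (sample_id, sparse_feature)
        for idx in sparse_feature:
            if not (0 <= idx < sparse_feature_dim):
                print "sample %d: index %d out of range" % (sample_id, idx)

# usage inside the __main__ block above, assuming the dictionary size equals
# the sparse_feature_dim passed to DeepFM in the network file:
# check_reader(dataset.train(dicts, train_path), len(slot_feature_dict))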
2. network
import paddle.v2 as paddle


def fm_layer(input, factor_size, fm_param_attr):
    # first-order part: linear weights over the raw sparse features
    first_order = paddle.layer.fc(
        input=input, size=1, act=paddle.activation.Linear())
    # second-order part: pairwise interactions through the latent factors
    second_order = paddle.layer.factorization_machine(
        input=input,
        factor_size=factor_size,
        act=paddle.activation.Linear(),
        param_attr=fm_param_attr)
    out = paddle.layer.addto(
        input=[first_order, second_order],
        act=paddle.activation.Linear(),
        bias_attr=False)
    return out


def DeepFM(factor_size, sparse_feature_dim, infer=False):
    sparse_input = paddle.layer.data(
        name="sparse_input",
        type=paddle.data_type.sparse_binary_vector(sparse_feature_dim))
    sparse_fm = fm_layer(
        sparse_input,
        factor_size,
        fm_param_attr=paddle.attr.Param(name="SparseFeatFactors"))
    deep_input = paddle.layer.data(
        name="deep_input",
        type=paddle.data_type.sparse_binary_vector(sparse_feature_dim))
    # the deep part reuses the FM latent factors through the shared
    # parameter "SparseFeatFactors"
    deep_embedding = paddle.layer.fc(
        input=deep_input,
        size=factor_size,
        param_attr=paddle.attr.Param(name="SparseFeatFactors"))
    fc1 = paddle.layer.fc(
        input=deep_embedding,
        size=256,
        act=paddle.activation.Relu())
    fc2 = paddle.layer.fc(input=fc1, size=128, act=paddle.activation.Relu())
    predict = paddle.layer.fc(
        input=[sparse_fm, fc2],
        size=1,
        act=paddle.activation.Sigmoid())
    if not infer:
        label = paddle.layer.data(
            name="label", type=paddle.data_type.dense_vector(1))
        cost = paddle.layer.multi_binary_label_cross_entropy_cost(
            input=predict, label=label)
        paddle.evaluator.classification_error(
            name="classification_error", input=predict, label=label)
        paddle.evaluator.auc(name="auc", input=predict, label=label)
        return cost
    else:
        return predict
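The train script from the MPI job is not included above; for reference, the reader, the feeding dict, and DeepFM would be wired together with the paddle.v2 trainer API roughly as in the sketch below. File names, paths, and hyperparameters are placeholders and assumptions, not the values actually used in the job.

# Sketch of the training driver (placeholder file names, paths and
# hyperparameters; not the actual train.py used in the MPI job).
import cPickle

import paddle.v2 as paddle

import reader as reader_module   # assumption: the reader code above is reader.py
from network_conf import DeepFM  # assumption: the network code above is network_conf.py

dict_path = "./dict"             # placeholder
train_path = "./train_data"      # placeholder

paddle.init(use_gpu=False, trainer_count=1)

with open(dict_path + "/slot_feature_dict.pkl") as f:
    slot_feature_dict = cPickle.load(f)
dicts = {'slot_feature_dict': slot_feature_dict}

sparse_feature_dim = len(slot_feature_dict)  # assumption: dict size == input dim
factor_size = 10                             # placeholder

cost = DeepFM(factor_size, sparse_feature_dim)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
trainer = paddle.trainer.SGD(
    cost=cost, parameters=parameters, update_equation=optimizer)


def event_handler(event):
    # print the running cost every 100 batches
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "pass %d, batch %d, cost %f" % (
                event.pass_id, event.batch_id, event.cost)


dataset = reader_module.Dataset()
trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(
            dataset.train(dicts, train_path), buf_size=1000),
        batch_size=100),
    feeding=reader_module.feeding,
    event_handler=event_handler,
    num_passes=10)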