Paddle V2, MPI training aborts with: Aborted at 1537770509 (unix time) try "date -d @1537770509" if you are using GNU date
Created by: feitonlife
I implemented a relevance model following the DSSM twin-tower architecture. The network structure itself is fine: on a small dataset it runs end-to-end, converges, and predicts correctly. But when training on large data over MPI it frequently aborts with: Aborted at 1537770509 (unix time) try "date -d @1537770509" if you are using GNU date
At first I suspected empty samples, but the failing sample is different on every run, so that is largely ruled out. Could some parameter be set unreasonably? PS: training environment is Paddle V2 on the company MPI cluster, lvliang01.
The full error output:
Mon Sep 24 14:28:29 2018[1,0]<stderr>:*** Aborted at 1537770509 (unix time) try "date -d @1537770509" if you are using GNU date ***
Mon Sep 24 14:28:29 2018[1,0]<stderr>:PC: @ 0x0 (unknown)
Mon Sep 24 14:28:29 2018[1,0]<stderr>:*** SIGSEGV (@0x8) received by PID 33946 (TID 0x7f58d53a8700) from PID 8; stack trace: ***
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f59414e7160 (unknown)
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f58a1914162 paddle::ProtoClient::recv()
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f58a36376e4 paddle::ParameterClient2::sendParallel()
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f58a1a1c57c _ZNSt6thread5_ImplISt12_Bind_simpleIFZN6paddle14SyncThreadPool5startEvEUliE_mEEE6_M_runEv
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f5930e958a0 execute_native_thread_routine
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f59414df1c3 start_thread
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x7f5940b0712d __clone
Mon Sep 24 14:28:29 2018[1,0]<stderr>: @ 0x0 (unknown)
Mon Sep 24 14:28:30 2018[1,0]<stderr>:./train.sh: line 231: 33946 Segmentation fault python27-gcc482/bin/python conf/trainer_config.conf
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ '[' 139 -ne 0 ']'
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ kill_pserver2_exit
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ ps aux
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ grep paddle_pserver2
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ grep paddle_cluster_job
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ cut -c10-14
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ grep -v grep
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ xargs kill -9
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ log_fatal 'paddle_trainer failed kill paddle_pserver2 and exit'
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ echo '[./common.sh : 399] [kill_pserver2_exit]'
Mon Sep 24 14:28:30 2018[1,0]<stderr>:[./common.sh : 399] [kill_pserver2_exit]
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ echo '[FATAL]: paddle_trainer failed kill paddle_pserver2 and exit'
Mon Sep 24 14:28:30 2018[1,0]<stderr>:[FATAL]: paddle_trainer failed kill paddle_pserver2 and exit
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ get_stack
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ set +x
Mon Sep 24 14:28:30 2018[1,0]<stderr>:
Mon Sep 24 14:28:30 2018[1,0]<stderr>:*********************Shell Script Stack Trace********************
Mon Sep 24 14:28:30 2018[1,0]<stderr>: @: [./log.sh: 41] log_fatal
Mon Sep 24 14:28:30 2018[1,0]<stderr>: @: [./common.sh: 399] kill_pserver2_exit
Mon Sep 24 14:28:30 2018[1,0]<stderr>: @: [./train.sh: 234] main
Mon Sep 24 14:28:30 2018[1,0]<stderr>:
Mon Sep 24 14:28:30 2018[1,0]<stderr>:+ exit 1
Mon Sep 24 14:29:05 2018[1,0]<stderr>:10.209.42.50 - - [24/Sep/2018 14:29:05] "GET / HTTP/1.0" 200 -
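Since empty samples were the first suspicion, one way to rule them out completely is to drop them in the reader before they ever reach the trainer; empty sequence inputs are a common cause of crashes in Paddle V2. A minimal sketch (the wrapper name and the 8-field sample layout matching the feeding dict at the bottom are assumptions, not part of the original code):

def drop_empty_samples(reader):
    # Fields 0-2 and 4-6 are id sequences fed to integer_value_sequence
    # data layers; field 3 is 'sex' (a scalar) and field 7 is the label.
    # Skip any sample that contains an empty sequence.
    def reader_with_filter():
        for sample in reader():
            seq_fields = sample[0:3] + sample[4:7]
            if all(len(seq) > 0 for seq in seq_fields):
                yield sample
    return reader_with_filter

train_reader = paddle.batch(
    paddle.reader.shuffle(
        drop_empty_samples(dataset.train()), buf_size=2000),
    batch_size=100)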
# Network structure: each sequence feature goes through embedding + CNN, the resulting vectors are concatenated, and a cosine-similarity layer is added on top
def _build_classification_model(self):
    print "build classification model"
    # build source input
    self._category_pref = paddle.layer.data(
        name="category_pref",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['category'])))
    self._sub_category_pref = paddle.layer.data(
        name="sub_category_pref",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['sub_category'])))
    self._attention_pref = paddle.layer.data(
        name="attention_pref",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['attention'])))
    self._sex = paddle.layer.data(
        name="sex",
        type=paddle.data_type.integer_value(len(self.feature_id_dict['sex'])))
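    # sex is a single categorical value (integer_value, not a sequence), so its
    # embedding is concatenated directly later without running a CNN over it.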
    # build target input
    self._target_category = paddle.layer.data(
        name="target_category",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['category'])))
    self._target_sub_category = paddle.layer.data(
        name="target_sub_category",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['sub_category'])))
    self._target_attention = paddle.layer.data(
        name="target_attention",
        type=paddle.data_type.integer_value_sequence(len(self.feature_id_dict['attention'])))
    # label
    self._label = paddle.layer.data(
        name="label",
        type=paddle.data_type.dense_vector(1))
    # embedding
    # build source embedding
    self._category_pref_emb = self.create_embedding(self._category_pref,
                                                    8, 'category')
    self._sub_category_pref_emb = self.create_embedding(self._sub_category_pref,
                                                        16, 'sub_category')
    self._attention_pref_emb = self.create_embedding(self._attention_pref,
                                                     32, 'attention')
    self._sex_emb = self.create_embedding(self._sex,
                                          8, 'sex')
    # build target embedding
    self._target_category_emb = self.create_embedding(
        self._target_category, 8, 'category')
    self._target_sub_category_emb = self.create_embedding(
        self._target_sub_category, 16, 'sub_category')
    self._target_attention_emb = self.create_embedding(
        self._target_attention, 32, 'attention')
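    # Note: the source and target towers pass the same names ('category',
    # 'sub_category', 'attention') to create_embedding, so the embedding
    # tables are shared between the two towers through param_attr.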
    # conv + pooling
    category_pref_vec = self.create_cnn(
        self._category_pref_emb,
        8,
        'category_pref')
    sub_category_pref_vec = self.create_cnn(
        self._sub_category_pref_emb,
        16,
        'sub_category_pref')
    attention_pref_vec = self.create_cnn(
        self._attention_pref_emb,
        32,
        'attention_pref')
    target_category_vec = self.create_cnn(
        self._target_category_emb,
        8,
        'target_category')
    target_sub_category_vec = self.create_cnn(
        self._target_sub_category_emb,
        16,
        'target_sub_category')
    target_attention_vec = self.create_cnn(
        self._target_attention_emb,
        32,
        'target_attention')
    source_vec = paddle.layer.concat(input=[
        category_pref_vec,
        sub_category_pref_vec,
        attention_pref_vec,
        self._sex_emb,
    ])
    target_vec = paddle.layer.concat(input=[
        target_category_vec,
        target_sub_category_vec,
        target_attention_vec,
    ])
    source_semantic_vec = self.create_fc(source_vec, [128, 64], 'source')
    target_semantic_vec = self.create_fc(target_vec, [128, 64], 'target')
    cos_prediction = paddle.layer.cos_sim(a=source_semantic_vec, b=target_semantic_vec)
    # cos_sim outputs values in [-1, 1]. The activation of the fc layers feeding
    # cos_sim used to be relu, which suppressed negative components; after
    # switching to tanh, the raw cos_sim score is not suitable for computing
    # AUC, so an extra slope_intercept layer rescales it.
    prediction = paddle.layer.slope_intercept(input=cos_prediction, name='auc_layer', slope=0.5, intercept=0.5)
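    # slope_intercept computes y = slope * x + intercept, i.e. y = 0.5 * x + 0.5
    # here, mapping [-1, 1] linearly onto [0, 1] as the AUC evaluator expects.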
    cost = paddle.layer.square_error_cost(input=prediction, label=self._label)
    if not self.is_infer:
        return cost, prediction, self._label
def create_embedding(self, input, size, name):
    """
    Create an embedding table. `name` is used as the prefix of the
    embedding's learnable parameter.
    """
    print "Create embedding table [%s] whose dimension is %d." % (name, size)
    emb = paddle.layer.embedding(
        input=input,
        size=size,
        param_attr=ParamAttr(name="%s_emb.w" % (name)))
    return emb
def create_cnn(self, emb, hidden_size, name):
    """
    A multi-layer CNN over the sequence embedding.
    :param emb: The word embedding.
    :type emb: paddle.layer
    """
    def create_conv(context_len, hidden_size, name):
        key = "%s_%d_%d" % (name, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
            input=emb,
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "_context_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, hidden_size, name)
    conv_4 = create_conv(4, hidden_size, name)
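    # Each sequence_conv_pool output has width hidden_size, so the concat
    # below yields a vector of width 2 * hidden_size.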
    return paddle.layer.concat(input=[conv_3, conv_4])
def create_fc(self, input, dnn_dims, name):
    _input_layer = input
    for idx, dim in enumerate(dnn_dims):
        key = "%s_fc_%d_%d" % (name, idx, dim)
        layer_name = "%s_fc_%d" % (name, idx)
        print "create fc layer [%s] whose dimension is %d" % (key, dim)
        fc = paddle.layer.fc(name=layer_name,
                             input=_input_layer,
                             size=dim,
                             # act=paddle.activation.Relu(),
                             act=paddle.activation.Tanh(),
                             param_attr=ParamAttr(name="%s.w" % key),
                             bias_attr=ParamAttr(
                                 name="%s.b" % key, initial_std=0.))
        _input_layer = fc
    return _input_layer
Training code:
dataset = reader.Dataset(
    train_path=train_data_path,
    test_path=test_data_path,
    feature_id_dict_path=feature_id_dict_path,
    word_id_dict_path=word_id_dict_path)
train_reader = paddle.batch(
    paddle.reader.shuffle(
        dataset.train(), buf_size=2000),
    batch_size=100)
test_reader = paddle.batch(
    paddle.reader.shuffle(
        dataset.test(), buf_size=2000),
    batch_size=100)
cost, prediction, label = DSSM(
    feature_id_dict_path=feature_id_dict_path,
    word_id_dict_path=word_id_dict_path)()
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
    learning_rate=2e-4,
    regularization=paddle.optimizer.L2Regularization(rate=1e-3),
    model_average=paddle.optimizer.ModelAverage(average_window=0.5))
trainer = paddle.trainer.SGD(
    cost=cost,
    extra_layers=paddle.evaluator.auc(input=prediction, label=label),
    parameters=parameters,
    update_equation=adam_optimizer,
    is_local=False)
feeding = {
    'category_pref': 0,
    'sub_category_pref': 1,
    'attention_pref': 2,
    'sex': 3,
    'target_category': 4,
    'target_sub_category': 5,
    'target_attention': 6,
    # 'label' was missing here; the label data layer also needs a feeding
    # index (assuming the reader yields it as the 8th field of each sample).
    'label': 7,
}
trainer.train(
    reader=train_reader,
    event_handler=_event_handler,
    feeding=feeding,
    num_passes=1)