多线程时,同一输入做两个embedding程序会崩溃
Created by: jshower
使用如下网络结构进行训练时,在多线程时会出现训练崩溃的问题(单线程不会),在末尾处贴了错误的日志。
import sys
import os
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
def ner_net(word_dict_len, label_dict_len, parallel):
    """Build a bidirectional-GRU + CRF sequence-labeling (NER) network.

    Args:
        word_dict_len (int): vocabulary size for the ``word`` input.
        label_dict_len (int): number of target labels (CRF emission width).
        parallel (bool): if True, build the net inside a ParallelDo block so
            each device trains on a slice of the batch.

    Returns:
        tuple: ``(avg_cost, emission, word, mention, target)`` — the mean CRF
        cost, the emission scores, and the three input layer variables.
    """
    IS_SPARSE = True
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _net_conf(word, mention, target):
        # FIX(review): this parameter was originally named `mark` while the
        # body read `mention` from the enclosing scope. Under the `parallel`
        # branch that silently ignored the per-device slice produced by
        # pd.read_input(mention). Renaming the parameter to `mention` makes
        # the body use the value actually passed in.
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        # NOTE(review): a second embedding over the SAME inputs is what
        # triggers the reported ParallelDo crash ("Sum operator should have
        # at least one tensor") — see the log at the end of this report.
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
        # Forward GRU branch.
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        # Backward (reversed) GRU branch.
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector_r,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)
    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)
    return avg_cost, emission, word, mention, target
将上述程序稍作调整,去掉两个不同的embedding,则不会出错。想请教一下是什么原因,感觉这是一个对用户不友好的地方,因为对同一个输入做多个不同的embedding实际中也是存在的。
import sys
import os
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
def ner_net(word_dict_len, label_dict_len, parallel):
    """Variant of the NER network with the duplicate (second) embeddings
    over the same inputs removed; per the report this version does not
    crash under multi-device ParallelDo training.

    Args:
        word_dict_len (int): vocabulary size for the ``word`` input.
        label_dict_len (int): number of target labels (CRF emission width).
        parallel (bool): if True, build the net inside a ParallelDo block.

    Returns:
        tuple: ``(avg_cost, emission, word, mention, target)``.
    """
    IS_SPARSE = True
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _net_conf(word, mention, target):
        # FIX(review): parameter renamed from `mark` to `mention` — the body
        # previously captured the enclosing-scope `mention`, bypassing the
        # per-device slice from pd.read_input under the `parallel` branch.
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        # The second pair of embeddings over the same inputs was removed
        # (the original disabled it with bare triple-quoted string literals,
        # which are evaluated expression statements, not comments). Removing
        # this duplication is the change that avoids the ParallelDo crash:
        #
        # word_embedding_r = fluid.layers.embedding(
        #     input=word, size=[word_dict_len, word_dim], dtype='float32',
        #     is_sparse=IS_SPARSE,
        #     param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))
        # mention_embedding_r = fluid.layers.embedding(
        #     input=mention, size=[mention_dict_len, mention_dim],
        #     dtype='float32', is_sparse=IS_SPARSE,
        #     param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        # Both GRU branches now consume the single concatenated vector.
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)
    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)
    return avg_cost, emission, word, mention, target
F0319 08:04:48.310140 3321 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception.
The default exception handler is LOG(FATAL).enforce dtype != -1 failed, -1 == -1
Sum operator should have at least one tensor at [/paddle_gpu/Paddle/paddle/fluid/operators/sum_op.cc:73]
PaddlePaddle Call Stacks:
0 0x7f2c50a1c48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7f2c514f6138p paddle::operators::SumOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 1912
2 0x7f2c5161682dp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 381
3 0x7f2c50acd4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
4 0x7f2c50acea5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
5 0x7f2c513f38b3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, 
boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99
6 0x7f2c513f058ep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
7 0x7f2c8a954a99p
8 0x7f2c513f0bd2p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146
9 0x7f2c513f0d46p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86
10 0x7f2c516315d4p paddle::framework::ThreadPool::TaskLoop() + 1012
11 0x7f2c7eab8c80p
12 0x7f2c8a94d6bap
13 0x7f2c8a68341dp clone + 109
*** Check failure stack trace: ***
@ 0x7f2c517decad google::LogMessage::Fail()
@ 0x7f2c517e0ff8 google::LogMessage::SendToLog()
@ 0x7f2c517de7bb google::LogMessage::Flush()
@ 0x7f2c517e1ece google::LogMessageFatal::~LogMessageFatal()
@ 0x7f2c513f1847 std::_Function_handler<>::_M_invoke()
@ 0x7f2c513f058e std::__future_base::_State_baseV2::_M_do_set()
@ 0x7f2c8a954a99 __pthread_once_slow
@ 0x7f2c513f0bd2 std::__future_base::_State_baseV2::_M_set_result()
@ 0x7f2c513f0c91 std::__future_base::_Deferred_state<>::_M_complete_async()
@ 0x7f2c513fa32a paddle::operators::ParallelDoGradOp::RunImpl()
@ 0x7f2c50acd4a5 paddle::framework::Executor::RunPreparedContext()
@ 0x7f2c50acea5f paddle::framework::Executor::Run()
@ 0x7f2c50a38fc3 _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_
@ 0x7f2c50a36d04 pybind11::cpp_function::dispatcher()
@ 0x4c37ed PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c16e7 PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c1e6f PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4eb30f (unknown)
@ 0x4e5422 PyRun_FileExFlags
@ 0x4e3cd6 PyRun_SimpleFileExFlags
@ 0x493ae2 Py_Main
@ 0x7f2c8a59c830 __libc_start_main
@ 0x4933e9 _start
@ (nil) (unknown)
Aborted
如果需要完整的环境来复现问题,可以在Hi上联系我(jiaozhenyu)。