Program crashes when a multi-threaded batch contains a specific number of samples
Created by: jshower
This is a rather strange bug I ran into. When training the model with models/fluid/sequence_tagging_for_ner/train.py on branch jzy2, using the CoNLL-2003 training set: with batch_size set to 200 no error is raised and training runs normally from start to finish, but with batch_size=35 the following error occurs.
F0313 07:55:52.990689 19937 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception.
The default exception handler is LOG(FATAL).enforce numel() > 0 failed, 0 <= 0
When calling this method, the Tensor's numel must be larger than zero. Please check Tensor::Resize has been called first. at [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:123]
PaddlePaddle Call Stacks:
0 0x7feff9cc19acp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7feff9cc7851p paddle::framework::Tensor::mutable_data(boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::type_index) + 1233
2 0x7feff9d9c9a0p paddle::framework::Vector<long>::resize(unsigned long) + 496
3 0x7feffa066b46p paddle::operators::LookupTableGradKernel<float>::Compute(paddle::framework::ExecutionContext const&) const + 1190
4 0x7feffa40fac4p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 1588
5 0x7feffa40d418p paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 72
6 0x7feff9d6395ap paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 1482
7 0x7feffa2b7aa3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99
8 0x7feffa2b480ep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
9 0x7ff045249a99p
10 0x7feffa2b4e52p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146
11 0x7feffa2b4fc6p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86
12 0x7feffa425064p paddle::framework::ThreadPool::TaskLoop() + 1012
13 0x7ff038025c80p
14 0x7ff0452426bap
15 0x7ff044f7841dp clone + 109
*** Check failure stack trace: ***
@ 0x7feffa53bf0d google::LogMessage::Fail()
@ 0x7feffa53e258 google::LogMessage::SendToLog()
@ 0x7feffa53ba1b google::LogMessage::Flush()
@ 0x7feffa53f12e google::LogMessageFatal::~LogMessageFatal()
@ 0x7feffa2b59c7 std::_Function_handler<>::_M_invoke()
@ 0x7feffa2b480e std::__future_base::_State_baseV2::_M_do_set()
@ 0x7ff045249a99 __pthread_once_slow
@ 0x7feffa2b4e52 std::__future_base::_State_baseV2::_M_set_result()
@ 0x7feffa2b4f11 std::__future_base::_Deferred_state<>::_M_complete_async()
@ 0x7feffa2be33a paddle::operators::ParallelDoGradOp::RunImpl()
@ 0x7feffa40d418 paddle::framework::OperatorBase::Run()
@ 0x7feff9d6395a paddle::framework::Executor::Run()
@ 0x7feff9cdf253 _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_
@ 0x7feff9cdbdc4 pybind11::cpp_function::dispatcher()
@ 0x4c37ed PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c16e7 PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c1e6f PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4eb30f (unknown)
@ 0x4e5422 PyRun_FileExFlags
@ 0x4e3cd6 PyRun_SimpleFileExFlags
@ 0x493ae2 Py_Main
@ 0x7ff044e91830 __libc_start_main
@ 0x4933e9 _start
@ (nil) (unknown)
Aborted
In fact, BATCH_SIZE=35 is not the only setting that triggers the error: BATCH_SIZE=50 fails as well, because the number of training samples modulo 50 is 35, so the last batch ends up containing 35 samples. BATCH_SIZE=34 also fails. Since the samples are shuffled before use, the crash cannot be caused by any particular sample. Judging from the error above, something is going wrong inside the thread pool; I'd ask the relevant developers to track down the root cause. When investigating, the problem can also be reproduced by concatenating several copies of the original data/train file into one larger training set; a small helper sketch for this follows.
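The snippet below is a hypothetical helper, not part of the repo: it assumes data/train is a CoNLL-style file with sentences separated by blank lines, counts the sentences to show the size of the final batch for each BATCH_SIZE mentioned above, and concatenates several copies of data/train into a larger file for reproduction. The paths, the sentence-counting logic, and the data/train_big output name are my assumptions, not the script's actual behavior.

```python
# Sketch for triaging the batch-size dependence (assumptions: CoNLL-style
# data/train with blank lines between sentences; adjust paths as needed).

def count_sentences(path):
    # Count sentences in a CoNLL-style file (blank-line separated).
    n, in_sentence = 0, False
    with open(path) as f:
        for line in f:
            if line.strip():
                in_sentence = True
            elif in_sentence:
                n += 1
                in_sentence = False
    if in_sentence:
        n += 1
    return n


num_samples = count_sentences("data/train")
for batch_size in (200, 50, 35, 34):
    # Size of the last batch; a remainder of 0 means the last batch is full.
    last = num_samples % batch_size or batch_size
    print("batch_size=%d -> last batch holds %d samples" % (batch_size, last))

# Build an enlarged training set by concatenating data/train five times,
# which also reproduces the crash according to the report above.
with open("data/train_big", "w") as out:
    for _ in range(5):
        with open("data/train") as f:
            out.write(f.read())
```

Running the loop should confirm which batch sizes leave a final batch of 34 or 35 samples for the given training set.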