ResNet50混合精度训练test阶段出现core dump问题
Created by: wzzju
1、环境配置
- PaddlePaddle版本:Paddle develop分支代码。
- CPU/GPU(GPU驱动、CUDA、cuDNN版本):Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz; Tesla V100; CUDA 10; CUDNN 7.6.4
- 操作系统版本:Ubuntu 14.04
- Python版本:Python 3.7
- 显存信息:16 GB
2、BUG复现步骤(必要时给出截图):
CUDA_VISIBLE_DEVICES=0 python -m paddle.distributed.launch \
--selected_gpus=0 \
--use_paddlecloud \
--log_dir mylog \
./train_with_fleet.py \
--model=ResNet50 \
--total_images=1281167 \
--data_dir=./ImageNet \
--class_dim=1000 \
--image_shape=3,224,224 \
--model_save_dir=output/ \
--with_mem_opt=False \
--lr_strategy=piecewise_decay \
--l2_decay=1e-4 \
--num_threads=3 \
--do_test=True \
--use_hierarchical_allreduce=0 \
--fuse=True \
--batch_size=256 \
--fp16=True \
--num_epochs=2 \
--scale_loss=128.0 \
--lr=0.1 \
--use_dali=True \
--data_format=NHWC
3、期望结果:
- ResNet50混合精度训练可正常进行train和test过程。
4、实际结果(必要时给出截图,报错信息、日志/代码关键片段):
- 运行到test阶段的最后一两个batch时会出现core dumped错误。
- log如下:
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2 cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const::{lambda(void*)#1}::operator()({lambda()#1}) const
3 void paddle::platform::CudnnWorkspaceHandle::RunFunc<cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const::{lambda(void*)#1}&>(cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const::{lambda(void*)#1}&, unsigned long)
4 void paddle::platform::CudnnWorkspaceHandle::RunFuncSync<cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const::{lambda(void*)#1}&>(cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const::{lambda(void*)#1}&, unsigned long)
5 cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}::operator()() const
6 std::_Function_handler<cudnnConvolutionFwdAlgo_t (), cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)::{lambda()#1}>::_M_invoke(std::_Any_data const&)
7 std::function<cudnnConvolutionFwdAlgo_t ()>::operator()() const
8 paddle::framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>::GetAlgorithm(std::vector<long, std::allocator<long> > const&, std::vector<long, std::allocator<long> > const&, std::vector<int, std::allocator<int> > const&, std::vector<int, std::allocator<int> > const&, std::vector<int, std::allocator<int> > const&, int, long, std::function<cudnnConvolutionFwdAlgo_t ()>)
9 cudnnConvolutionFwdAlgo_t paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>::Find<float>(paddle::operators::ConvArgs const&, bool, bool, paddle::framework::ExecutionContext const&)
10 paddle::operators::CUDNNConvOpKernel<float>::Compute(paddle::framework::ExecutionContext const&) const
11 paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<double>, paddle::operators::CUDNNConvOpKernel<paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}::operator()(paddle::framework::ExecutionContext const&) const
12 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<double>, paddle::operators::CUDNNConvOpKernel<paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
13 std::function<void (paddle::framework::ExecutionContext const&)>::operator()(paddle::framework::ExecutionContext const&) const
14 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
15 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
16 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
17 paddle::framework::details::ComputationOpHandle::RunImpl()
18 paddle::framework::details::OpHandleBase::Run(bool)
19 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
20 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::shared_ptr<paddle::framework::BlockingQueue<unsigned long> > const&, unsigned long*)
21 std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>::operator()() const
22 std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)
23 void std::_Mem_fn_base<void (std::__future_base::_State_baseV2::*)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*), true>::operator()<std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*, void>(std::__future_base::_State_baseV2*, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*&&, bool*&&) const
24 void std::_Bind_simple<std::_Mem_fn<void (std::__future_base::_State_baseV2::*)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)> (std::__future_base::_State_baseV2*, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)>::_M_invoke<0ul, 1ul, 2ul>(std::_Index_tuple<0ul, 1ul, 2ul>)
25 std::_Bind_simple<std::_Mem_fn<void (std::__future_base::_State_baseV2::*)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)> (std::__future_base::_State_baseV2*, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)>::operator()()
26 void std::__once_call_impl<std::_Bind_simple<std::_Mem_fn<void (std::__future_base::_State_baseV2::*)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)> (std::__future_base::_State_baseV2*, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)> >()
27 void std::call_once<void (std::__future_base::_State_baseV2::*)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*), std::__future_base::_State_baseV2*, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*>(std::once_flag&, void (std::__future_base::_State_baseV2::*&&)(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*), std::__future_base::_State_baseV2*&&, std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*&&, bool*&&)
28 std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool)
29 std::packaged_task<void ()>::operator()()
30 std::function<void ()>::operator()() const
31 ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const
32 void std::_Bind_simple<ThreadPool::ThreadPool(unsigned long)::{lambda()#1} ()>::_M_invoke<>(std::_Index_tuple<>)
33 std::_Bind_simple<ThreadPool::ThreadPool(unsigned long)::{lambda()#1} ()>::operator()()
34 std::thread::_Impl<std::_Bind_simple<ThreadPool::ThreadPool(unsigned long)::{lambda()#1} ()> >::_M_run()
----------------------
Error Message Summary:
----------------------
Error: An error occurred here. There is no accurate error hint for this error yet. We are continuously in the process of increasing hint for this kind of error check. It would be helpful if you could inform us of how this conversion went by opening a github issue. And we will resolve it with high priority.
- New issue link: https://github.com/PaddlePaddle/Paddle/issues/new
- Recommended issue content: all error stack information
[Hint: CUDNN_STATUS_INTERNAL_ERROR] at (/work/Develop/sync_work/Paddle/paddle/fluid/operators/conv_cudnn_helper.h:218)
[operator < conv2d > error]
terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
what():
----------------------
Error Message Summary:
----------------------
Error: An error occurred here. There is no accurate error hint for this error yet. We are continuously in the process of increasing hint for this kind of error check. It would be helpful if you could inform us of how this conversion went by opening a github issue. And we will resolve it with high priority.
- New issue link: https://github.com/PaddlePaddle/Paddle/issues/new
- Recommended issue content: all error stack information: an illegal memory access was encountered at (/work/Develop/sync_work/Paddle/paddle/fluid/framework/details/op_handle_base.cc:39)
W0422 03:44:42.297633 62033 init.cc:216] Warning: PaddlePaddle catches a failure signal, it may not work properly
W0422 03:44:42.297662 62033 init.cc:218] You could check whether you killed PaddlePaddle thread/process accidentally or report the case to PaddlePaddle
W0422 03:44:42.297667 62033 init.cc:221] The detail failure signal is:
W0422 03:44:42.297673 62033 init.cc:224] *** Aborted at 1587527082 (unix time) try "date -d @1587527082" if you are using GNU date ***
W0422 03:44:42.302790 62033 init.cc:224] PC: @ 0x0 (unknown)
W0422 03:44:42.302935 62033 init.cc:224] *** SIGABRT (@0xf251) received by PID 62033 (TID 0x7fe586745700) from PID 62033; stack trace: ***
W0422 03:44:42.307202 62033 init.cc:224] @ 0x7fe585dce390 (unknown)
W0422 03:44:42.310827 62033 init.cc:224] @ 0x7fe585a28428 gsignal
W0422 03:44:42.314337 62033 init.cc:224] @ 0x7fe585a2a02a abort
W0422 03:44:42.317575 62033 init.cc:224] @ 0x7fe578d6884d __gnu_cxx::__verbose_terminate_handler()
W0422 03:44:42.320775 62033 init.cc:224] @ 0x7fe578d666b6 (unknown)
W0422 03:44:42.323982 62033 init.cc:224] @ 0x7fe578d656a9 (unknown)
W0422 03:44:42.327113 62033 init.cc:224] @ 0x7fe578d66005 __gxx_personality_v0
W0422 03:44:42.330179 62033 init.cc:224] @ 0x7fe579288f83 (unknown)
W0422 03:44:42.333235 62033 init.cc:224] @ 0x7fe579289487 _Unwind_Resume
W0422 03:44:42.351012 62033 init.cc:224] @ 0x7fe559d2eddf paddle::framework::details::OpHandleBase::~OpHandleBase()
W0422 03:44:42.364976 62033 init.cc:224] @ 0x7fe559cff21e paddle::framework::details::ComputationOpHandle::~ComputationOpHandle()
W0422 03:44:42.381896 62033 init.cc:224] @ 0x7fe559cff24e paddle::framework::details::ComputationOpHandle::~ComputationOpHandle()
W0422 03:44:42.402763 62033 init.cc:224] @ 0x7fe559c6709b _ZZN6paddle9framework2ir4Node9WrappedByINS0_7details12OpHandleBaseEEEvPT_ENKUlvE_clEv
W0422 03:44:42.428308 62033 init.cc:224] @ 0x7fe559c6791b _ZNSt17_Function_handlerIFvvEZN6paddle9framework2ir4Node9WrappedByINS2_7details12OpHandleBaseEEEvPT_EUlvE_E9_M_invokeERKSt9_Any_data
W0422 03:44:42.444703 62033 init.cc:224] @ 0x7fe5548f658a std::function<>::operator()()
W0422 03:44:42.467017 62033 init.cc:224] @ 0x7fe554e35c78 paddle::framework::ir::Node::~Node()
W0422 03:44:42.488453 62033 init.cc:224] @ 0x7fe554e35d36 paddle::framework::ir::Node::~Node()
W0422 03:44:42.508652 62033 init.cc:224] @ 0x7fe554e42216 std::default_delete<>::operator()()
W0422 03:44:42.521795 62033 init.cc:224] @ 0x7fe554e38361 std::unique_ptr<>::~unique_ptr()
W0422 03:44:42.536729 62033 init.cc:224] @ 0x7fe554e646a6 std::pair<>::~pair()
W0422 03:44:42.558369 62033 init.cc:224] @ 0x7fe554e646c6 __gnu_cxx::new_allocator<>::destroy<>()
W0422 03:44:42.585759 62033 init.cc:224] @ 0x7fe554e5c80d std::allocator_traits<>::destroy<>()
W0422 03:44:42.601658 62033 init.cc:224] @ 0x7fe554e53599 std::_Rb_tree<>::_M_destroy_node()
W0422 03:44:42.618242 62033 init.cc:224] @ 0x7fe554e47643 std::_Rb_tree<>::_M_drop_node()
W0422 03:44:42.632325 62033 init.cc:224] @ 0x7fe554e41a0e std::_Rb_tree<>::_M_erase()
W0422 03:44:42.645171 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.658629 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.672036 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.685377 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.698604 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.711668 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()
W0422 03:44:42.724615 62033 init.cc:224] @ 0x7fe554e419eb std::_Rb_tree<>::_M_erase()