Created by: zhengya01
数据:6类数据
环境:gpu cuda9 cudnn7 FLAGS_fraction_of_gpu_memory_to_use=0.02
命令:python train.py --batch_size=50 --total_videos=585 --class_dim=6 --num_epochs=2 --image_shape=3,224,224 --model_save_dir=output/ --with_mem_opt=True --lr_init=0.01 --num_layers=50 --seg_num=7
报错:
terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
what(): unhandled cuda error at [/paddle/paddle/fluid/platform/nccl_helper.h:67]
PaddlePaddle Call Stacks:
0 0x7f89f49d21b5p void paddle::platform::EnforceNotMet::Init<char const*>(char const*, char const*, int) + 357
1 0x7f89f49d2539p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 137
2 0x7f89f4b138d8p paddle::platform::NCCLGroupGuard::~NCCLGroupGuard() + 328
3 0x7f89f62bb926p
4 0x7f89f630034dp
5 0x7f89f630034dp
6 0x7f89f630034dp
7 0x7f89f630034dp
8 0x7f89f62ffc95p paddle::framework::details::OpHandleBase::RunAndRecordEvent(std::function<void ()> const&) + 805
9 0x7f89f62bcd08p paddle::framework::details::AllReduceOpHandle::RunImpl() + 2056
10 0x7f89f6300bb6p paddle::framework::details::OpHandleBase::Run(bool) + 118
11 0x7f89f6298fbdp
12 0x7f89f5655be3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&) + 35
13 0x7f89f5618f07p std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&) + 39
14 0x7f8a2bfa8be0p pthread_once + 80
15 0x7f89f6297ca2p
16 0x7f89f561a334p ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const + 404
17 0x7f8a11cf0678p
18 0x7f8a2bfa3df3p
19 0x7f8a2b5c82cdp clone + 109
*** Aborted at 1552898901 (unix time) try "date -d @1552898901" if you are using GNU date ***
PC: @ 0x0 (unknown)
*** SIGABRT (@0x1f4000058ac) received by PID 22700 (TID 0x7f8869df2700) from PID 22700; stack trace: ***
@ 0x7f8a2bfab130 (unknown)
@ 0x7f8a2b5079d9 __GI_raise
@ 0x7f8a2b5090e8 __GI_abort
@ 0x7f8a11cd63df __gnu_cxx::__verbose_terminate_handler()
@ 0x7f8a11cd4b16 __cxxabiv1::__terminate()
@ 0x7f8a11cd3f91 __cxa_call_terminate
@ 0x7f8a11cd479d __gxx_personality_v0
@ 0x7f8a20a93f56 _Unwind_RaiseException_Phase2
@ 0x7f8a20a94244 _Unwind_RaiseException
@ 0x7f8a11cd4d1b __cxa_throw
@ 0x7f89f4b138f6 paddle::platform::NCCLGroupGuard::~NCCLGroupGuard()
@ 0x7f89f62bb926 _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details17AllReduceOpHandle7RunImplEvEUlvE0_E9_M_invokeERKSt9_Any_data
@ 0x7f89f630034d _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details12OpHandleBase17RunAndRecordEventERKSt8functionIS0_EEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x7f89f630034d _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details12OpHandleBase17RunAndRecordEventERKSt8functionIS0_EEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x7f89f630034d _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details12OpHandleBase17RunAndRecordEventERKSt8functionIS0_EEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x7f89f630034d _ZNSt17_Function_handlerIFvvEZN6paddle9framework7details12OpHandleBase17RunAndRecordEventERKSt8functionIS0_EEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x7f89f62ffc95 paddle::framework::details::OpHandleBase::RunAndRecordEvent()
@ 0x7f89f62bcd08 paddle::framework::details::AllReduceOpHandle::RunImpl()
@ 0x7f89f6300bb6 paddle::framework::details::OpHandleBase::Run()
@ 0x7f89f6298fbd _ZZN6paddle9framework7details24ThreadedSSAGraphExecutor5RunOpERKSt10shared_ptrINS0_13BlockingQueueIPNS1_13VarHandleBaseEEEEPNS1_12OpHandleBaseEENKUlvE_clEv
@ 0x7f89f5655be3 std::_Function_handler<>::_M_invoke()
@ 0x7f89f5618f07 std::__future_base::_State_base::_M_do_set()
@ 0x7f8a2bfa8be0 __GI___pthread_once
@ 0x7f89f6297ca2 _ZNSt13__future_base11_Task_stateISt5_BindIFZN6paddle9framework7details24ThreadedSSAGraphExecutor5RunOpERKSt10shared_ptrINS3_13BlockingQueueIPNS4_13VarHandleBaseEEEEPNS4_12OpHandleBaseEEUlvE_vEESaIiEFvvEE6_M_runEv
@ 0x7f89f561a334 _ZZN10ThreadPoolC1EmENKUlvE_clEv
@ 0x7f8a11cf0678 execute_native_thread_routine_compat
@ 0x7f8a2bfa3df3 start_thread
@ 0x7f8a2b5c82cd __clone
@ 0x0 (unknown)