训练报错
Created by: Anikily
1)PaddlePaddle版本:1.6.2 3)GPU:V100, CUDA 9.2, cuDNN 7.5 4)系统环境:CentOS, Python 3.6.9
- 训练信息 1)单机多卡 2)显存 本来上午还能训练,吃了个饭啥也没动就报错了:
/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/paddle/fluid/executor.py:779: UserWarning: The following exception is not an EOF exception.
  "The following exception is not an EOF exception.")
Traceback (most recent call last):
  File "train.py", line 494, in <module>
    main(args)
  File "train.py", line 481, in main
    train(cfg)
  File "train.py", line 250, in train
    exe.run(startup_prog)
  File "/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/paddle/fluid/executor.py", line 780, in run
    six.reraise(*sys.exc_info())
  File "/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/six.py", line 696, in reraise
    raise value
  File "/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/paddle/fluid/executor.py", line 775, in run
    use_program_cache=use_program_cache)
  File "/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/paddle/fluid/executor.py", line 822, in _run_impl
    use_program_cache=use_program_cache)
  File "/home/yulu/anaconda3/envs/paddle/lib/python3.6/site-packages/paddle/fluid/executor.py", line 899, in _run_program
    fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet:
C++ Call Stacks (More useful to developers):
0 std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2 paddle::platform::CublasHandleHolder::CublasHandleHolder(CUstream_st*, cublasMath_t)
3 paddle::platform::CUDADeviceContext::CUDADeviceContext(paddle::platform::CUDAPlace)
4 std::_Function_handler<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > (), std::reference_wrapper<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >, paddle::platform::Place)::{lambda()#1} ()> > >::_M_invoke(std::_Any_data const&)
5 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > >::_M_invoke(std::_Any_data const&)
6 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&)
7 std::__future_base::_Deferred_state<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >, paddle::platform::Place)::{lambda()#1} ()>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >::_M_run_deferred()
8 paddle::platform::DeviceContextPool::Get(paddle::platform::Place const&)
9 paddle::framework::GarbageCollector::GarbageCollector(paddle::platform::Place const&, unsigned long)
10 paddle::framework::UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(paddle::platform::CUDAPlace const&, unsigned long)
11 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
12 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string> > const&, bool)
Error Message Summary:
Error: Paddle internal Check failed. (Please help us create a new issue, here we need to find the developer to add a user friendly error message) [CUBLAS: not initialized.] at (/paddle/paddle/fluid/platform/cuda_helper.h:32)