单机训练正常,MPI 训练出错
Created by: mengyiliu22
MPI 训练的报错信息如下:
Traceback (most recent call last): File "train.py", line 425, in <module> train(args, dataset) File "train.py", line 408, in train train_loop(exe, train_prog, startup_prog, net) File "train.py", line 267, in train_loop loss_val = train_exe.run(fetch_list=[loss.name]) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/parallel_executor.py", line 311, in run return_numpy=return_numpy) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 775, in run six.reraise(*sys.exc_info()) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 770, in run use_program_cache=use_program_cache) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 829, in _run_impl return_numpy=return_numpy) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 669, in _run_parallel tensors = exe.run(fetch_var_names)._move_to_list() paddle.fluid.core_avx.EnforceNotMet:
C++ Call Stacks (More useful to developers):
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int) 1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int) 2 paddle::operators::FetchBarrierOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const 3 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) 4 paddle::framework::details::FetchBarrierOpHandle::RunImpl() 5 paddle::framework::details::OpHandleBase::Run(bool) 6 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*) 7 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, 
std::shared_ptr<paddle::framework::BlockingQueue<unsigned long> > const&, unsigned long*) 8 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&) 9 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&) 10 ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const
Python Call Stacks (More useful to users):
File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/framework.py", line 2459, in append_op attrs=kwargs.get("attrs", None)) File "/home/disk7/task_data/history/20200706/0.app-user-20200706222537-8247--fm_a_a_1_1_202007062223_paddlecloud/logs/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/transpiler/distribute_transpiler.py", line 834, in transpile RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE File "train.py", line 395, in train startup_program=startup_prog) File "train.py", line 425, in <module> train(args, dataset)
Error Message Summary:
PaddleCheckError: Expected rets[i]->Wait() != 0U, but received rets[i]->Wait():0 == 0U:0. internal error in RPCClient at [/paddle/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc:50] [operator < fetch_barrier > error]