Paddle Fluid 的 MPI 集群训练总是在临近结束的那个 batch 之后出现 GetRPC 错误，已稳定复现 6 次，每次都是如此
Created by: CruiseSun
Traceback (most recent call last): File "train.py", line 169, in <module> main(use_cuda, is_local) File "train.py", line 161, in main train(use_cuda=use_cuda, save_dirname=save_dirname, is_local=is_local) File "train.py", line 146, in train train_loop(t.get_trainer_program()) File "train.py", line 84, in train_loop fetch_list=[avg_cost, auc_var, batch_auc_var]) File "/home/disk1/normandy/maybach/app-user-20181217210545-9581/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 470, in run self.executor.run(program.desc, scope, 0, True, True) paddle.fluid.core.EnforceNotMet: internal error in RPCClient at [/paddle/paddle/fluid/operators/fetch_barrier_op.cc:43] PaddlePaddle Call Stacks: 0 0x7f6b9e15bc26p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486 1 0x7f6b9eb4dd28p paddle::operators::FetchBarrierOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 888 2 0x7f6b9ede9f58p paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, 
boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 232 3 0x7f6b9e21ecb8p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 216 4 0x7f6b9e21f680p paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 128 5 0x7f6b9e14079dp 6 0x7f6b9e1909a4p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596 7 0x7f6c08fc1010p PyEval_EvalFrameEx + 16384 8 0x7f6c08fc2b80p PyEval_EvalCodeEx + 2128 9 0x7f6c08fc108ep PyEval_EvalFrameEx + 16510 10 0x7f6c08fc2b80p PyEval_EvalCodeEx + 2128 11 0x7f6c08fc108ep PyEval_EvalFrameEx + 16510 12 0x7f6c08fc2b80p PyEval_EvalCodeEx + 2128 13 0x7f6c08fc108ep PyEval_EvalFrameEx + 16510 14 0x7f6c08fc2b80p PyEval_EvalCodeEx + 2128 15 0x7f6c08fc108ep PyEval_EvalFrameEx + 16510 16 0x7f6c08fc2b80p PyEval_EvalCodeEx + 2128 17 0x7f6c08fc2c82p PyEval_EvalCode + 50 18 0x7f6c08fdb60fp 19 0x7f6c08fdc67ep PyRun_FileExFlags + 126 20 0x7f6c08fdd7d7p PyRun_SimpleFileExFlags + 199 21 0x7f6c08fedd9dp Py_Main + 3133 22 0x7f6c08233bd5p __libc_start_main + 245 23 0x4007c1p