多机多卡异步模式下报错:Tensor not initialized
Created by: ccmeteorljh
Paddle 版本:develop(8 月 14 日编译的版本);模型:resnet;启动命令:
python fluid_benchmark.py --model resnet --gpus 8 --batch_size 64 --pass_num 1 --iterations 100
报错如下:
*** Aborted at 1534329424 (unix time) try "date -d @1534329424" if you are using GNU date ***
PC: @ 0x0 (unknown)
*** SIGSEGV (@0xea0) received by PID 700 (TID 0x7f7ee3fff700) from PID 3744; stack trace: ***
@ 0x7f7f9e738500 (unknown)
@ 0x7f7f40f2a117 paddle::framework::Scope::FindVarLocally()
@ 0x7f7f40f2a542 paddle::framework::Scope::FindVarInternal()
Traceback (most recent call last):
File "thirdparty/benchmark/fluid/fluid_benchmark.py", line 363, in <module>
main()
File "thirdparty/benchmark/fluid/fluid_benchmark.py", line 334, in main
train_parallel(*train_args)
File "thirdparty/benchmark/fluid/fluid_benchmark.py", line 268, in train_parallel
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
File "/usr/local/lib/python2.7/site-packages/paddle/fluid/parallel_executor.py", line 272, in run
self.executor.run(fetch_list, fetch_var_name)
paddle.fluid.core.EnforceNotMet: holder_ should not be null
Tensor not initialized yet when Tensor::type() is called. at [/paddle/paddle/fluid/framework/tensor.h:139]
PaddlePaddle Call Stacks:
0 0x7f7f3ff3ce76p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7f7f3ff3f2d7p paddle::framework::Tensor::type() const + 151
2 0x7f7f40f18445p paddle::framework::OperatorWithKernel::IndicateDataType(paddle::framework::ExecutionContext const&) const + 149
3 0x7f7f40f1877fp paddle::framework::OperatorWithKernel::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 47
4 0x7f7f40f18c67p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 199
5 0x7f7f40f15d7cp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 252
6 0x7f7f40d528e7p
7 0x7f7f40d70810p
8 0x7f7f40d70075p paddle::framework::details::OpHandleBase::RunAndRecordEvent(std::function<void ()()> const&) + 805
9 0x7f7f40d523bfp paddle::framework::details::ComputationOpHandle::RunImpl() + 95
10 0x7f7f40d71115p paddle::framework::details::OpHandleBase::Run(bool) + 117
11 0x7f7f40d315dap
12 0x7f7f40d32045p paddle::framework::details::ThreadedSSAGraphExecutor::RunOp(paddle::framework::BlockingQueue<paddle::framework::details::VarHandleBase*>*, paddle::framework::details::OpHandleBase*) + 1109
13 0x7f7f40d33bf8p paddle::framework::details::ThreadedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&) + 2280
14 0x7f7f40d39e67p paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&) + 519
15 0x7f7f4001c70cp paddle::framework::ParallelExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, std::string const&) + 76
16 0x7f7f3ff31c30p
17 0x7f7f3ff4d8d4p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
18 0x7f7f9ea3e631p PyEval_EvalFrameEx + 24497
19 0x7f7f9ea3fbcep PyEval_EvalCodeEx + 2190
20 0x7f7f9ea3e20ap PyEval_EvalFrameEx + 23434
21 0x7f7f9ea3fbcep PyEval_EvalCodeEx + 2190
22 0x7f7f9e9bd7a1p
23 0x7f7f9e98e273p PyObject_Call + 83
24 0x7f7f9ea3c7c5p PyEval_EvalFrameEx + 16709
25 0x7f7f9ea3e560p PyEval_EvalFrameEx + 24288
26 0x7f7f9ea3fbcep PyEval_EvalCodeEx + 2190
27 0x7f7f9ea3fce2p PyEval_EvalCode + 50
28 0x7f7f9ea5f9e0p PyRun_FileExFlags + 176
29 0x7f7f9ea5fbbfp PyRun_SimpleFileExFlags + 239
30 0x7f7f9ea75454p Py_Main + 3188
31 0x7f7f9dd29cddp __libc_start_main + 253
32 0x400649p
@ 0x7f7f40f2a4ae paddle::framework::Scope::FindVar()
@ 0x7f7f40e59942 paddle::operators::distributed::DeserializeFromByteBuffer()
@ 0x7f7f40e4144a paddle::operators::distributed::ProcGetResponse()
@ 0x7f7f40e41f60 paddle::operators::distributed::GRPCClient::Proceed()
@ 0x7f7f937f3470 (unknown)
@ 0x7f7f9e730851 start_thread
@ 0x7f7f9ddf390d clone
@ 0x0 (unknown)
*********************error messages********************