Exception raised when running distributed Fluid training with ParallelExecutor
Created by: seiriosPlus
The detailed error is:
Traceback (most recent call last):
File "/models/image_classification/vgg16_pe_gpu.py", line 331, in <module>
main()
File "/models/image_classification/vgg16_pe_gpu.py", line 317, in main
train_loop(train_exe, test_exe)
File "/models/image_classification/vgg16_pe_gpu.py", line 200, in train_loop
fetch_list=[avg_cost.name, batch_acc.name, batch_size.name])
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/parallel_executor.py", line 210, in run
self.executor.run(fetch_list, fetch_var_name)
paddle.fluid.core.EnforceNotMet: enforce x_mat_dims[1] == y_mat_dims[0] failed, 25088 != 512
First matrix's width must be equal with second matrix's height. at [/paddle/paddle/fluid/operators/mul_op.cc:63]
PaddlePaddle Call Stacks:
0 0x7f69a2077c3cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7f69a2a118d6p paddle::operators::MulOp::InferShape(paddle::framework::InferShapeContext*) const + 2678
2 0x7f69a2e37518p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 104
3 0x7f69a2c93b5ep
4 0x7f69a2caa16dp
5 0x7f69a2ca91bdp paddle::framework::details::OpHandleBase::RunAndRecordEvent(std::function<void ()> const&) + 845
6 0x7f69a2c93f40p paddle::framework::details::ComputationOpHandle::RunImpl() + 368
7 0x7f69a2caaa01p paddle::framework::details::OpHandleBase::Run(bool) + 321
8 0x7f69a2ca018cp
9 0x7f69a2ca06e0p
10 0x7f69a2b236fep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
11 0x7f69f4d6ba99p
12 0x7f69a2c9f45dp
13 0x7f69a2ca52d4p std::thread::_Impl<std::_Bind_simple<ThreadPool::ThreadPool(unsigned long)::{lambda()#1} ()> >::_M_run() + 340
14 0x7f69eb5c8c80p
15 0x7f69f4d646bap
16 0x7f69f4a9a41dp clone + 109
+ check_trainer_ret 1
+ ret=1
+ stdbuf -oL echo 'job returned 1...setting pod return message...'
job returned 1...setting pod return message...
+ stdbuf -oL echo ===============================
===============================
+ '[' 1 -eq 136 ']'
+ '[' 1 -eq 139 ']'
+ '[' 1 -eq 1 ']'
+ echo 'General Error'
+ stdbuf -oL echo 'termination log wroted...'
termination log wroted...
+ exit 1
Environment: PaddlePaddle: gpu-latest; OS: Docker, Ubuntu 16.04