生成任务在多卡运行时 fetch 报错:`exe.run(fetch_var_names)._move_to_list()`
Created by: zhengxiao-yu
1)PaddlePaddle 版本:1.6.1 2)运行环境:GPU,多机多卡
运行平台:PaddleCloud 日志链接:http://10.199.229.42:8388/v1/containers/be4b34bea1b8b2b24cec30e954dc4cff43984458e383b32d003776fe5591b1cb/backuplog
报错如下:
/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/executor.py:774: UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
ERROR: 03-02 19:38:17: custom_trainer.py:109 * 140338501658368 traceback.format_exc():Traceback (most recent call last):
File "/root/paddlejob/workspace/env_run/textone/training/custom_trainer.py", line 62, in train_and_eval
metrics_tensor_value = self.run(InstanceName.TRAINING, need_fetch=True)
File "base_trainer.py", line 517, in base_trainer.BaseTrainer.run (/home/work/textone_pro/dabao/textone_pro/training/base_trainer.c:12475)
File "/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/parallel_executor.py", line 311, in run
return_numpy=return_numpy)
File "/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/executor.py", line 775, in run
six.reraise(*sys.exc_info())
File "/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/executor.py", line 770, in run
use_program_cache=use_program_cache)
File "/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/executor.py", line 829, in _run_impl
return_numpy=return_numpy)
File "/opt/_internal/cpython-2.7.11-ucs4/lib/python2.7/site-packages/paddle/fluid/executor.py", line 669, in _run_parallel
tensors = exe.run(fetch_var_names)._move_to_list()
EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
2 paddle::framework::LoDTensor::MergeLoDTensor(std::vector<paddle::framework::LoDTensor const*, std::allocator<paddle::framework::LoDTensor const*> > const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>)
3 paddle::framework::details::FetchOpHandle::WaitAndMergeCPUTensors() const
4 paddle::framework::details::FetchOpHandle::RunImpl()
5 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
6 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::shared_ptr<paddle::framework::BlockingQueue<unsigned long> > const&, unsigned long*)
7 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()(), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&)
8 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()()>&, bool&)
9 _ZZN10ThreadPoolC1EmENKUlvE_clEv
----------------------
Error Message Summary:
----------------------
PaddleCheckError: Expected framework::product(new_dim) / new_dim[0] == framework::product(t->dims()) / t->dims()[0], but received framework::product(new_dim) / new_dim[0]:51 != framework::product(t->dims()) / t->dims()[0]:55.
at [/paddle/paddle/fluid/framework/lod_tensor.cc:363]
Traceback (most recent call last):
File "run_with_json.py", line 91, in <module>
trainer.train_and_eval()
File "/root/paddlejob/workspace/env_run/textone/training/custom_trainer.py", line 111, in train_and_eval
raise e
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
2 paddle::framework::LoDTensor::MergeLoDTensor(std::vector<paddle::framework::LoDTensor const*, std::allocator<paddle::framework::LoDTensor const*> > const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>)
3 paddle::framework::details::FetchOpHandle::WaitAndMergeCPUTensors() const
4 paddle::framework::details::FetchOpHandle::RunImpl()
5 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
6 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::shared_ptr<paddle::framework::BlockingQueue<unsigned long> > const&, unsigned long*)
7 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()(), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&)
8 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()()>&, bool&)
9 _ZZN10ThreadPoolC1EmENKUlvE_clEv
----------------------
Error Message Summary:
----------------------
PaddleCheckError: Expected framework::product(new_dim) / new_dim[0] == framework::product(t->dims()) / t->dims()[0], but received framework::product(new_dim) / new_dim[0]:51 != framework::product(t->dims()) / t->dims()[0]:55.
at [/paddle/paddle/fluid/framework/lod_tensor.cc:363]