paddlecloud fluid-1.5.0 单机训练没有问题,多机训练报错
Created by: ShadowSkyLiu
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92497c8782 _ZN6paddle6memory6detail14BuddyAllocator12SplitToAllocESt23_Rb_tree_const_iteratorISt5tupleIJmmPvEEEm
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370c02782 _ZN6paddle6memory6detail14BuddyAllocator12SplitToAllocESt23_Rb_tree_const_iteratorISt5tupleIJmmPvEEEm
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f807c7c paddle::memory::detail::MemoryBlock::split()
Fri Oct 19 17:43:44 2018[1,8]<stdout>:*** Aborted at 1539942224 (unix time) try "date -d @1539942224" if you are using GNU date ***
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92497c8c85 paddle::memory::detail::BuddyAllocator::Alloc()
Fri Oct 19 17:43:44 2018[1,34]<stdout>: @ 0x7fd726dba229 paddle::memory::detail::MetadataCache::load()
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370c02c85 paddle::memory::detail::BuddyAllocator::Alloc()
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f806782 _ZN6paddle6memory6detail14BuddyAllocator12SplitToAllocESt23_Rb_tree_const_iteratorISt5tupleIJmmPvEEEm
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92497c2f85 paddle::memory::Alloc<>()
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370bfcf85 paddle::memory::Alloc<>()
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f806c85 paddle::memory::detail::BuddyAllocator::Alloc()
Fri Oct 19 17:43:44 2018[1,8]<stdout>:PC: @ 0x0 (unknown)
Fri Oct 19 17:43:44 2018[1,34]<stdout>: @ 0x7fd726db9c7c paddle::memory::detail::MemoryBlock::split()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92497bd6f1 paddle::framework::Tensor::mutable_data()
Fri Oct 19 17:43:44 2018[1,34]<stdout>: @ 0x7fd726db8782 _ZN6paddle6memory6detail14BuddyAllocator12SplitToAllocESt23_Rb_tree_const_iteratorISt5tupleIJmmPvEEEm
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370bf76f1 paddle::framework::Tensor::mutable_data()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f9248dcbe11 paddle::framework::Tensor::mutable_data<>()
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f800f85 paddle::memory::Alloc<>()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f924959f1cc paddle::operators::SumKernel<>::Compute()
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370ac1a09 paddle::operators::math::scatter::MergeAdd<>::operator()()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92495a0143 _ZNSt17_Function_handlerIFvRKN6paddle9framework16ExecutionContextEEZNKS1_24OpKernelRegistrarFunctorINS0_8platform8CPUPlaceELb0ELm0EINS0_9operators9SumKernelINS7_16CPUDeviceContextEfEENSA_ISB_dEENSA_ISB_iEENSA_ISB_lEEEEclEPKcSI_EUlS4_E_E9_M_invokeERKSt9_Any_dataS4_
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f7fb6f1 paddle::framework::Tensor::mutable_data()
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff3709c2a9c paddle::operators::SparseAdagradFunctor<>::operator()()
Fri Oct 19 17:43:44 2018[1,4]<stdout>:E1019 17:43:44.255692 203552 listen_and_serv_op.cc:69] run sub program error Enforce failed. Expected param_dims == ctx->GetInputDim("Grad"), but received param_dims:97067, 64 != ctx->GetInputDim("Grad"):97067.
Fri Oct 19 17:43:44 2018[1,4]<stdout>:Param and Grad input of AdagradOp should have the same dimension. at [/paddle/paddle/fluid/operators/adagrad_op.cc:52]
Fri Oct 19 17:43:44 2018[1,4]<stdout>:PaddlePaddle Call Stacks:
Fri Oct 19 17:43:44 2018[1,4]<stdout>:0 0x7f9248dc0da6p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
Fri Oct 19 17:43:44 2018[1,4]<stdout>:1 0x7f9249585f66p paddle::operators::AdagradOp::InferShape(paddle::framework::InferShapeContext*) const + 2614
Fri Oct 19 17:43:44 2018[1,4]<stdout>:2 0x7f924976cee9p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 73
Fri Oct 19 17:43:44 2018[1,4]<stdout>:3 0x7f92497694ffp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 255
Fri Oct 19 17:43:44 2018[1,4]<stdout>:4 0x7f9248e7eaa9p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 393
Fri Oct 19 17:43:44 2018[1,4]<stdout>:5 0x7f92495c7b62p
Fri Oct 19 17:43:44 2018[1,4]<stdout>:6 0x7f92494e55cap std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 42
Fri Oct 19 17:43:44 2018[1,4]<stdout>:7 0x7f9248ef2577p std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&) + 39
Fri Oct 19 17:43:44 2018[1,4]<stdout>:8 0x7f92eb018973p pthread_once + 83
Fri Oct 19 17:43:44 2018[1,4]<stdout>:9 0x7f92495c6fb2p
Fri Oct 19 17:43:44 2018[1,4]<stdout>:10 0x7f924977ff18p paddle::framework::ThreadPool::TaskLoop() + 920
Fri Oct 19 17:43:44 2018[1,4]<stdout>:11 0x7f925484c8a0p
Fri Oct 19 17:43:44 2018[1,4]<stdout>:12 0x7f92eb0131c3p
Fri Oct 19 17:43:44 2018[1,4]<stdout>:13 0x7f92ea63b12dp clone + 109
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f924976d0ab paddle::framework::OperatorWithKernel::RunImpl()
Fri Oct 19 17:43:44 2018[1,26]<stdout>:*** SIGSEGV (@0x0) received by PID 87097 (TID 0x7fb0dbe0e700) from PID 0; stack trace: ***
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5ee09e11 paddle::framework::Tensor::mutable_data<>()
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff3709c4414 paddle::operators::AdagradOpKernel<>::Compute()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92497694ff paddle::framework::OperatorBase::Run()
Fri Oct 19 17:43:44 2018[1,6]<stdout>:E1019 17:43:44.246381 52620 listen_and_serv_op.cc:69] run sub program error Enforce failed. Expected param_dims == ctx->GetInputDim("Grad"), but received param_dims:13922, 64 != ctx->GetInputDim("Grad"):13922.
Fri Oct 19 17:43:44 2018[1,6]<stdout>:Param and Grad input of AdagradOp should have the same dimension. at [/paddle/paddle/fluid/operators/adagrad_op.cc:52]
Fri Oct 19 17:43:44 2018[1,6]<stdout>:PaddlePaddle Call Stacks:
Fri Oct 19 17:43:44 2018[1,6]<stdout>:0 0x7f7f5edfeda6p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
Fri Oct 19 17:43:44 2018[1,6]<stdout>:1 0x7f7f5f5c3f66p paddle::operators::AdagradOp::InferShape(paddle::framework::InferShapeContext*) const + 2614
Fri Oct 19 17:43:44 2018[1,6]<stdout>:2 0x7f7f5f7aaee9p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 73
Fri Oct 19 17:43:44 2018[1,6]<stdout>:3 0x7f7f5f7a74ffp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 255
Fri Oct 19 17:43:44 2018[1,6]<stdout>:4 0x7f7f5eebcaa9p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 393
Fri Oct 19 17:43:44 2018[1,6]<stdout>:5 0x7f7f5f605b62p
Fri Oct 19 17:43:44 2018[1,6]<stdout>:6 0x7f7f5f5235cap std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 42
Fri Oct 19 17:43:44 2018[1,6]<stdout>:7 0x7f7f5ef30577p std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&) + 39
Fri Oct 19 17:43:44 2018[1,6]<stdout>:8 0x7f7fcba86973p pthread_once + 83
Fri Oct 19 17:43:44 2018[1,6]<stdout>:9 0x7f7f5f604fb2p
Fri Oct 19 17:43:44 2018[1,6]<stdout>:10 0x7f7f5f7bdf18p paddle::framework::ThreadPool::TaskLoop() + 920
Fri Oct 19 17:43:44 2018[1,6]<stdout>:11 0x7f7f352ba8a0p
Fri Oct 19 17:43:44 2018[1,6]<stdout>:12 0x7f7fcba811c3p
Fri Oct 19 17:43:44 2018[1,6]<stdout>:13 0x7f7fcb0a912dp clone + 109
Fri Oct 19 17:43:44 2018[1,26]<stdout>: @ 0x7fb135deb160 (unknown)
Fri Oct 19 17:43:44 2018[1,32]<stdout>:*** SIGSEGV (@0x0) received by PID 56154 (TID 0x7fb1dc7e7700) from PID 0; stack trace: ***
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff3709c5003 _ZNSt17_Function_handlerIFvRKN6paddle9framework16ExecutionContextEEZNKS1_24OpKernelRegistrarFunctorINS0_8platform8CPUPlaceELb0ELm0EINS0_9operators15AdagradOpKernelINS7_16CPUDeviceContextEfEENSA_ISB_dEEEEclEPKcSG_EUlS4_E_E9_M_invokeERKSt9_Any_dataS4_
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f9248e7eaa9 paddle::framework::Executor::RunPreparedContext()
Fri Oct 19 17:43:44 2018[1,6]<stdout>: @ 0x7f7f5f5dd1cc paddle::operators::SumKernel<>::Compute()
Fri Oct 19 17:43:44 2018[1,26]<stdout>: @ 0x7fb08d79a229 paddle::memory::detail::MetadataCache::load()
Fri Oct 19 17:43:44 2018[1,4]<stdout>: @ 0x7f92495c7b62 _ZNSt17_Function_handlerIFSt10unique_ptrIN6paddle8platform13EnforceNotMetESt14default_deleteIS3_EEvESt17reference_wrapperISt12_Bind_simpleIFS8_IZNS1_9framework10ThreadPool18RunAndGetExceptionIZNS1_9operatorsL21ParallelExecuteBlocksERKSt6vectorImSaImEEPNSA_8ExecutorERKSE_ISt10shared_ptrINSA_22ExecutorPrepareContextEESaISN_EEPNSA_11ProgramDescEPNSA_5ScopeEEUlvE_EESt6futureIS6_ET_EUlvE_EvEEEE9_M_invokeERKSt9_Any_data
Fri Oct 19 17:43:44 2018[1,35]<stdout>:*** SIGSEGV (@0x0) received by PID 11194 (TID 0x7fc2c918a700) from PID 0; stack trace: ***
Fri Oct 19 17:43:44 2018[1,19]<stdout>:E1019 17:43:44.249644 198553 listen_and_serv_op.cc:69] run sub program error holder_ should not be null
Fri Oct 19 17:43:44 2018[1,19]<stdout>:Tensor not initialized yet when Tensor::type() is called. at [/paddle/paddle/fluid/framework/tensor.h:139]
Fri Oct 19 17:43:44 2018[1,19]<stdout>:PaddlePaddle Call Stacks:
Fri Oct 19 17:43:44 2018[1,19]<stdout>:0 0x7fbd49399da6p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
Fri Oct 19 17:43:44 2018[1,19]<stdout>:1 0x7fbd4939c006p paddle::framework::Tensor::type() const + 150
Fri Oct 19 17:43:44 2018[1,19]<stdout>:2 0x7fbd49d456a5p paddle::framework::OperatorWithKernel::IndicateDataType(paddle::framework::ExecutionContext const&) const + 149
Fri Oct 19 17:43:44 2018[1,19]<stdout>:3 0x7fbd49d45a7fp paddle::framework::OperatorWithKernel::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 47
Fri Oct 19 17:43:44 2018[1,19]<stdout>:4 0x7fbd49d45f67p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 199
Fri Oct 19 17:43:44 2018[1,19]<stdout>:5 0x7fbd49d424ffp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 255
Fri Oct 19 17:43:44 2018[1,19]<stdout>:6 0x7fbd49457aa9p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 393
Fri Oct 19 17:43:44 2018[1,19]<stdout>:7 0x7fbd49ba0b62p
Fri Oct 19 17:43:44 2018[1,19]<stdout>:8 0x7fbd49abe5cap std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 42
Fri Oct 19 17:43:44 2018[1,19]<stdout>:9 0x7fbd494cb577p std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&) + 39
Fri Oct 19 17:43:44 2018[1,19]<stdout>:10 0x7fbdeb5f1973p pthread_once + 83
Fri Oct 19 17:43:44 2018[1,19]<stdout>:11 0x7fbd49b9ffb2p
Fri Oct 19 17:43:44 2018[1,19]<stdout>:12 0x7fbd49d58f18p paddle::framework::ThreadPool::TaskLoop() + 920
Fri Oct 19 17:43:44 2018[1,19]<stdout>:13 0x7fbd54e258a0p
Fri Oct 19 17:43:44 2018[1,19]<stdout>:14 0x7fbdeb5ec1c3p
Fri Oct 19 17:43:44 2018[1,19]<stdout>:15 0x7fbdeac1412dp clone + 109
Fri Oct 19 17:43:44 2018[1,0]<stdout>: @ 0x7ff370ba70ab paddle::framework::OperatorWithKernel::RunImpl()