Distributed training fails on the develop branch
Created by: Yancey1989
Running distributed training with ParallelExecutor fails.
distribute_transpiler.py needs to be updated to follow PR #10656.
Traceback (most recent call last):
File "vgg16_pe_gpu.py", line 329, in <module>
main()
File "vgg16_pe_gpu.py", line 313, in main
train_exe = fluid.ParallelExecutor(use_cuda=use_gpu, main_program=trainer_prog, loss_name=avg_cost.name)
File "/paddle/build/python/paddle/fluid/parallel_executor.py", line 155, in __init__
build_strategy, num_trainers, trainer_id)
paddle.fluid.core.EnforceNotMet: at [/paddle/paddle/fluid/framework/details/multi_devices_graph_builder.cc:232]
PaddlePaddle Call Stacks:
0 0x7f3439821cacp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7f343a07b036p paddle::framework::details::MultiDevSSAGraphBuilder::IsSparseGradient(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, paddle::framework::proto::VarType_Type, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, paddle::framework::proto::VarType_Type> > > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) const + 566
2 0x7f343a0812eep paddle::framework::details::MultiDevSSAGraphBuilder::Build(paddle::framework::ProgramDesc const&) const + 5214
3 0x7f34398ee52dp paddle::framework::ParallelExecutor::ParallelExecutor(std::vector<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::allocator<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > const&, std::unordered_set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::unordered_set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, 
std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, paddle::framework::ProgramDesc const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, paddle::framework::Scope*, std::vector<paddle::framework::Scope*, std::allocator<paddle::framework::Scope*> > const&, paddle::framework::details::ExecutionStrategy const&, paddle::framework::details::BuildStrategy const&, unsigned long, unsigned long) + 701