使用最新 develop 版 Paddle,动态图多卡训练跑不起来
Created by: baiyfbupt
使用最新 develop 版 Paddle 时,https://github.com/PaddlePaddle/models/tree/develop/dygraph/resnet#%E8%AE%AD%E7%BB%83%E6%B5%8B%E8%AF%95residual-network 这里的多卡训练 demo 跑不起来,请问是什么原因?报错信息如下:
start data reader (trainers_num: 2, trainer_id: 0)
Traceback (most recent call last):
File "train.py", line 389, in <module>
train_resnet()
File "train.py", line 340, in train_resnet
out = resnet(img)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 460, in __call__
outputs = self.forward(*inputs, **kwargs)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/parallel.py", line 288, in forward
return self._layers(*inputs, **kwargs)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 460, in __call__
outputs = self.forward(*inputs, **kwargs)
File "train.py", line 226, in forward
y = self.conv(inputs)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 460, in __call__
outputs = self.forward(*inputs, **kwargs)
File "train.py", line 108, in forward
y = self._conv(inputs)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 457, in __call__
self._parameters.values())
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/parallel_helper.py", line 43, in _broadcast_parameters
collective._broadcast(param, 0, sync_mode=True)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/layers/collective.py", line 60, in _broadcast
"root": root})
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/framework.py", line 2552, in append_op
kwargs.get("stop_gradient", False))
File "/usr/local/lib/python3.6/site-packages/paddle/fluid/dygraph/tracer.py", line 43, in trace_op
not stop_gradient)
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > paddle::platform::GetTraceBackString<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, char const*, int)
1 paddle::framework::OpRegistry::CreateOp(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > const&, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, boost::variant<boost::blank, int, float, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<int, std::allocator<int> >, std::vector<float, std::allocator<float> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > > >, bool, std::vector<bool, std::allocator<bool> >, paddle::framework::BlockDesc*, long, std::vector<paddle::framework::BlockDesc*, std::allocator<paddle::framework::BlockDesc*> >, std::vector<long, std::allocator<long> >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, boost::variant<boost::blank, int, float, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<int, std::allocator<int> >, std::vector<float, std::allocator<float> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, bool, std::vector<bool, std::allocator<bool> >, paddle::framework::BlockDesc*, long, std::vector<paddle::framework::BlockDesc*, std::allocator<paddle::framework::BlockDesc*> >, std::vector<long, std::allocator<long> >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > >, bool)
----------------------
Error Message Summary:
----------------------
Error: Operator broadcast has not been registered
[Hint: op_info_ptr should not be null.] at (/baiyifan/Paddle/paddle/fluid/framework/op_info.h:140)