test_recognize_digits.py使用LARS_weight_decay时分布式训练出错
Created by: kolinwei
单机训练是正常的。将 `optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)` 换成其他的优化方式后,分布式训练也是正常的;只有使用 LARS_weight_decay 时分布式训练才会出错。
报错信息如下:

```
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/transpiler/distribute_transpiler.py", line 592, in get_pserver_program
    cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/transpiler/distribute_transpiler.py", line 1523, in _append_pserver_non_opt_ops
    attrs=opt_op.all_attrs())
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/framework.py", line 1140, in append_op
    op = Operator(block=self, desc=op_desc, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/framework.py", line 589, in __init__
    self.desc.infer_shape(self.block.desc)
paddle.fluid.core.EnforceNotMet: Enforce failed. Expected x_mat_dims[1] == y_mat_dims[0], but received x_mat_dims[1]:400 != y_mat_dims[0]:800.
First matrix's width must be equal with second matrix's height. at [/paddle/paddle/fluid/operators/mul_op.cc:59]
PaddlePaddle Call Stacks:
0 0x7f0394963d06p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7f039550ceb9p paddle::operators::MulOp::InferShape(paddle::framework::InferShapeContext*) const + 2569
2 0x7f0394a15876p paddle::framework::OpDesc::InferShape(paddle::framework::BlockDesc const&) const + 886
3 0x7f03949c0fe5p void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::OpDesc, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::OpDesc::)(paddle::framework::BlockDesc const&) const, pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&)#1 (closed)}, void, paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::OpDesc, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::OpDesc::)(paddle::framework::BlockDesc const&) const, pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::OpDesc const, paddle::framework::BlockDesc const&)#1 (closed)}&&, void ()(paddle::framework::OpDesc const, paddle::framework::BlockDesc const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3 (closed)}::_FUN(pybind11::detail::function_call) + 213
4 0x7f03949749e4p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
```