未验证 提交 d4c7774f 编写于 作者: X xiongkun 提交者: GitHub

[dy2static-bugfix] fix backward gradient aggregation bugs (#50474)

* [dy2static-bugfix] fix backward gradient aggregation bugs
1. Both YOLOv3 and YOLOv5 face the same problem.

* remove set_device

* code review fix
上级 e2aacd21
...@@ -75,6 +75,13 @@ void AddNKernel(const Context &dev_ctx, ...@@ -75,6 +75,13 @@ void AddNKernel(const Context &dev_ctx,
const std::vector<const TensorBase *> &x, const std::vector<const TensorBase *> &x,
DenseTensor *out) { DenseTensor *out) {
const size_t in_num = x.size(); const size_t in_num = x.size();
for (int i = 0; i < in_num; ++i) {
PADDLE_ENFORCE_EQ(
x[i]->initialized(),
true,
phi::errors::InvalidArgument(
"This argument is invalid, %d-th tensor is uninitialized.", i));
}
constexpr size_t theory_sm_threads = 1024; constexpr size_t theory_sm_threads = 1024;
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
......
...@@ -1672,6 +1672,18 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -1672,6 +1672,18 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
ops_to_remove.append(op_idx) ops_to_remove.append(op_idx)
continue continue
# sum may create invalid variable, here to deal with it.
if op_desc.type() == 'sum':
new_inputs = []
for grad_var_name in op_desc.input_arg_names():
if block.desc.has_var_recursive(grad_var_name.encode()):
# meet invalid sum variables, remove the invalid operand.
new_inputs.append(grad_var_name)
assert (
len(new_inputs) > 0
), "After remove invalid variables, sum op have no inputs."
op_desc.set_input("X", new_inputs)
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
for grad_var_name in op_desc.output_arg_names(): for grad_var_name in op_desc.output_arg_names():
......
...@@ -409,6 +409,39 @@ class ConditionalNet(BackwardNet): ...@@ -409,6 +409,39 @@ class ConditionalNet(BackwardNet):
super().__init__() super().__init__()
class TestBackwardUninitializedVariable(unittest.TestCase):
    """Regression test for a bug found while running YOLOv5 with to_static:
    backward gradient aggregation could build a ``sum`` op over an invalid
    (uninitialized) gradient variable.
    """

    def test(self):
        # Static-graph mode is required so that appending backward ops
        # exercises the gradient-aggregation code path under test.
        paddle.enable_static()
        main_prg, startup_prg = paddle.static.Program(), paddle.static.Program()
        with paddle.static.program_guard(main_prg, startup_prg):
            gt = paddle.static.data(name='gt', shape=[4], dtype='float32')
            x = paddle.static.data(name='x', shape=[2], dtype='float32')
            gt.stop_gradient = True
            x.stop_gradient = False
            # Reshape round-trip so `gt` is produced by ops instead of being
            # a plain data var — this mirrors the originally failing graph.
            gt = gt.reshape([4, 1]).reshape([4])
            # Two loss terms each consume a slice of `gt`; their gradients
            # w.r.t. `x` must then be aggregated by a `sum` op, which is
            # where the uninitialized-variable bug appeared.
            loss = (
                paddle.nn.functional.binary_cross_entropy(x, gt[:2])
                + (gt[2:4] * x).sum()
            )
            exe = paddle.static.Executor()
            # Appending the backward pass used to produce a `sum` op with an
            # invalid input; this call must now succeed without error.
            paddle.fluid.backward.gradients(loss, [])
            exe.run(startup_prg)
            # Optimizer
            out = exe.run(
                main_prg,
                feed={
                    'gt': np.array([1.0, 1.0, 0.0, 0.0], dtype='float32'),
                    'x': np.array([0.5, 0.5], dtype='float32'),
                },
                fetch_list=[loss],
            )
            print(out)
# Script entry point: run the test suite in static-graph mode.
if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册