Unverified commit 0231f58e, authored by qingqing01, committed by GitHub

Fix double_grad bug in static-graph (#24190) (#24286)

* Rename internal gradient variables in multiple backward passes
* so that they have different names from the previous backward pass.
* For example:
*   y = x * x, grad = fluid.gradients(fluid.gradients(y, x) + y * y, x)
* In the second backward pass, gradient variable names of the partial
* forward network (y * y) may have the same names as those from the
* first fluid.gradients(y, x).

test=develop
Parent 55fc5019
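
The failure can be reproduced with a short double-backward program. Below is a minimal sketch modeled on the TestDoubleGrad test added in this patch; the parameter name 'x', the Constant(3.0) initializer, and the printed check are illustrative choices, not part of the fix:

    import paddle.fluid as fluid

    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        # A 1-element parameter; the value 3.0 is arbitrary and only illustrative.
        x = fluid.layers.create_parameter(
            name='x', shape=[1], dtype='float32',
            default_initializer=fluid.initializer.Constant(3.0))
        y = x * x
        # First backward pass: dy/dx = 2x.
        dx1, = fluid.gradients(y, x)
        # Second backward pass over a graph that reuses part of the forward
        # network (y * y). Before this fix, gradient variables created here
        # could take names already used by the first fluid.gradients call.
        dx2, = fluid.gradients(dx1 * dx1 + y * y, x)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    print(exe.run(main, fetch_list=[dx2]))  # expected: 8x + 4x^3 = 132 at x = 3

The renaming added in this commit gives the second backward pass its own gradient variable names, which is what prevents the collision described in the commit message.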
@@ -827,6 +827,19 @@ def _get_sub_block_path(sub_block, sub_block_op_desc, no_grad_set):
    return sub_block.ops

+def _is_grad_op_(op):
+    op_maker = core.op_proto_and_checker_maker
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
+    if op_maker.kOpRoleVarAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
+        return True
+    return False
+
+
+def _rename_grad_name_(name, grad_order):
+    return 'grad/' * grad_order + name
+
+
def _append_backward_ops_(block,
                          ops,
                          target_block,
@@ -862,6 +875,8 @@ def _append_backward_ops_(block,
    grad_op_descs = []
    program = block.program

+    rename_var_map = {}
+
    # add grad_op_desc by reversed ops
    for op in reversed(ops):
        grad_sub_block_list = []
@@ -894,6 +909,33 @@ def _append_backward_ops_(block,
            for op_desc in grad_op_desc:
                op_desc._set_attr(device_attr_name, op_device)
+
+        # Rename internal gradient variables in multiple backward passes
+        # so that they have different names from the previous backward pass.
+        # For example:
+        #   y = x * x, grad = fluid.gradients(fluid.gradients(y, x) + y * y, x)
+        # In the second backward pass, gradient variable names of the partial
+        # forward network (y * y) may have the same names as those from the
+        # first fluid.gradients(y, x).
+        # So rename here before _addup_repetitive_outputs_.
+        if program._appending_grad_times > 1:
+            for op_desc in grad_op_desc:
+                if not _is_grad_op_(op):
+                    for name in op_desc.input_arg_names():
+                        if name in rename_var_map:
+                            op_desc._rename_input(name, rename_var_map[name])
+                for name in op_desc.output_arg_names():
+                    if "@GRAD" not in name:
+                        continue
+                    if block.desc.find_var(name.encode("ascii")):
+                        new_name = _rename_grad_name_(
+                            name, program._appending_grad_times)
+                        op_desc._rename_output(name, new_name)
+                        rename_var_map[name] = new_name
+
+                        if name in op_grad_to_var:
+                            op_grad_to_var[new_name] = op_grad_to_var[name]
+                            op_grad_to_var.pop(name)
+
        # If input_grad_names_set is not None, extend grad_op_descs only when
        # any input grad in outputs of previous grad ops.
        # But this strategy is not suited for while op for some control flow,
...
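
For reference, the renaming helper above simply prepends one 'grad/' per accumulated backward pass to the conflicting variable name. A standalone sketch of that behavior (plain Python, no Paddle required; the variable name 'y@GRAD' is only an example):

    def _rename_grad_name_(name, grad_order):
        return 'grad/' * grad_order + name

    # During the second call to fluid.gradients, program._appending_grad_times
    # is passed as grad_order (2 here), so an internal gradient variable such
    # as 'y@GRAD' is rewritten before _addup_repetitive_outputs_ runs:
    print(_rename_grad_name_('y@GRAD', 2))  # grad/grad/y@GRAD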
@@ -23,16 +23,62 @@ from paddle.fluid.backward import calc_gradient
class TestCalcGradient(unittest.TestCase):
    def test_calc_gradient(self):
-        x = layers.create_parameter(dtype="float32", shape=[5, 10])
-        y = layers.create_parameter(dtype="float32", shape=[10, 8])
-        mul_out = layers.mul(x=x, y=y)
-        mean_out = layers.mean(mul_out)
-        a = calc_gradient(mean_out, mul_out)
-        b = calc_gradient(mean_out, x)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = layers.create_parameter(dtype="float32", shape=[5, 10])
+            y = layers.create_parameter(dtype="float32", shape=[10, 8])
+            mul_out = layers.mul(x=x, y=y)
+            mean_out = layers.mean(mul_out)
+            a = calc_gradient(mean_out, mul_out)
+            b = calc_gradient(mean_out, x)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+        exe.run(startup)
+        exe.run(main, feed={}, fetch_list=[a, b])
+
+
+class TestDoubleGrad(unittest.TestCase):
+    def test1(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            net = lambda x: x * x
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(3))
+            grad1, = fluid.gradients(net(x), x)  # 2x = 6
+            z = net(x - grad1)
+            grad2, = fluid.gradients(z, x)  # gradients((x - 2x)^2) = 2x = 6
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out = exe.run(main, fetch_list=[grad1.name, grad2.name])
+        self.assertEqual(6, out[0][0])
+        self.assertEqual(6, out[1][0])
+
+    def test2(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(1))
+            y = x * x
+            dx1, = fluid.gradients(y, x)
+            z = dx1 * dx1 + y * y
+            dx2, = fluid.gradients(z, x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out, = exe.run(main, fetch_list=[dx2])
+        self.assertEqual(12, out[0])
if __name__ == "__main__":
...
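
As a sanity check on the values asserted by the new tests, the expected gradients can be derived by hand and verified numerically. The following standalone sketch (plain Python, independent of Paddle) folds grad1/dx1 in as the analytic derivative 2x:

    # test1: y = x^2, grad1 = dy/dx = 2x; z = (x - grad1)^2 = (-x)^2 = x^2,
    #        so grad2 = dz/dx = 2x, which is 6 at x = 3.
    # test2: y = x^2, dx1 = 2x, z = dx1^2 + y^2 = 4x^2 + x^4,
    #        so dx2 = dz/dx = 8x + 4x^3, which is 12 at x = 1.

    def numeric_grad(f, x, eps=1e-4):
        # Central finite difference as an independent check.
        return (f(x + eps) - f(x - eps)) / (2 * eps)

    z1 = lambda x: (x - 2 * x) ** 2             # test1's z, with grad1 = 2x folded in
    z2 = lambda x: (2 * x) ** 2 + (x * x) ** 2  # test2's z, with dx1 = 2x folded in

    print(round(numeric_grad(z1, 3.0), 3))  # ~6.0
    print(round(numeric_grad(z2, 1.0), 3))  # ~12.0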