diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 61054a20e0cc5b7a66214a635e9725a6d5d46882..605846de1ec3cd777bf1608e805c5e41565fcfc1 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -827,6 +827,19 @@ def _get_sub_block_path(sub_block, sub_block_op_desc, no_grad_set):
     return sub_block.ops
 
 
+def _is_grad_op_(op):
+    op_maker = core.op_proto_and_checker_maker
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
+    if op_maker.kOpRoleVarAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
+        return True
+    return False
+
+
+def _rename_grad_name_(name, grad_order):
+    return 'grad/' * grad_order + name
+
+
 def _append_backward_ops_(block,
                           ops,
                           target_block,
@@ -862,6 +875,8 @@ def _append_backward_ops_(block,
     grad_op_descs = []
     program = block.program
 
+    rename_var_map = {}
+
     # add grad_op_desc by reversed ops
     for op in reversed(ops):
         grad_sub_block_list = []
@@ -894,6 +909,33 @@ def _append_backward_ops_(block,
             for op_desc in grad_op_desc:
                 op_desc._set_attr(device_attr_name, op_device)
 
+        # Rename internal gradient variables in multiple backward passes
+        # so that they have different names from the previous backward pass.
+        # For example:
+        #   y = x * x, grad = fluid.gradients(fluid.gradients(y, x) + y * y, x)
+        # In the second backward pass, gradient variable names of the partial
+        # forward network (y * y) may have the same names as those from the
+        # first-time fluid.gradients(y, x).
+        # So rename them here before _addup_repetitive_outputs_.
+        if program._appending_grad_times > 1:
+            for op_desc in grad_op_desc:
+                if not _is_grad_op_(op):
+                    for name in op_desc.input_arg_names():
+                        if name in rename_var_map:
+                            op_desc._rename_input(name, rename_var_map[name])
+                for name in op_desc.output_arg_names():
+                    if "@GRAD" not in name:
+                        continue
+                    if block.desc.find_var(name.encode("ascii")):
+                        new_name = _rename_grad_name_(
+                            name, program._appending_grad_times)
+                        op_desc._rename_output(name, new_name)
+                        rename_var_map[name] = new_name
+
+                        if name in op_grad_to_var:
+                            op_grad_to_var[new_name] = op_grad_to_var[name]
+                            op_grad_to_var.pop(name)
+
         # If input_grad_names_set is not None, extend grad_op_descs only when
         # any input grad in outputs of previous grad ops.
         # But this strategy is not suited for while op for some control flow,
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 4120a18b72f87c7e750a0fb68780292b58e3a7f4..3e8c449d8995ca90401861e93f2fb987d1c6967d 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -23,16 +23,62 @@ from paddle.fluid.backward import calc_gradient
 
 class TestCalcGradient(unittest.TestCase):
     def test_calc_gradient(self):
-        x = layers.create_parameter(dtype="float32", shape=[5, 10])
-        y = layers.create_parameter(dtype="float32", shape=[10, 8])
-        mul_out = layers.mul(x=x, y=y)
-        mean_out = layers.mean(mul_out)
-        a = calc_gradient(mean_out, mul_out)
-        b = calc_gradient(mean_out, x)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = layers.create_parameter(dtype="float32", shape=[5, 10])
+            y = layers.create_parameter(dtype="float32", shape=[10, 8])
+            mul_out = layers.mul(x=x, y=y)
+            mean_out = layers.mean(mul_out)
+            a = calc_gradient(mean_out, mul_out)
+            b = calc_gradient(mean_out, x)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+        exe.run(startup)
+        exe.run(main, feed={}, fetch_list=[a, b])
+
+
+class TestDoubleGrad(unittest.TestCase):
+    def test1(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            net = lambda x: x * x
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(3))
+            grad1, = fluid.gradients(net(x), x)  # 2x = 6
+            z = net(x - grad1)
+            grad2, = fluid.gradients(z, x)  # gradients((x - 2x)^2) = 2x = 6
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out = exe.run(main, fetch_list=[grad1.name, grad2.name])
+        self.assertEqual(6, out[0][0])
+        self.assertEqual(6, out[1][0])
+
+    def test2(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(1))
+            y = x * x
+            dx1, = fluid.gradients(y, x)
+            z = dx1 * dx1 + y * y
+            dx2, = fluid.gradients(z, x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out, = exe.run(main, fetch_list=[dx2])
+        self.assertEqual(12, out[0])
 
 
 if __name__ == "__main__":
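Note (not part of the patch): the snippet below is a minimal standalone sketch of the naming scheme that _rename_grad_name_ applies. When _append_backward_ops_ is entered a second time (program._appending_grad_times == 2), a clashing internal gradient name such as "y@GRAD" is prefixed with "grad/" once per backward pass, so the second pass writes "grad/grad/y@GRAD" instead of colliding with the variable created by the first pass.

# Sketch only; copies the helper added in backward.py above to show the
# resulting names, no Paddle import required.
def _rename_grad_name_(name, grad_order):
    return 'grad/' * grad_order + name

assert _rename_grad_name_("y@GRAD", 1) == "grad/y@GRAD"
assert _rename_grad_name_("y@GRAD", 2) == "grad/grad/y@GRAD"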
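For reference, the constants asserted in TestDoubleGrad can be re-derived by hand; the short plain-Python check below (an illustration, not part of the test suite) restates that derivation.

# Sanity check for the expected gradients in TestDoubleGrad:
#   test1: y = x^2, grad1 = dy/dx = 2x; z = (x - grad1)^2 = (-x)^2 = x^2,
#          so dz/dx = 2x, which is 6 at x = 3.
#   test2: y = x^2, dx1 = 2x; z = dx1^2 + y^2 = 4x^2 + x^4,
#          so dz/dx = 8x + 4x^3, which is 12 at x = 1.
def dz_dx_test1(x):
    return 2.0 * x

def dz_dx_test2(x):
    return 8.0 * x + 4.0 * x ** 3

assert dz_dx_test1(3.0) == 6.0
assert dz_dx_test2(1.0) == 12.0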