diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md index 202b4b65103c0b7c536a9cb466c4120ce134d8c3..691081c268b848811bf5ee6d6a41edfe0f47eec0 100644 --- a/doc/design/optimizer.md +++ b/doc/design/optimizer.md @@ -79,7 +79,7 @@ class Optimizer(object): def minimize(self, loss, parameter_list): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ params_grads = self.create_backward_pass(loss, parameter_list) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index b12767b3bbbe51f2f60aede8470b9cff7e7d2c4c..382d057be4e0384f4bffa92d3b172650c51ee867 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -3,7 +3,7 @@ from . import core import collections import pdb -__all__ = ['append_backward_ops'] +__all__ = ['append_backward'] def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, @@ -57,12 +57,11 @@ def _append_grad_suffix_(name): return name + core.grad_var_suffix() -def _backward_impl_(target, - block, - target_block, - no_grad_set, - grad_info_map, - callback=None): +def _append_backward_ops_(target, + block, + target_block, + no_grad_set, + callback=None): grad_op_descs = [] grad_to_var = dict() program = block.program @@ -71,11 +70,10 @@ def _backward_impl_(target, if each_op.has_attr("sub_block"): sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) - original_block_idx = program.current_block_idx grad_sub_block = program.create_block(parent_idx=sub_block_idx) - program.current_block_idx = original_block_idx - _backward_impl_(target, sub_block, grad_sub_block, no_grad_set, - grad_info_map, callback) + sub_grad_to_var = _append_backward_ops_( + target, sub_block, grad_sub_block, no_grad_set, callback) + grad_to_var = dict(grad_to_var, **sub_grad_to_var) grad_sub_block_list.append(grad_sub_block.desc) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( each_op.desc, no_grad_set[block.idx], grad_sub_block_list) @@ -143,20 +141,7 @@ def _backward_impl_(target, "fill_zeros_like", {"X": [_strip_grad_suffix_(arg)]}, {"Y": [arg]}, {}) grad_op_descs.insert(ele[1], fill_zeros_like_op) - # create new gradient variables in the target block desc - new_vars = set() - for op_desc in grad_op_descs: - for grad_var_name in op_desc.output_arg_names(): - grad_var_name = grad_var_name.encode("ascii") - if target_block.desc.has_var_recursive( - grad_var_name) or grad_var_name == core.empty_var_name(): - continue - target_block.desc.var(grad_var_name) - new_vars.add(grad_var_name) - if not grad_to_var.has_key(grad_var_name): - continue - grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, - target_block) + if target_block.idx == 0: grad_target_name = _append_grad_suffix_(target.name) target_block.desc.var(grad_target_name.encode("ascii")) @@ -171,20 +156,40 @@ def _backward_impl_(target, "value": 1.0, "dtype": core.DataType.FP32 })) - # insert backward operators to target_block for op_desc in grad_op_descs: - op_desc.infer_var_type(target_block.desc) - op_desc.infer_shape(target_block.desc) - for arg in op_desc.output_arg_names(): - if arg in new_vars: - _infer_var_data_type_(arg, target_block) new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - target_block.sync_with_cpp() + return grad_to_var + + +def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): + for op_idx in range(start_op_idx, block.desc.op_size()): + op_desc = block.desc.op(op_idx) + if op_desc.has_attr("sub_block"): + sub_block = block.program.block(op_desc.block_attr("sub_block")) + _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) + new_vars = set() + # create new gradient variables + for grad_var_name in op_desc.output_arg_names(): + grad_var_name = grad_var_name.encode("ascii") + if block.desc.has_var_recursive( + grad_var_name) or grad_var_name == core.empty_var_name(): + continue + block.desc.var(grad_var_name) + new_vars.add(grad_var_name) + if not grad_to_var.has_key(grad_var_name): + continue + grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) + # infer_shape and infer_type + op_desc.infer_var_type(block.desc) + op_desc.infer_shape(block.desc) + for arg in op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_(arg, block) -def append_backward_ops(loss, parameter_list=None, no_grad_set=None): +def append_backward(loss, parameter_list=None, no_grad_set=None): """ Create and add gradient Operators in BlockDesc to compute gradients of `loss` for parameters in parameter_list @@ -201,9 +206,9 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): """ assert isinstance(loss, framework.Variable) + program = loss.block.program if no_grad_set is None: no_grad_set = dict() - program = loss.block.program assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -215,14 +220,20 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): no_grad_set[block.idx] = block_no_grad_set grad_info_map = dict() - root_block = loss.block.program.block(0) + root_block = program.block(0) - _backward_impl_(loss, root_block, root_block, no_grad_set, grad_info_map) + fwd_op_num = root_block.desc.op_size() + current_block_idx = program.current_block_idx + grad_to_var = _append_backward_ops_(loss, root_block, root_block, + no_grad_set) + _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) + program.current_block_idx = current_block_idx + program.sync_with_cpp() if parameter_list is not None: parameters = parameter_list else: - params = loss.block.program.global_block().all_parameters() + params = program.global_block().all_parameters() parameters = [param.name for param in params] params_and_grads = [] for param in parameters: @@ -234,7 +245,7 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): raise ValueError("grad block[{0}] did not have grad var {1}".format( grad_info[1], grad_info[0])) # Get the param var from the global block - param_var = loss.block.program.global_block().var(param) + param_var = program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) if loss.block.has_var(grad_info[0]): params_and_grads.append((param_var, grad_var)) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index bbdfab2df9519b77e5df184c00aadf703ec765e0..e1830a7bc72dc97ce137a6d898722fac9b2774f4 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -1,7 +1,7 @@ from collections import defaultdict import framework -from backward import append_backward_ops +from backward import append_backward from framework import unique_name from initializer import Constant from layer_helper import LayerHelper @@ -195,10 +195,10 @@ class Optimizer(object): no_grad_set=None): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ - params_grads = append_backward_ops(loss, parameter_list, no_grad_set) + params_grads = append_backward(loss, parameter_list, no_grad_set) # Add regularization if any params_grads = append_regularization_ops(params_grads, self.regularization) diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index e83c4a0622013cbfebdf39434ef252412697acb1..e4c9b0218d6b348236a57f656c16b00728ec17af 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -4,7 +4,7 @@ import random import itertools import paddle.v2.fluid.core as core import collections -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.op import Operator from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.framework import Program, OpProtoHolder @@ -493,7 +493,7 @@ class OpTest(unittest.TestCase): op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_shape(block.desc) - param_grad_list = append_backward_ops( + param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) feed_dict = { diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py index f6120aedecf1015c279b8f218f5e37f2e598ab91..01321de8eac34d562d99726b1f4125d1932ab40f 100644 --- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py +++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.framework import default_main_program import numpy @@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase): total_sum = layers.sums(input=[a_sum, x_sum]) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) - append_backward_ops(total_sum_scaled) + append_backward(total_sum_scaled) g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py index 2b9d8f351a2836cd723d629d4790de1e068d0ea3..7d815123f3454d1457f59202219f9a93bf3d8c31 100644 --- a/python/paddle/v2/fluid/tests/test_conditional_block.py +++ b/python/paddle/v2/fluid/tests/test_conditional_block.py @@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy @@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase): outs = exe.run(feed={'X': x}, fetch_list=[out])[0] print outs loss = layers.mean(x=out) - append_backward_ops(loss=loss) + append_backward(loss=loss) outs = exe.run( feed={'X': x}, fetch_list=[ diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py index 0a916a55bc3d097e17fb504b0d6b2f2818f030c9..ede1948937fcf2da6665e348cac68a754a3a91ff 100644 --- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py +++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py @@ -4,7 +4,7 @@ import numpy import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -172,7 +172,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): mean = layers.mean(x=result, main_program=program) - append_backward_ops(mean) + append_backward(mean) tensor = core.LoDTensor() tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 2459dfd664300d405edb36c4ca906c1769b5e7d2..3d40d63bd035b3596c6e34d23a998d08de886b9c 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): @@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): learning_rate = 0.01 momentum_optimizer = self.MockMomentum( learning_rate=learning_rate, momentum=0.2, use_nesterov=True) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 adagrad_optimizer = self.MockAdagrad( learning_rate=learning_rate, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, @@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase): learning_rate = 0.01 adam_optimizer = self.MockAdam( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, @@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase): learning_rate = 0.01 adamax_optimizer = self.MockAdamax( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, @@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 decayed_adagrad_optimizer = self.MockDecayedAdagrad( learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) opts = decayed_adagrad_optimizer.create_optimization_pass( diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index 694ff0d8dd794111aff51bb7d503a56b87514342..609287bbce1f6cf02e26ddb8ceb014a104c5fb8b 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -3,7 +3,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy as np import paddle.v2.fluid.core as core @@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase): def test_backward(self): self.check_forward() - append_backward_ops(self.output) + append_backward(self.output) ana_grad = [np.array(x) for x in self.backward()] diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py index 24baf55e90c98f39bab926e8c85a791eee5ed4a4..890c881a126a32344128652691c6cad45e02e82d 100644 --- a/python/paddle/v2/fluid/tests/test_regularizer.py +++ b/python/paddle/v2/fluid/tests/test_regularizer.py @@ -3,7 +3,7 @@ import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.regularizer as regularizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestL2DecayRegularizer(unittest.TestCase): @@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) @@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py index 9999165ed509aa40f31f26aa676f381561bd0016..d1bb20f37a3785f70bee072b9df282bba4012c16 100644 --- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py +++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py @@ -2,7 +2,7 @@ import unittest from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy as np import paddle.v2.fluid.core as core diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index 86db4c64b493d94cc675ed4bcee7e2925fef1977..be1588fc2d09fa58882425eb3d080ef1560ebc79 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.framework import default_main_program import numpy @@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase): self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) mem3_mean = layers.mean(x=mem3) - append_backward_ops(loss=mem3_mean) + append_backward(loss=mem3_mean) x_grad = exe.run( feed={'x': tensor}, fetch_list=[main_program.global_block().var('x@GRAD')])[0] diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py index f5da4e408f0a83dbf6da530b478e91bbf9cd5ab2..f3c634e8f12fe4405ad3a4158452fa4adeeeade4 100644 --- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py @@ -4,7 +4,7 @@ import numpy as np import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -150,7 +150,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): main_program=program) mean = layers.mean(x=out, main_program=program) - append_backward_ops(mean) + append_backward(mean) tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py index 033b03a4957131e1155c61e8ed2f10eefb23fda4..7c5593cc5e5a66d4ccb237e3706ff3e544adf033 100644 --- a/python/paddle/v2/fluid/tests/test_while_op.py +++ b/python/paddle/v2/fluid/tests/test_while_op.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.core as core -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy @@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase): sum_result = layers.array_read(array=mem_array, i=i) loss = layers.mean(x=sum_result) - append_backward_ops(loss) + append_backward(loss) cpu = core.CPUPlace() exe = Executor(cpu)