From a839f724d266837eb97fe583823abef0465bb942 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Thu, 20 Jun 2019 23:48:10 +0800 Subject: [PATCH] [cherry-pick]Update backward appending stragety to support double backward. (#18216) * Update backward appending stragety to support double backward and fix some bug. (#18104) * Update backward.py: - If there is no input grad var in all outputs of previous ops, do not append this op into graph. - Only apply this stragety when double backward. * Update some double backward op. * Update sum_op to judge whether a tensor is empty by numel or IsInitialized(). --- paddle/fluid/API.spec | 2 + paddle/fluid/operators/activation_op.cc | 18 +-- paddle/fluid/operators/activation_op.h | 8 -- paddle/fluid/operators/conv_op.cc | 19 ++- paddle/fluid/operators/mul_op.cc | 22 +-- .../operators/reduce_ops/reduce_mean_op.cc | 10 -- paddle/fluid/operators/sum_op.cc | 2 +- paddle/fluid/operators/sum_op.cu | 10 +- python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/backward.py | 132 ++++++++++++++++-- python/paddle/fluid/framework.py | 4 + .../fluid/tests/unittests/gradient_checker.py | 5 +- 12 files changed, 177 insertions(+), 58 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d1742671a8b..74a42ba5de9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -47,6 +47,7 @@ paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.Par paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None +paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942')) paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07ffd5351b30cf47172ccfd61bd0de6f')) paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da')) @@ -555,6 +556,7 @@ paddle.fluid.optimizer.PipelineOptimizer.find_section_opt (ArgSpec(args=['self', paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.PipelineOptimizer.split_program (ArgSpec(args=['self', 'main_program', 'cut_list'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd')) +paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 
'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 8e38d5787bd..0c053cd0e54 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -604,21 +604,25 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { if (static_cast(kDepValue) & static_cast(kDepX)) { - if (ctx->HasOutput("DX")) { + // some op has no output DX, check HasOutputs("DX") here + if (HasOutputs("DX") && ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); } - if (ctx->HasOutput("DDOut")) { + // some op has no output DDout, check HasOutputs("DDout") here + if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } if (static_cast(kDepValue) & static_cast(kDepOut)) { - if (ctx->HasOutput("DOut")) { + // some op has no output DOut, check HasOutputs("DOut") here + if (HasOutputs("DOut") && ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); } - if (ctx->HasOutput("DDOut")) { + // some op has no output DDOut, check HasOutputs("DDOut") here + if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); } @@ -635,7 +639,6 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { // // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 -// dy = 0 // class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { public: @@ -650,9 +653,7 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { // input2: ddx op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); op->SetAttrMap(Attrs()); - // output1: ddy - op->SetOutput("DOut", InputGrad("Out")); - // output2: ddy + // output: ddy op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); return std::unique_ptr<::paddle::framework::OpDesc>(op); } @@ -675,7 +676,6 @@ class LeakyReluDoubleGradMaker op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); op->SetAttrMap(Attrs()); // Out@GRAD@GRAD: ddy - op->SetOutput("DX", InputGrad("X")); op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); return std::unique_ptr<::paddle::framework::OpDesc>(op); } diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5a4fb0828a7..b516fc8a418 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1321,10 +1321,6 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); } - if (dOut) { - auto dout = 
framework::EigenVector::Flatten(detail::Ref(dOut)); - dout.device(*d) = dout.constant(static_cast(0)); - } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; @@ -1351,10 +1347,6 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { (x < static_cast(0)).template cast().eval()) .template cast(); } - if (dX) { - auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); - dx.device(*d) = dx.constant(static_cast(0)); - } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 9572bb69e28..5c48a8ee8f5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -533,9 +533,16 @@ class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker { // ddO, dI, dW // Unlike grad op, double grad op does not use name@GRAD@GRAD // as key of ops' inputs and outputs. - op->SetOutput("DDOutput", InputGrad(framework::GradVarName("Output"))); - op->SetOutput("DFilter", InputGrad("Filter")); - op->SetOutput("DInput", InputGrad("Input")); + auto ddx = OutputGrad(framework::GradVarName("Input")); + auto ddw = OutputGrad(framework::GradVarName("Filter")); + std::vector empty_str = {}; + + op->SetOutput( + "DDOutput", + ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output"))); + op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter")); + op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input")); + op->SetAttrMap(Attrs()); return std::unique_ptr(op); @@ -547,13 +554,13 @@ void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const { auto w_dims = ctx->GetInputDim("Filter"); auto do_dims = ctx->GetInputDim("DOutput"); - if (ctx->HasOutput("DDOutput")) { + if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) { ctx->SetOutputDim("DDOutput", do_dims); } - if (ctx->HasOutput("DFilter")) { + if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { ctx->SetOutputDim("DFilter", w_dims); } - if (ctx->HasOutput("DInput")) { + if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { ctx->SetOutputDim("DInput", x_dims); } } diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 6dac9041b61..bbf9fbfa1ff 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -189,15 +189,15 @@ class MulDoubleGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null"); - if (ctx->HasOutput("DX")) { + if (ctx->HasOutput("DDOut") && ctx->HasInput("DDX")) { + ctx->ShareDim("DOut", "DDOut"); + } + if (ctx->HasOutput("DX") && ctx->HasInput("DDY")) { ctx->ShareDim("X", "DX"); } - if (ctx->HasOutput("DY")) { + if (ctx->HasOutput("DY") && ctx->HasInput("DDX")) { ctx->ShareDim("Y", "DY"); } - if (ctx->HasOutput("DDOut")) { - ctx->ShareDim("DOut", "DDOut"); - } } }; @@ -216,9 +216,15 @@ class MulDoubleGradMaker : public framework::SingleGradOpDescMaker { retv->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - retv->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - retv->SetOutput("DX", InputGrad("X")); - retv->SetOutput("DY", InputGrad("Y")); + auto ddx = OutputGrad(framework::GradVarName("X")); + auto ddw = OutputGrad(framework::GradVarName("Y")); + std::vector empty_str = {}; + + retv->SetOutput("DDOut", (ddx.empty()) + ? 
empty_str + : InputGrad(framework::GradVarName("Out"))); + retv->SetOutput("DX", ddw.empty() ? empty_str : InputGrad("X")); + retv->SetOutput("DY", ddx.empty() ? empty_str : InputGrad("Y")); retv->SetAttrMap(Attrs()); return retv; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 14593ea54ff..d1b508792c2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -46,17 +46,7 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase { std::vector> operator()() const override { std::vector> ops; - auto x_grads = InputGrad("X"); auto x_gg = OutputGrad(framework::GradVarName("X")); // input ddx - if (!x_grads.empty()) { - auto* x_grad_op = new framework::OpDesc(); - x_grad_op->SetType("scale"); - x_grad_op->SetInput("X", x_gg); - x_grad_op->SetOutput("Out", x_grads); - x_grad_op->SetAttr("scale", 0.0f); - ops.emplace_back(x_grad_op); - } - auto out_grads = InputGrad(framework::GradVarName("Out")); if (!out_grads.empty()) { auto* out_grad_op = new framework::OpDesc(); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 1eb4076d64d..e6c87726425 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -111,7 +111,7 @@ class SumOp : public framework::OperatorWithKernel { "Input var[%s] should not be nullptr", x_vars_name[idx]); auto tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]); - if (tensor->numel() == 0) { + if (tensor->numel() <= 0 || (!tensor->IsInitialized())) { continue; } if (dtype == -1) { diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 5cecb7e09e7..790626a59d0 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -126,12 +126,20 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { auto &in_1 = in_vars[1]->Get(); auto length = in_0.numel(); - if (length) { + if (length && in_0.IsInitialized() && in_1.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); auto in_0_e = EigenVector::Flatten(in_0); auto in_1_e = EigenVector::Flatten(in_1); result.device(place) = in_0_e + in_1_e; + } else if (length && in_0.IsInitialized()) { + auto result = EigenVector::Flatten(*out); + auto &place = *dev_ctx.eigen_device(); + result.device(place) = EigenVector::Flatten(in_0); + } else if (length && in_1.IsInitialized()) { + auto result = EigenVector::Flatten(*out); + auto &place = *dev_ctx.eigen_device(); + result.device(place) = EigenVector::Flatten(in_1); } return; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 70252a51e76..1a3a1dd5096 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -39,6 +39,7 @@ from . import contrib from . import nets from . import optimizer from . import backward +from .backward import gradients from . import regularizer from . import average from . 
import metrics @@ -72,7 +73,7 @@ Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + \ - data_feed_desc.__all__ + compiler.__all__ + [ + data_feed_desc.__all__ + compiler.__all__ + backward.__all__ + [ 'io', 'initializer', 'layers', diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 9030a33f3ef..9de001849b9 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -22,7 +22,7 @@ import six from .. import compat as cpt from . import unique_name -__all__ = ['append_backward'] +__all__ = ['append_backward', 'gradients'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): @@ -142,6 +142,7 @@ def _addup_repetitive_outputs_(op_descs): pending_sum_ops = [] var_rename_count = collections.defaultdict(int) renamed_vars = collections.defaultdict(list) + renamed_var_start_idx = collections.defaultdict(list) for idx, op_desc in enumerate(op_descs): for var_name in op_desc.input_arg_names(): if len(renamed_vars[var_name]) > 1: @@ -159,6 +160,7 @@ def _addup_repetitive_outputs_(op_descs): if len(renamed_vars[var_name]) == 0: # it's the first time we get the variable renamed_vars[var_name] = [var_name] + renamed_var_start_idx[var_name] = idx else: if len(renamed_vars[var_name]) == 1: new_name = var_name + "@RENAME@" + \ @@ -166,7 +168,12 @@ def _addup_repetitive_outputs_(op_descs): var_rename_count[var_name] += 1 # rename original var_name renamed_vars[var_name][0] = new_name - _rename_arg_(op_descs, var_name, new_name, 0, idx) + # before change: _rename_arg_(op_descs, var_name, + # new_name, 0, idx) + # rename arg from idx of the first appearance + # in backward, not always from 0 + _rename_arg_(op_descs, var_name, new_name, + renamed_var_start_idx[var_name], idx) _rename_arg_(pending_sum_ops, var_name, new_name) for p in op_desc.output_names()[:param_idx]: @@ -254,7 +261,8 @@ def _append_backward_ops_(block, target_block, no_grad_dict, grad_to_var, - callbacks=None): + callbacks=None, + input_grad_names_set=None): """ Create all grad ops, and insert them into given block @@ -286,8 +294,13 @@ def _append_backward_ops_(block, sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) + # see follwing comments for why set None here. + pre_input_grad_names_set = copy.copy(input_grad_names_set) + input_grad_names_set = None _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, callbacks) + no_grad_dict, grad_to_var, callbacks, + input_grad_names_set) + input_grad_names_set = pre_input_grad_names_set program._rollback() grad_sub_block_list.append(grad_sub_block.desc) @@ -296,8 +309,33 @@ def _append_backward_ops_(block, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) - grad_op_descs.extend(grad_op_desc) - grad_to_var.update(op_grad_to_var) + # If input_grad_names_set is not None, extend grad_op_descs only when + # any input grad in outputs of previous grad ops. + # But this strategy is not suited for while op for some control flow, + # for example, for while op, the grads maybe generated in next loop. 
+ if input_grad_names_set is not None: + is_append_grad = False + for op_desc in grad_op_desc: + input_grad_names = [ + name for name in op_desc.input_arg_names() + if name.find(core.grad_var_suffix()) != -1 + ] + # some code of gradient ops, like increment, are not very + # standard, there is no @GRAD in these ops' inputs. + if len(input_grad_names) == 0: + is_append_grad = True + break + + if _some_in_set_(input_grad_names, input_grad_names_set): + grad_op_descs.append(op_desc) + is_append_grad = True + for name in op_desc.output_arg_names(): + input_grad_names_set.add(name) + if is_append_grad: + grad_to_var.update(op_grad_to_var) + else: + grad_op_descs.extend(grad_op_desc) + grad_to_var.update(op_grad_to_var) grad_op_descs = _addup_repetitive_outputs_(grad_op_descs) @@ -481,6 +519,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, isinstance(callbacks, list) program = loss.block.program + program._appending_grad_times += 1 + if no_grad_set is None: no_grad_set = set() no_grad_set = copy.copy(no_grad_set) @@ -511,10 +551,23 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) - _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, - grad_to_var, callbacks) + input_grad_names_set = None + # For double backward, input_grad_names is used for filter + # some non-used gradients op. + if program._appending_grad_times > 1: + input_grad_names_set = set([_append_grad_suffix_(loss.name)]) + + _append_backward_ops_( + root_block, + op_path, + root_block, + no_grad_dict, + grad_to_var, + callbacks, + input_grad_names_set=input_grad_names_set) # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have @@ -618,17 +671,20 @@ def _find_op_path_(block, outputs, inputs, no_grad_set): def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): """ - Backpropagate the graidents of targets to inputs. + Backpropagate the gradients of targets to inputs. Args: targets(Variable|list[Variable]): The target variables inputs(Variable|list[Variable]): The input variables + target_gradients (Variable|list[Variable]|None): The gradient variables + of targets which has the same shape with targets, If None, ones will + be created for them. no_grad_set(set[string]): The names of variables that have no gradients in Block 0. All variables with `stop_gradient=True` from all blocks will be automatically added. 
Return: - (list[Variable]): list of gradients for inputs + (list[Variable]): A list of gradients for inputs If an input does not affect targets, the corresponding gradient variable will be None """ @@ -638,6 +694,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): block = targets[0].block prog = block.program + # increase appending gradients times + prog._appending_grad_times += 1 block_idx = block.idx if not target_gradients: @@ -655,6 +713,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): fwd_op_num = block.desc.op_size() + input_grad_names_set = set() + target_grad_map = {} for i, grad in enumerate(target_gradients): target = targets[i] @@ -670,6 +730,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): 'output_dim_idx': 0 }) block.desc.append_op().copy_from(op_desc) + input_grad_names_set.add(grad_name) else: if target.block.idx != block_idx or target.block.program != prog: raise ValueError("all targets must be in the same block") @@ -678,6 +739,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): "The shapes of target and grad are different: %s %s" % ( target.name, grad.name)) target_grad_map[_append_grad_suffix_(target.name)] = grad.name + input_grad_names_set.add(grad.name) + + # For double backward, input_grad_names is used for filter + # some non-used gradients op. + if prog._appending_grad_times == 1: + input_grad_names_set = None for input in inputs: if input.block.program != prog: @@ -688,7 +755,13 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() - _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) + _append_backward_ops_( + block, + op_path, + block, + no_grad_dict, + grad_to_var, + input_grad_names_set=input_grad_names_set) # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have @@ -712,3 +785,40 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): return grad_vars[0] else: return grad_vars + + +def gradients(targets, inputs, target_gradients=None, no_grad_set=None): + """ + Backpropagate the gradients of targets to inputs. + + Args: + targets (Variable|list[Variable]): The target variables. + inputs (Variable|list[Variable]): The input variables. + target_gradients (Variable|list[Variable]|None): The gradient variables + of targets which has the same shape with targets, If None, ones will + be created for them. + no_grad_set (set[string]): The names of variables that have no gradients + in Block 0. All variables with `stop_gradient=True` from all blocks + will be automatically added. + + Return: + (list[Variable]): A list of gradients for inputs + If an input does not affect targets, the corresponding gradient variable + will be None. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + + x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32') + x.stop_gradient=False + y = fluid.layers.conv2d(x, 4, 1, bias_attr=False) + y = fluid.layers.relu(y) + y = fluid.layers.conv2d(y, 4, 1, bias_attr=False) + y = fluid.layers.relu(y) + z = fluid.gradients([y], x) + print(z) + """ + outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) + return _as_list(outs) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1407c93a240..7e89c4a36ec 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2775,6 +2775,9 @@ class Program(object): # assigned if this program has been parsed by a pipeline optimizer self._pipeline_opt = None + # appending gradients times + self._appending_grad_times = 0 + @property def _is_mem_optimized(self): # if the program is optimized, operator input/outputs @@ -3108,6 +3111,7 @@ class Program(object): p._current_role = self._current_role p.__op_role_var = self.__op_role_var + p._appending_grad_times = self._appending_grad_times p._sync_with_cpp() diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 98ca93caeb6..3775f62097d 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -23,7 +23,6 @@ from itertools import product import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.executor import Executor -from paddle.fluid.backward import calc_gradient from paddle.fluid.backward import _append_grad_suffix_, _as_list @@ -183,7 +182,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope): dy = program.global_block().create_var( name=dy_name, shape=y.shape, dtype=np_type, persistable=True) # append backward - dx = calc_gradient(y, x, dy) + dx = fluid.gradients(y, x, dy) # init dy tensor in scope value = np.zeros(y.shape, dtype=np_type) @@ -382,7 +381,7 @@ def double_grad_check(x, ] # append first order grads - target_grads = calc_gradient(y, x, y_grads) + target_grads = fluid.gradients(y, x, y_grads) # y_grads are the input of first-order backward, # so, they are also the input of second-order backward. -- GitLab
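
A minimal usage sketch of the double-backward flow this patch enables, assuming the fluid.gradients API and the double-grad op makers updated above (conv2d, relu, reduce_mean); the shapes and layer choices are illustrative only, not taken from the patch.

.. code-block:: python

    import paddle.fluid as fluid

    # Forward graph built from ops whose double-grad makers are touched in this patch.
    x = fluid.layers.data(name='x', shape=[2, 8, 8], dtype='float32')
    x.stop_gradient = False
    y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
    y = fluid.layers.reduce_mean(fluid.layers.relu(y))

    # First backward pass: returns a list with one gradient Variable per input.
    dx = fluid.gradients([y], x)

    # Second backward pass over the first-order gradient graph. Because
    # Program._appending_grad_times is now greater than 1, _append_backward_ops_
    # uses input_grad_names_set to skip grad ops whose inputs contain no gradient
    # produced so far, which is the filtering strategy described in the commit message.
    ddx = fluid.gradients(dx, x)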