Unverified commit 80d2e66f authored by qingqing01, committed by GitHub

Update backward appending strategy to support double backward and fix some bugs. (#18104)

* Update backward.py:
     - If none of a grad op's input grad vars appear in the outputs of the previously appended grad ops, do not append that op to the graph.
     - Only apply this strategy for double backward.
* Update some double backward ops.
* Update sum_op to judge whether a tensor is empty by numel() or IsInitialized().
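For context, a minimal sketch of the double-backward usage this strategy targets, adapted from the fluid.gradients docstring example added in this diff; the reduce_mean loss and SGD optimizer here are illustrative assumptions, not part of the change. Gradients are appended to the same Program twice, so the second pass filters out grad ops whose input grad vars are never produced.

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[2, 8, 8], dtype='float32')
    x.stop_gradient = False
    y = fluid.layers.relu(fluid.layers.conv2d(x, 4, 1, bias_attr=False))

    # First backward pass: Program._appending_grad_times becomes 1.
    dx = fluid.gradients([y], x)

    # Second backward pass (gradient of a gradient): _appending_grad_times > 1,
    # so _append_backward_ops_ receives input_grad_names_set and skips grad ops
    # whose input grad vars were never produced by earlier grad ops.
    loss = fluid.layers.reduce_mean(dx[0])
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)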
Parent ca5642c8
@@ -47,6 +47,7 @@ paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.Par
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None
+paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
 paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942'))
 paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07ffd5351b30cf47172ccfd61bd0de6f'))
 paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da'))
@@ -556,6 +557,7 @@ paddle.fluid.optimizer.PipelineOptimizer.find_section_opt (ArgSpec(args=['self',
 paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.PipelineOptimizer.split_program (ArgSpec(args=['self', 'main_program', 'cut_list'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd'))
+paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None
......
@@ -604,21 +604,21 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
-      if (ctx->HasOutput("DX")) {
+      if (HasOutputs("DX") && ctx->HasOutput("DX")) {
         ctx->ShareDim("X", "DX");
         ctx->ShareLoD("X", "DX");
       }
-      if (ctx->HasOutput("DDOut")) {
+      if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) {
         ctx->ShareDim("X", "DDOut");
         ctx->ShareLoD("X", "DDOut");
       }
     }
     if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
-      if (ctx->HasOutput("DOut")) {
+      if (HasOutputs("DOut") && ctx->HasOutput("DOut")) {
         ctx->ShareDim("Out", "DOut");
         ctx->ShareLoD("Out", "DOut");
       }
-      if (ctx->HasOutput("DDOut")) {
+      if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) {
         ctx->ShareDim("Out", "DDOut");
         ctx->ShareLoD("Out", "DDOut");
       }
@@ -635,7 +635,6 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
 //
 //  ReluGrad: dx = dy if y >= 0 else 0
 //  ReluGradGrad: ddy = ddx if y >= 0 else 0
-//                dy = 0
 //
 class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
  public:
@@ -650,9 +649,7 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
     // input2: ddx
     op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(Attrs());
-    // output1: ddy
-    op->SetOutput("DOut", InputGrad("Out"));
-    // output2: ddy
+    // output: ddy
     op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
     return std::unique_ptr<::paddle::framework::OpDesc>(op);
   }
@@ -675,7 +672,6 @@ class LeakyReluDoubleGradMaker
     op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(Attrs());
     // Out@GRAD@GRAD: ddy
-    op->SetOutput("DX", InputGrad("X"));
     op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
     return std::unique_ptr<::paddle::framework::OpDesc>(op);
   }
......
@@ -1321,10 +1321,6 @@ struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
       auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
       ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
     }
-    if (dOut) {
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-      dout.device(*d) = dout.constant(static_cast<T>(0));
-    }
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
@@ -1351,10 +1347,6 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
                  (x < static_cast<T>(0)).template cast<T>().eval())
                     .template cast<T>();
     }
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      dx.device(*d) = dx.constant(static_cast<T>(0));
-    }
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
......
@@ -533,9 +533,16 @@ class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker {
     // ddO, dI, dW
     // Unlike grad op, double grad op does not use name@GRAD@GRAD
     // as key of ops' inputs and outputs.
-    op->SetOutput("DDOutput", InputGrad(framework::GradVarName("Output")));
-    op->SetOutput("DFilter", InputGrad("Filter"));
-    op->SetOutput("DInput", InputGrad("Input"));
+    auto ddx = OutputGrad(framework::GradVarName("Input"));
+    auto ddw = OutputGrad(framework::GradVarName("Filter"));
+    std::vector<std::string> empty_str = {};
+
+    op->SetOutput(
+        "DDOutput",
+        ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output")));
+    op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter"));
+    op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input"));
     op->SetAttrMap(Attrs());
     return std::unique_ptr<framework::OpDesc>(op);
@@ -547,13 +554,13 @@ void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const {
   auto w_dims = ctx->GetInputDim("Filter");
   auto do_dims = ctx->GetInputDim("DOutput");
-  if (ctx->HasOutput("DDOutput")) {
+  if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) {
     ctx->SetOutputDim("DDOutput", do_dims);
   }
-  if (ctx->HasOutput("DFilter")) {
+  if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) {
     ctx->SetOutputDim("DFilter", w_dims);
   }
-  if (ctx->HasOutput("DInput")) {
+  if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) {
     ctx->SetOutputDim("DInput", x_dims);
   }
 }
......
@@ -189,15 +189,15 @@ class MulDoubleGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null");
-    if (ctx->HasOutput("DX")) {
+    if (ctx->HasOutput("DDOut") && ctx->HasInput("DDX")) {
+      ctx->ShareDim("DOut", "DDOut");
+    }
+    if (ctx->HasOutput("DX") && ctx->HasInput("DDY")) {
       ctx->ShareDim("X", "DX");
     }
-    if (ctx->HasOutput("DY")) {
+    if (ctx->HasOutput("DY") && ctx->HasInput("DDX")) {
       ctx->ShareDim("Y", "DY");
     }
-    if (ctx->HasOutput("DDOut")) {
-      ctx->ShareDim("DOut", "DDOut");
-    }
   }
 };
@@ -216,9 +216,15 @@ class MulDoubleGradMaker : public framework::SingleGradOpDescMaker {
     retv->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
     retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
-    retv->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
-    retv->SetOutput("DX", InputGrad("X"));
-    retv->SetOutput("DY", InputGrad("Y"));
+    auto ddx = OutputGrad(framework::GradVarName("X"));
+    auto ddw = OutputGrad(framework::GradVarName("Y"));
+    std::vector<std::string> empty_str = {};
+
+    retv->SetOutput("DDOut", (ddx.empty())
+                                 ? empty_str
+                                 : InputGrad(framework::GradVarName("Out")));
+    retv->SetOutput("DX", ddw.empty() ? empty_str : InputGrad("X"));
+    retv->SetOutput("DY", ddx.empty() ? empty_str : InputGrad("Y"));
     retv->SetAttrMap(Attrs());
     return retv;
......
@@ -46,17 +46,7 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase {
   std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
     std::vector<std::unique_ptr<framework::OpDesc>> ops;
-    auto x_grads = InputGrad("X");
     auto x_gg = OutputGrad(framework::GradVarName("X"));  // input ddx
-    if (!x_grads.empty()) {
-      auto* x_grad_op = new framework::OpDesc();
-      x_grad_op->SetType("scale");
-      x_grad_op->SetInput("X", x_gg);
-      x_grad_op->SetOutput("Out", x_grads);
-      x_grad_op->SetAttr("scale", 0.0f);
-      ops.emplace_back(x_grad_op);
-    }
     auto out_grads = InputGrad(framework::GradVarName("Out"));
     if (!out_grads.empty()) {
       auto* out_grad_op = new framework::OpDesc();
......
@@ -111,7 +111,7 @@ class SumOp : public framework::OperatorWithKernel {
                      "Input var[%s] should not be nullptr", x_vars_name[idx]);
       auto tensor =
           framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]);
-      if (tensor->numel() == 0) {
+      if (tensor->numel() <= 0 || (!tensor->IsInitialized())) {
         continue;
       }
       if (dtype == -1) {
......
@@ -126,12 +126,20 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
     auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
     auto length = in_0.numel();
-    if (length) {
+    if (length && in_0.IsInitialized() && in_1.IsInitialized()) {
       auto result = EigenVector<T>::Flatten(*out);
       auto &place = *dev_ctx.eigen_device();
       auto in_0_e = EigenVector<T>::Flatten(in_0);
       auto in_1_e = EigenVector<T>::Flatten(in_1);
       result.device(place) = in_0_e + in_1_e;
+    } else if (length && in_0.IsInitialized()) {
+      auto result = EigenVector<T>::Flatten(*out);
+      auto &place = *dev_ctx.eigen_device();
+      result.device(place) = EigenVector<T>::Flatten(in_0);
+    } else if (length && in_1.IsInitialized()) {
+      auto result = EigenVector<T>::Flatten(*out);
+      auto &place = *dev_ctx.eigen_device();
+      result.device(place) = EigenVector<T>::Flatten(in_1);
     }
     return;
   }
......
@@ -39,6 +39,7 @@ from . import contrib
 from . import nets
 from . import optimizer
 from . import backward
+from .backward import gradients
 from . import regularizer
 from . import average
 from . import metrics
@@ -72,7 +73,7 @@ Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + \
     trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \
     parallel_executor.__all__ + lod_tensor.__all__ + \
-    data_feed_desc.__all__ + compiler.__all__ + [
+    data_feed_desc.__all__ + compiler.__all__ + backward.__all__ + [
        'io',
        'initializer',
        'layers',
......
@@ -22,7 +22,7 @@ import six
 from .. import compat as cpt
 from . import unique_name
-__all__ = ['append_backward']
+__all__ = ['append_backward', 'gradients']
 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -142,6 +142,7 @@ def _addup_repetitive_outputs_(op_descs):
     pending_sum_ops = []
     var_rename_count = collections.defaultdict(int)
     renamed_vars = collections.defaultdict(list)
+    renamed_var_start_idx = collections.defaultdict(list)
     for idx, op_desc in enumerate(op_descs):
         for var_name in op_desc.input_arg_names():
             if len(renamed_vars[var_name]) > 1:
@@ -159,6 +160,7 @@ def _addup_repetitive_outputs_(op_descs):
                 if len(renamed_vars[var_name]) == 0:
                     # it's the first time we get the variable
                     renamed_vars[var_name] = [var_name]
+                    renamed_var_start_idx[var_name] = idx
                 else:
                     if len(renamed_vars[var_name]) == 1:
                         new_name = var_name + "@RENAME@" + \
@@ -166,7 +168,12 @@ def _addup_repetitive_outputs_(op_descs):
                         var_rename_count[var_name] += 1
                         # rename original var_name
                         renamed_vars[var_name][0] = new_name
-                        _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                        # before this change: _rename_arg_(op_descs, var_name,
+                        #                                  new_name, 0, idx)
+                        # rename the arg starting from the index of its first
+                        # appearance in backward, not always from 0
+                        _rename_arg_(op_descs, var_name, new_name,
+                                     renamed_var_start_idx[var_name], idx)
                         _rename_arg_(pending_sum_ops, var_name, new_name)
                     for p in op_desc.output_names()[:param_idx]:
@@ -254,7 +261,8 @@ def _append_backward_ops_(block,
                           target_block,
                           no_grad_dict,
                           grad_to_var,
-                          callbacks=None):
+                          callbacks=None,
+                          input_grad_names_set=None):
     """
     Create all grad ops, and insert them into given block
@@ -286,8 +294,13 @@ def _append_backward_ops_(block,
             sub_block = program.block(op._block_attr_id("sub_block"))
             grad_sub_block = program._create_block()
             grad_sub_block._set_forward_block_idx(sub_block.idx)
+            # see the following comments for why it is set to None here
+            pre_input_grad_names_set = copy.copy(input_grad_names_set)
+            input_grad_names_set = None
             _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                  no_grad_dict, grad_to_var, callbacks)
+                                  no_grad_dict, grad_to_var, callbacks,
+                                  input_grad_names_set)
+            input_grad_names_set = pre_input_grad_names_set
             program._rollback()
             grad_sub_block_list.append(grad_sub_block.desc)
@@ -296,6 +309,31 @@ def _append_backward_ops_(block,
         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
            op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
+        # If input_grad_names_set is not None, extend grad_op_descs only when
+        # at least one input grad appears in the outputs of previous grad ops.
+        # This strategy is not suitable for ops with control flow, such as
+        # while op, whose grads may be generated in the next loop iteration.
+        if input_grad_names_set is not None:
+            is_append_grad = False
+            for op_desc in grad_op_desc:
+                input_grad_names = [
+                    name for name in op_desc.input_arg_names()
+                    if name.find(core.grad_var_suffix()) != -1
+                ]
+                # Some grad ops, such as increment, are not very standard and
+                # have no @GRAD suffix in their inputs.
+                if len(input_grad_names) == 0:
+                    is_append_grad = True
+                    break
+                if _some_in_set_(input_grad_names, input_grad_names_set):
+                    grad_op_descs.append(op_desc)
+                    is_append_grad = True
+                    for name in op_desc.output_arg_names():
+                        input_grad_names_set.add(name)
+            if is_append_grad:
+                grad_to_var.update(op_grad_to_var)
-        grad_op_descs.extend(grad_op_desc)
-        grad_to_var.update(op_grad_to_var)
+        else:
+            grad_op_descs.extend(grad_op_desc)
+            grad_to_var.update(op_grad_to_var)
@@ -481,6 +519,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         isinstance(callbacks, list)
     program = loss.block.program
+    program._appending_grad_times += 1
     if no_grad_set is None:
         no_grad_set = set()
     no_grad_set = copy.copy(no_grad_set)
@@ -511,10 +551,23 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
     op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
-    _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
-                          grad_to_var, callbacks)
+    input_grad_names_set = None
+    # For double backward, input_grad_names_set is used to filter out
+    # grad ops that are never needed.
+    if program._appending_grad_times > 1:
+        input_grad_names_set = set([_append_grad_suffix_(loss.name)])
+    _append_backward_ops_(
+        root_block,
+        op_path,
+        root_block,
+        no_grad_dict,
+        grad_to_var,
+        callbacks,
+        input_grad_names_set=input_grad_names_set)
     # Because calc_gradient may be called multiple times,
     # we need rename the internal gradient variables so that they have
@@ -618,17 +671,20 @@ def _find_op_path_(block, outputs, inputs, no_grad_set):
 def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     """
-    Backpropagate the graidents of targets to inputs.
+    Backpropagate the gradients of targets to inputs.
     Args:
         targets(Variable|list[Variable]): The target variables
         inputs(Variable|list[Variable]): The input variables
+        target_gradients (Variable|list[Variable]|None): The gradient variables
+            of targets, which have the same shape as targets. If None, ones will
+            be created for them.
         no_grad_set(set[string]): The names of variables that have no gradients
             in Block 0. All variables with `stop_gradient=True` from all blocks
             will be automatically added.
     Return:
-        (list[Variable]): list of gradients for inputs
+        (list[Variable]): A list of gradients for inputs.
         If an input does not affect targets, the corresponding gradient variable
         will be None
     """
@@ -638,6 +694,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     block = targets[0].block
     prog = block.program
+    # increase the number of times gradients have been appended
+    prog._appending_grad_times += 1
     block_idx = block.idx
     if not target_gradients:
@@ -655,6 +713,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     fwd_op_num = block.desc.op_size()
+    input_grad_names_set = set()
     target_grad_map = {}
     for i, grad in enumerate(target_gradients):
         target = targets[i]
@@ -670,6 +730,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
                     'output_dim_idx': 0
                 })
             block.desc.append_op().copy_from(op_desc)
+            input_grad_names_set.add(grad_name)
         else:
             if target.block.idx != block_idx or target.block.program != prog:
                 raise ValueError("all targets must be in the same block")
@@ -678,6 +739,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
                     "The shapes of target and grad are different: %s %s" % (
                         target.name, grad.name))
             target_grad_map[_append_grad_suffix_(target.name)] = grad.name
+            input_grad_names_set.add(grad.name)
+
+    # For double backward, input_grad_names_set is used to filter out
+    # grad ops that are never needed.
+    if prog._appending_grad_times == 1:
+        input_grad_names_set = None
     for input in inputs:
         if input.block.program != prog:
@@ -688,7 +755,13 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
     grad_to_var = dict()
     grad_info_map = dict()
-    _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
+    _append_backward_ops_(
+        block,
+        op_path,
+        block,
+        no_grad_dict,
+        grad_to_var,
+        input_grad_names_set=input_grad_names_set)
     # Because calc_gradient may be called multiple times,
     # we need rename the internal gradient variables so that they have
@@ -712,3 +785,40 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
         return grad_vars[0]
     else:
         return grad_vars
+
+
+def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
+    """
+    Backpropagate the gradients of targets to inputs.
+
+    Args:
+        targets (Variable|list[Variable]): The target variables.
+        inputs (Variable|list[Variable]): The input variables.
+        target_gradients (Variable|list[Variable]|None): The gradient variables
+            of targets, which have the same shape as targets. If None, ones will
+            be created for them.
+        no_grad_set (set[string]): The names of variables that have no gradients
+            in Block 0. All variables with `stop_gradient=True` from all blocks
+            will be automatically added.
+
+    Return:
+        (list[Variable]): A list of gradients for inputs.
+        If an input does not affect targets, the corresponding gradient variable
+        will be None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32')
+            x.stop_gradient = False
+            y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
+            y = fluid.layers.relu(y)
+            y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
+            y = fluid.layers.relu(y)
+            z = fluid.gradients([y], x)
+            print(z)
+    """
+    outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
+    return _as_list(outs)
@@ -2775,6 +2775,9 @@ class Program(object):
         # assigned if this program has been parsed by a pipeline optimizer
         self._pipeline_opt = None
+        # the number of times gradients have been appended to this program
+        self._appending_grad_times = 0
+
     @property
     def _is_mem_optimized(self):
         # if the program is optimized, operator input/outputs
@@ -3108,6 +3111,7 @@ class Program(object):
         p._current_role = self._current_role
         p.__op_role_var = self.__op_role_var
+        p._appending_grad_times = self._appending_grad_times
         p._sync_with_cpp()
......
@@ -23,7 +23,6 @@ from itertools import product
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
-from paddle.fluid.backward import calc_gradient
 from paddle.fluid.backward import _append_grad_suffix_, _as_list
@@ -183,7 +182,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope):
     dy = program.global_block().create_var(
         name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
     # append backward
-    dx = calc_gradient(y, x, dy)
+    dx = fluid.gradients(y, x, dy)
     # init dy tensor in scope
     value = np.zeros(y.shape, dtype=np_type)
@@ -382,7 +381,7 @@ def double_grad_check(x,
     ]
     # append first order grads
-    target_grads = calc_gradient(y, x, y_grads)
+    target_grads = fluid.gradients(y, x, y_grads)
     # y_grads are the input of first-order backward,
     # so, they are also the input of second-order backward.
......