Unverified · Commit 80d2e66f authored by qingqing01, committed by GitHub

Update backward appending strategy to support double backward and fix some bugs. (#18104)

* Update backward.py:
     - If none of an op's input grad vars appear in the outputs of previous grad ops, do not append this op to the graph.
     - Only apply this strategy for double backward.
* Update some double backward ops.
* Update sum_op to judge whether a tensor is empty by numel() or IsInitialized().
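For reference, a minimal sketch of the double-backward pattern this strategy targets, written against the fluid.gradients API added in this change; the toy network and the reduce_mean loss over the first-order gradient are illustrative assumptions, not code from this commit:

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2, 8, 8], dtype='float32')
x.stop_gradient = False
y = fluid.layers.relu(fluid.layers.conv2d(x, 4, 1, bias_attr=False))

# First backward pass: program._appending_grad_times becomes 1.
dx = fluid.gradients([y], x)

# Second backward pass over the first-order gradient:
# program._appending_grad_times becomes 2, so the new filtering strategy
# applies and grad ops that produce no used gradient are not appended.
loss = fluid.layers.reduce_mean(dx[0])
ddx = fluid.gradients([loss], x)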
Parent ca5642c8
......@@ -47,6 +47,7 @@ paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.Par
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942'))
paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07ffd5351b30cf47172ccfd61bd0de6f'))
paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da'))
......@@ -556,6 +557,7 @@ paddle.fluid.optimizer.PipelineOptimizer.find_section_opt (ArgSpec(args=['self',
paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.PipelineOptimizer.split_program (ArgSpec(args=['self', 'main_program', 'cut_list'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd'))
paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None
......
......@@ -604,21 +604,21 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
if (ctx->HasOutput("DX")) {
if (HasOutputs("DX") && ctx->HasOutput("DX")) {
ctx->ShareDim("X", "DX");
ctx->ShareLoD("X", "DX");
}
if (ctx->HasOutput("DDOut")) {
if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) {
ctx->ShareDim("X", "DDOut");
ctx->ShareLoD("X", "DDOut");
}
}
if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
if (ctx->HasOutput("DOut")) {
if (HasOutputs("DOut") && ctx->HasOutput("DOut")) {
ctx->ShareDim("Out", "DOut");
ctx->ShareLoD("Out", "DOut");
}
if (ctx->HasOutput("DDOut")) {
if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) {
ctx->ShareDim("Out", "DDOut");
ctx->ShareLoD("Out", "DDOut");
}
......@@ -635,7 +635,6 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel {
//
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
// dy = 0
//
class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
public:
......@@ -650,9 +649,7 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
// input2: ddx
op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(Attrs());
// output1: ddy
op->SetOutput("DOut", InputGrad("Out"));
// output2: ddy
// output: ddy
op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
return std::unique_ptr<::paddle::framework::OpDesc>(op);
}
......@@ -675,7 +672,6 @@ class LeakyReluDoubleGradMaker
op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(Attrs());
// Out@GRAD@GRAD: ddy
op->SetOutput("DX", InputGrad("X"));
op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
return std::unique_ptr<::paddle::framework::OpDesc>(op);
}
......
......@@ -1321,10 +1321,6 @@ struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
}
if (dOut) {
auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
dout.device(*d) = dout.constant(static_cast<T>(0));
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
......@@ -1351,10 +1347,6 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
(x < static_cast<T>(0)).template cast<T>().eval())
.template cast<T>();
}
if (dX) {
auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
dx.device(*d) = dx.constant(static_cast<T>(0));
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
......
......@@ -533,9 +533,16 @@ class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker {
// ddO, dI, dW
// Unlike grad op, double grad op does not use name@GRAD@GRAD
// as key of ops' inputs and outputs.
op->SetOutput("DDOutput", InputGrad(framework::GradVarName("Output")));
op->SetOutput("DFilter", InputGrad("Filter"));
op->SetOutput("DInput", InputGrad("Input"));
auto ddx = OutputGrad(framework::GradVarName("Input"));
auto ddw = OutputGrad(framework::GradVarName("Filter"));
std::vector<std::string> empty_str = {};
op->SetOutput(
"DDOutput",
ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output")));
op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter"));
op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
......@@ -547,13 +554,13 @@ void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const {
auto w_dims = ctx->GetInputDim("Filter");
auto do_dims = ctx->GetInputDim("DOutput");
if (ctx->HasOutput("DDOutput")) {
if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) {
ctx->SetOutputDim("DDOutput", do_dims);
}
if (ctx->HasOutput("DFilter")) {
if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) {
ctx->SetOutputDim("DFilter", w_dims);
}
if (ctx->HasOutput("DInput")) {
if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) {
ctx->SetOutputDim("DInput", x_dims);
}
}
......
......@@ -189,15 +189,15 @@ class MulDoubleGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null");
if (ctx->HasOutput("DX")) {
if (ctx->HasOutput("DDOut") && ctx->HasInput("DDX")) {
ctx->ShareDim("DOut", "DDOut");
}
if (ctx->HasOutput("DX") && ctx->HasInput("DDY")) {
ctx->ShareDim("X", "DX");
}
if (ctx->HasOutput("DY")) {
if (ctx->HasOutput("DY") && ctx->HasInput("DDX")) {
ctx->ShareDim("Y", "DY");
}
if (ctx->HasOutput("DDOut")) {
ctx->ShareDim("DOut", "DDOut");
}
}
};
......@@ -216,9 +216,15 @@ class MulDoubleGradMaker : public framework::SingleGradOpDescMaker {
retv->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
retv->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
retv->SetOutput("DX", InputGrad("X"));
retv->SetOutput("DY", InputGrad("Y"));
auto ddx = OutputGrad(framework::GradVarName("X"));
auto ddw = OutputGrad(framework::GradVarName("Y"));
std::vector<std::string> empty_str = {};
retv->SetOutput("DDOut", (ddx.empty())
? empty_str
: InputGrad(framework::GradVarName("Out")));
retv->SetOutput("DX", ddw.empty() ? empty_str : InputGrad("X"));
retv->SetOutput("DY", ddx.empty() ? empty_str : InputGrad("Y"));
retv->SetAttrMap(Attrs());
return retv;
......
......@@ -46,17 +46,7 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase {
std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
std::vector<std::unique_ptr<framework::OpDesc>> ops;
auto x_grads = InputGrad("X");
auto x_gg = OutputGrad(framework::GradVarName("X")); // input ddx
if (!x_grads.empty()) {
auto* x_grad_op = new framework::OpDesc();
x_grad_op->SetType("scale");
x_grad_op->SetInput("X", x_gg);
x_grad_op->SetOutput("Out", x_grads);
x_grad_op->SetAttr("scale", 0.0f);
ops.emplace_back(x_grad_op);
}
auto out_grads = InputGrad(framework::GradVarName("Out"));
if (!out_grads.empty()) {
auto* out_grad_op = new framework::OpDesc();
......
......@@ -111,7 +111,7 @@ class SumOp : public framework::OperatorWithKernel {
"Input var[%s] should not be nullptr", x_vars_name[idx]);
auto tensor =
framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]);
if (tensor->numel() == 0) {
if (tensor->numel() <= 0 || (!tensor->IsInitialized())) {
continue;
}
if (dtype == -1) {
......
......@@ -126,12 +126,20 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
auto length = in_0.numel();
if (length) {
if (length && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
} else if (length && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1);
}
return;
}
......
......@@ -39,6 +39,7 @@ from . import contrib
from . import nets
from . import optimizer
from . import backward
from .backward import gradients
from . import regularizer
from . import average
from . import metrics
......@@ -72,7 +73,7 @@ Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \
trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + \
data_feed_desc.__all__ + compiler.__all__ + [
data_feed_desc.__all__ + compiler.__all__ + backward.__all__ + [
'io',
'initializer',
'layers',
......
......@@ -22,7 +22,7 @@ import six
from .. import compat as cpt
from . import unique_name
__all__ = ['append_backward']
__all__ = ['append_backward', 'gradients']
def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
......@@ -142,6 +142,7 @@ def _addup_repetitive_outputs_(op_descs):
pending_sum_ops = []
var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list)
renamed_var_start_idx = collections.defaultdict(list)
for idx, op_desc in enumerate(op_descs):
for var_name in op_desc.input_arg_names():
if len(renamed_vars[var_name]) > 1:
......@@ -159,6 +160,7 @@ def _addup_repetitive_outputs_(op_descs):
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
renamed_var_start_idx[var_name] = idx
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
......@@ -166,7 +168,12 @@ def _addup_repetitive_outputs_(op_descs):
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
# before change: _rename_arg_(op_descs, var_name,
# new_name, 0, idx)
# rename arg from idx of the first appearance
# in backward, not always from 0
_rename_arg_(op_descs, var_name, new_name,
renamed_var_start_idx[var_name], idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
for p in op_desc.output_names()[:param_idx]:
......@@ -254,7 +261,8 @@ def _append_backward_ops_(block,
target_block,
no_grad_dict,
grad_to_var,
callbacks=None):
callbacks=None,
input_grad_names_set=None):
"""
Create all grad ops, and insert them into given block
......@@ -286,8 +294,13 @@ def _append_backward_ops_(block,
sub_block = program.block(op._block_attr_id("sub_block"))
grad_sub_block = program._create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx)
            # see the following comments for why input_grad_names_set is set to None here.
pre_input_grad_names_set = copy.copy(input_grad_names_set)
input_grad_names_set = None
_append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
no_grad_dict, grad_to_var, callbacks)
no_grad_dict, grad_to_var, callbacks,
input_grad_names_set)
input_grad_names_set = pre_input_grad_names_set
program._rollback()
grad_sub_block_list.append(grad_sub_block.desc)
......@@ -296,6 +309,31 @@ def _append_backward_ops_(block,
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
        # If input_grad_names_set is not None, extend grad_op_descs only when
        # at least one of this grad op's input grads appears in the outputs of
        # previous grad ops. This strategy does not suit control-flow ops such
        # as while_op, whose grads may be generated in the next loop iteration.
if input_grad_names_set is not None:
is_append_grad = False
for op_desc in grad_op_desc:
input_grad_names = [
name for name in op_desc.input_arg_names()
if name.find(core.grad_var_suffix()) != -1
]
                # Some grad ops, such as increment, are not standard and
                # have no @GRAD in their inputs.
if len(input_grad_names) == 0:
is_append_grad = True
break
if _some_in_set_(input_grad_names, input_grad_names_set):
grad_op_descs.append(op_desc)
is_append_grad = True
for name in op_desc.output_arg_names():
input_grad_names_set.add(name)
if is_append_grad:
grad_to_var.update(op_grad_to_var)
else:
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
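To make the filtering rule above concrete, here is a standalone sketch of the same idea on plain name lists (a hypothetical helper, not the OpDesc-based code in backward.py; the special case for grad ops without any @GRAD input is omitted):

GRAD_SUFFIX = '@GRAD'

def filter_grad_ops(grad_ops, seed_grad_names):
    # Each op is given as (input names, output names). Keep an op only if
    # at least one of its @GRAD inputs was produced by the seed set or by
    # an earlier kept op; its outputs then become known gradients.
    kept, known = [], set(seed_grad_names)
    for inputs, outputs in grad_ops:
        grad_inputs = [n for n in inputs if GRAD_SUFFIX in n]
        if any(n in known for n in grad_inputs):
            kept.append((inputs, outputs))
            known.update(outputs)
    return kept

# x@GRAD@GRAD is never produced by a previous grad op, so the second op is dropped.
ops = [(['loss@GRAD', 'x'], ['x@GRAD']),
       (['x@GRAD@GRAD'], ['w@GRAD'])]
print(filter_grad_ops(ops, {'loss@GRAD'}))  # -> [(['loss@GRAD', 'x'], ['x@GRAD'])]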
......@@ -481,6 +519,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
isinstance(callbacks, list)
program = loss.block.program
program._appending_grad_times += 1
if no_grad_set is None:
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
......@@ -511,10 +551,23 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
_append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
grad_to_var, callbacks)
input_grad_names_set = None
    # For double backward, input_grad_names_set is used to filter out
    # gradient ops whose grads are not actually used.
if program._appending_grad_times > 1:
input_grad_names_set = set([_append_grad_suffix_(loss.name)])
_append_backward_ops_(
root_block,
op_path,
root_block,
no_grad_dict,
grad_to_var,
callbacks,
input_grad_names_set=input_grad_names_set)
# Because calc_gradient may be called multiple times,
    # we need to rename the internal gradient variables so that they have
......@@ -618,17 +671,20 @@ def _find_op_path_(block, outputs, inputs, no_grad_set):
def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
"""
Backpropagate the graidents of targets to inputs.
Backpropagate the gradients of targets to inputs.
Args:
targets(Variable|list[Variable]): The target variables
inputs(Variable|list[Variable]): The input variables
        target_gradients (Variable|list[Variable]|None): The gradient variables
            of targets, which have the same shape as the targets. If None, ones
            will be created for them.
no_grad_set(set[string]): The names of variables that have no gradients
in Block 0. All variables with `stop_gradient=True` from all blocks
will be automatically added.
Return:
(list[Variable]): list of gradients for inputs
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
will be None
"""
......@@ -638,6 +694,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
block = targets[0].block
prog = block.program
    # count how many times gradients have been appended to this program
prog._appending_grad_times += 1
block_idx = block.idx
if not target_gradients:
......@@ -655,6 +713,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
fwd_op_num = block.desc.op_size()
input_grad_names_set = set()
target_grad_map = {}
for i, grad in enumerate(target_gradients):
target = targets[i]
......@@ -670,6 +730,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
'output_dim_idx': 0
})
block.desc.append_op().copy_from(op_desc)
input_grad_names_set.add(grad_name)
else:
if target.block.idx != block_idx or target.block.program != prog:
raise ValueError("all targets must be in the same block")
......@@ -678,6 +739,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
"The shapes of target and grad are different: %s %s" % (
target.name, grad.name))
target_grad_map[_append_grad_suffix_(target.name)] = grad.name
input_grad_names_set.add(grad.name)
    # For double backward, input_grad_names_set is used to filter out
    # gradient ops whose grads are not actually used.
if prog._appending_grad_times == 1:
input_grad_names_set = None
for input in inputs:
if input.block.program != prog:
......@@ -688,7 +755,13 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
grad_to_var = dict()
grad_info_map = dict()
_append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
_append_backward_ops_(
block,
op_path,
block,
no_grad_dict,
grad_to_var,
input_grad_names_set=input_grad_names_set)
# Because calc_gradient may be called multiple times,
    # we need to rename the internal gradient variables so that they have
......@@ -712,3 +785,40 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
return grad_vars[0]
else:
return grad_vars
def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
"""
Backpropagate the gradients of targets to inputs.
Args:
targets (Variable|list[Variable]): The target variables.
inputs (Variable|list[Variable]): The input variables.
        target_gradients (Variable|list[Variable]|None): The gradient variables
            of targets, which have the same shape as the targets. If None, ones
            will be created for them.
no_grad_set (set[string]): The names of variables that have no gradients
in Block 0. All variables with `stop_gradient=True` from all blocks
will be automatically added.
Return:
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
will be None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32')
x.stop_gradient=False
y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
z = fluid.gradients([y], x)
print(z)
"""
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs)
......@@ -2775,6 +2775,9 @@ class Program(object):
# assigned if this program has been parsed by a pipeline optimizer
self._pipeline_opt = None
        # number of times gradients have been appended to this program
self._appending_grad_times = 0
@property
def _is_mem_optimized(self):
# if the program is optimized, operator input/outputs
......@@ -3108,6 +3111,7 @@ class Program(object):
p._current_role = self._current_role
p.__op_role_var = self.__op_role_var
p._appending_grad_times = self._appending_grad_times
p._sync_with_cpp()
......
......@@ -23,7 +23,6 @@ from itertools import product
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.executor import Executor
from paddle.fluid.backward import calc_gradient
from paddle.fluid.backward import _append_grad_suffix_, _as_list
......@@ -183,7 +182,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope):
dy = program.global_block().create_var(
name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
# append backward
dx = calc_gradient(y, x, dy)
dx = fluid.gradients(y, x, dy)
# init dy tensor in scope
value = np.zeros(y.shape, dtype=np_type)
......@@ -382,7 +381,7 @@ def double_grad_check(x,
]
# append first order grads
target_grads = calc_gradient(y, x, y_grads)
target_grads = fluid.gradients(y, x, y_grads)
# y_grads are the input of first-order backward,
# so, they are also the input of second-order backward.
......