Unverified · Commit 24ec6ed0 · authored by YuanRisheng, committed by GitHub

Add some double/triple grad kernel YAML files (#42361)

* add double yaml

* add inline func
Parent 2bee99df
@@ -22,17 +22,12 @@ import os
 ### Global Variables ###
 ########################
 ops_to_fill_zero_for_empty_grads = set([
-    "split_grad",
-    "rnn_grad",
-    "matmul_double_grad",
-    "matmul_triple_grad",
-    "sigmoid_double_grad",
-    "sigmoid_triple_grad",
-    "add_double_grad",
-    "add_triple_grad",
-    "multiply_double_grad",
-    "multiply_triple_grad",
-    "conv2d_grad_grad",
+    "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad",
+    "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad",
+    "add_triple_grad", "multiply_double_grad", "multiply_triple_grad",
+    "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad",
+    "tanh_triple_grad", "subtract_double_grad", "divide_double_grad",
+    "log_double_grad", "elu_double_grad"
 ])

 # For API dispatch used at python-level
...
@@ -96,8 +96,7 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   // TODO(chenweihang): add global device guard method to set backend
-  void operator()(const Tensor& x) {
-    const phi::TensorBase& tensor = *x.impl();
+  inline void AssignKernelKeySet(const phi::TensorBase& tensor) {
     key_set.backend_set =
         key_set.backend_set | detail::GetTensorBackendSet(tensor);
     // TODO(chenweihang): select multi layout and dtype
@@ -110,6 +109,8 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
     }
   }

+  void operator()(const Tensor& x) { AssignKernelKeySet(*x.impl()); }
+
   void operator()(const std::vector<Tensor>& x) {
     const phi::TensorBase& tensor = *x.at(0).impl();
     key_set.backend_set =
@@ -119,6 +120,13 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
     key_set.dtype = tensor.dtype();
   }

+  void operator()(const paddle::optional<const Tensor&> x) {
+    if (x.get_ptr() != nullptr) {
+      const phi::TensorBase& tensor = *(x.get_ptr()->impl());
+      AssignKernelKeySet(tensor);
+    }
+  }
+
   // skip other type args, these args don't used in kernel selection
   template <typename T>
   void operator()(const T& x) {
...
@@ -82,18 +82,18 @@ void ReluDoubleGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void TanhDoubleGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
-                          const DenseTensor& ddx,
                           const DenseTensor& dout,
+                          const DenseTensor& ddx,
                           DenseTensor* dout_new,
                           DenseTensor* ddout);

 template <typename T, typename Context>
 void TanhTripleGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
-                          const DenseTensor& ddx,
                           const DenseTensor& dout,
-                          const DenseTensor& d_ddout,
+                          const DenseTensor& ddx,
                           const DenseTensor& d_dout_new,
+                          const DenseTensor& d_ddout,
                           DenseTensor* d_out_new,
                           DenseTensor* d_dout,
                           DenseTensor* d_ddx);
...
@@ -66,16 +66,16 @@ void BatchNormGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void BatchNormDoubleGradKernel(const Context& dev_ctx,
-                               const DenseTensor& x_grad_grad,
-                               const DenseTensor& scale_grad_grad,
-                               const DenseTensor& bias_grad_grad,
-                               const DenseTensor& y_grad,
                                const DenseTensor& x,
                                const DenseTensor& scale,
-                               const DenseTensor& saved_mean,
-                               const DenseTensor& saved_variance,
                                paddle::optional<const DenseTensor&> mean,
                                paddle::optional<const DenseTensor&> variance,
+                               const DenseTensor& saved_mean,
+                               const DenseTensor& saved_variance,
+                               const DenseTensor& y_grad,
+                               const DenseTensor& x_grad_grad,
+                               const DenseTensor& scale_grad_grad,
+                               const DenseTensor& bias_grad_grad,
                                float momentum,
                                float epsilon,
                                const std::string& data_layout,
...
@@ -341,16 +341,16 @@ void BatchNormGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void BatchNormDoubleGradKernel(const Context& ctx,
-                               const DenseTensor& x_grad_grad,
-                               const DenseTensor& scale_grad_grad,
-                               const DenseTensor& bias_grad_grad,
-                               const DenseTensor& y_grad,
                                const DenseTensor& x,
                                const DenseTensor& scale,
-                               const DenseTensor& saved_mean,
-                               const DenseTensor& saved_variance,
                                paddle::optional<const DenseTensor&> mean,
                                paddle::optional<const DenseTensor&> variance,
+                               const DenseTensor& saved_mean,
+                               const DenseTensor& saved_variance,
+                               const DenseTensor& y_grad,
+                               const DenseTensor& x_grad_grad,
+                               const DenseTensor& scale_grad_grad,
+                               const DenseTensor& bias_grad_grad,
                                float momentum,
                                float epsilon,
                                const std::string& data_layout_str,
...
@@ -38,9 +38,9 @@ void SubtractGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void SubtractDoubleGradKernel(const Context& dev_ctx,
                               const DenseTensor& y,
-                              const DenseTensor& dout,
                               paddle::optional<const DenseTensor&> ddx,
                               paddle::optional<const DenseTensor&> ddy,
+                              const DenseTensor& dout,
                               int axis,
                               DenseTensor* ddout) {
   phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
...
@@ -30,9 +30,9 @@ void SubtractGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void SubtractDoubleGradKernel(const Context& dev_ctx,
                               const DenseTensor& y,
-                              const DenseTensor& dout,
                               paddle::optional<const DenseTensor&> ddx,
                               paddle::optional<const DenseTensor&> ddy,
+                              const DenseTensor& dout,
                               int axis,
                               DenseTensor* ddout);
...
@@ -908,16 +908,16 @@ void BatchNormGradKernel(const Context &dev_ctx,
 template <typename T, typename Context>
 void BatchNormDoubleGradKernel(const Context &ctx,
-                               const DenseTensor &x_grad_grad,
-                               const DenseTensor &scale_grad_grad,
-                               const DenseTensor &bias_grad_grad,
-                               const DenseTensor &y_grad,
                                const DenseTensor &x,
                                const DenseTensor &scale,
-                               const DenseTensor &saved_mean,
-                               const DenseTensor &saved_variance,
                                paddle::optional<const DenseTensor &> mean,
                                paddle::optional<const DenseTensor &> variance,
+                               const DenseTensor &saved_mean,
+                               const DenseTensor &saved_variance,
+                               const DenseTensor &y_grad,
+                               const DenseTensor &x_grad_grad,
+                               const DenseTensor &scale_grad_grad,
+                               const DenseTensor &bias_grad_grad,
                                float momentum,
                                float epsilon,
                                const std::string &data_layout_str,
...
@@ -46,9 +46,9 @@ void SubtractGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void SubtractDoubleGradKernel(const Context& dev_ctx,
                               const DenseTensor& y,
-                              const DenseTensor& dout,
                               paddle::optional<const DenseTensor&> ddx,
                               paddle::optional<const DenseTensor&> ddy,
+                              const DenseTensor& dout,
                               int axis,
                               DenseTensor* ddout) {
   phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
...
@@ -152,8 +152,8 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void TanhDoubleGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
-                          const DenseTensor& ddx,
                           const DenseTensor& dout,
+                          const DenseTensor& ddx,
                           DenseTensor* dout_new,
                           DenseTensor* ddout) {
   if (dout_new) {
@@ -171,10 +171,10 @@ void TanhDoubleGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void TanhTripleGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
-                          const DenseTensor& ddx,
                           const DenseTensor& dout,
-                          const DenseTensor& d_ddout,
+                          const DenseTensor& ddx,
                           const DenseTensor& d_dout_new,
+                          const DenseTensor& d_ddout,
                           DenseTensor* d_out_new,
                           DenseTensor* d_dout,
                           DenseTensor* d_ddx) {
...
@@ -121,13 +121,13 @@ KernelSignature ReluDoubleGradOpArgumentMapping(
 KernelSignature TanhDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
-      "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+      "tanh_double_grad", {"Out", "DOut", "DDX"}, {}, {"DOutNew", "DDOut"});
 }

 KernelSignature TanhTripleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("tanh_triple_grad",
-                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {"Out", "DOut", "DDX", "D_DOut_New", "D_DDOut"},
                          {},
                          {"D_OutNew", "D_DOut", "D_DDx"});
 }
...
@@ -82,16 +82,16 @@ KernelSignature BatchNormGradOpArgumentMapping(
 KernelSignature BatchNormGradGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("batch_norm_grad_grad",
-                         {"DDX",
-                          "DDScale",
-                          "DDBias",
-                          "DY",
-                          "X",
+                         {"X",
                           "Scale",
+                          "Mean",
+                          "Variance",
                           "SavedMean",
                           "SavedVariance",
-                          "Mean",
-                          "Variance"},
+                          "DY",
+                          "DDX",
+                          "DDScale",
+                          "DDBias"},
                          {"momentum",
                           "epsilon",
                           "data_layout",
...
@@ -133,7 +133,7 @@ KernelSignature ElementwiseSubGradOpArgumentMapping(
 KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
-      "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
+      "subtract_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"});
 }

 KernelSignature ElementwiseDivGradOpArgumentMapping(
...
@@ -560,7 +560,10 @@ def get_static_double_grad(x,
     # so, they are also the input of second-order backward.
     x += y_grads
     x_init += dy_init
-    y = dx
+
+    # filter None in dx for DX/DY may be None in kernel
+    filted_dx = [dxi for dxi in dx if dxi is not None]
+    y = filted_dx

     # check input arguments
     x = _as_list(x)
@@ -619,6 +622,7 @@ def get_static_double_grad(x,
 def get_eager_double_grad(func,
                           x_init=None,
                           dy_init=None,
+                          place=None,
                           return_mid_result=False):
     """
     Get Double Grad result of dygraph.
@@ -627,6 +631,7 @@ def get_eager_double_grad(func,
         func: A wrapped dygraph function that its logic is equal to static program
         x_init (numpy.array|list[numpy.array]|None): the init value for input x.
         dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output.
+        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (bool): A flag that controls the return content.
     Returns:
         If 'return_mid_result' set True.
@@ -635,6 +640,10 @@ def get_eager_double_grad(func,
         If 'return_mid_result' set False.
         A list of numpy array that stores second derivative result calulated by dygraph.
     """
+    if isinstance(place, fluid.CPUPlace):
+        paddle.set_device("cpu")
+    if isinstance(place, fluid.CUDAPlace):
+        paddle.set_device("gpu")
     inputs = []
     dys = []
     for x in x_init:
@@ -648,7 +657,12 @@ def get_eager_double_grad(func,
     # calculate first derivative
     outputs = func(inputs)
     d_inputs = paddle.grad(
-        outputs=outputs, inputs=inputs, grad_outputs=dys, create_graph=True)
+        outputs=outputs,
+        inputs=inputs,
+        grad_outputs=dys,
+        create_graph=True,
+        allow_unused=True)
+    d_inputs = [d_input for d_input in d_inputs if d_input is not None]

     # calcluate second derivative
     inputs = inputs + dys
@@ -663,15 +677,20 @@ def get_eager_double_grad(func,
             ddy = paddle.ones(shape=d_input.shape, dtype=d_input.dtype)
             ddy.stop_gradient = False
             ddys.append(ddy)

     dd_inputs = paddle.grad(
         outputs=d_inputs,
         inputs=inputs,
         grad_outputs=ddys,
-        create_graph=create_graph)
+        create_graph=create_graph,
+        allow_unused=True)

     if return_mid_result:
         return dd_inputs, inputs + ddys
     else:
-        return [dd_input.numpy() for dd_input in dd_inputs]
+        return [
+            dd_input.numpy() for dd_input in dd_inputs if dd_input is not None
+        ]


 def double_grad_check_for_dygraph(func,
@@ -693,7 +712,6 @@ def double_grad_check_for_dygraph(func,
         y (Variable|list[Variable]): output variables to the program.
         x_init (numpy.array|list[numpy.array]|None): the init value for input x.
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        eps (float): perturbation for finite differences.
         atol (float): absolute tolerance.
         rtol (float): relative tolerance.
         raise_exception (bool): whether to raise an exception if
@@ -722,19 +740,25 @@ def double_grad_check_for_dygraph(func,
     paddle.disable_static()
     with _test_eager_guard():
-        eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init)
+        eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init,
+                                                  place)
     paddle.enable_static()

     static_double_grad = get_static_double_grad(x, y, x_init, y_grads_init,
                                                 place)

+    if len(static_double_grad) != len(eager_double_grad):
+        msg = "The output grad tensor's number of static graph is different with dygraph, " \
+            "please check the python api unit test used."
+        raise RuntimeError(msg)
+
     for i in six.moves.xrange(len(static_double_grad)):
         if not np.allclose(static_double_grad[i], eager_double_grad[i], rtol,
                            atol):
-            msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \
-                'and eager double grad %s on %s,\n' \
+            msg = 'Check eager double result fail. Mismatch between static_graph double grad ' \
+                'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \
                 'static:%s\n eager:%s\n' \
-                % (static_double_grad[i].name, eager_double_grad[i].name, str(place), static_double_grad[i], eager_double_grad[i])
+                % (str(place), i, static_double_grad[i], eager_double_grad[i])
             return fail_test(msg)
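
Reviewer note: the new `place` argument and the None filtering above are what the updated unit tests later in this diff rely on. Below is a minimal usage sketch, not part of this commit; the wrapper name, shape, and dtype are illustrative, and it assumes the `gradient_checker` module from `python/paddle/fluid/tests/unittests` is importable (e.g. when run from that directory).

```python
import numpy as np
import paddle
import paddle.fluid as fluid
import gradient_checker  # assumes the unittests directory is on sys.path


def tanh_wrapper(x):
    # the checker passes the inputs as a list; x[0] is the single input tensor
    return paddle.tanh(x[0])


def check_tanh_double_grad(place):
    paddle.enable_static()
    shape = [2, 3, 7, 9]
    x = fluid.layers.data('x', shape, False, dtype='float64')
    x.persistable = True
    y = paddle.tanh(x)
    x_arr = np.random.uniform(-1, 1, shape).astype('float64')
    # compares the static-graph double grad with the eager double grad on `place`
    gradient_checker.double_grad_check_for_dygraph(
        tanh_wrapper, [x], y, x_init=x_arr, place=place)


check_tanh_double_grad(fluid.CPUPlace())
```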
@@ -794,6 +818,7 @@ def get_static_triple_grad(x,
 def get_eager_triple_grad(func,
                           x_init=None,
                           dy_init=None,
+                          place=None,
                           return_mid_result=False):
     """
     Get triple Grad result of dygraph.
@@ -802,12 +827,13 @@ def get_eager_triple_grad(func,
         func: A wrapped dygraph function that its logic is equal to static program
         x_init (numpy.array|list[numpy.array]|None): the init value for input x.
         dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output.
+        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
         A list of numpy array that stores second derivative result calulated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
-        func, x_init, dy_init, return_mid_result=True)
+        func, x_init, dy_init, place, return_mid_result=True)

     # calcluate third derivative
     dddys = []
@@ -839,7 +865,6 @@ def triple_grad_check_for_dygraph(func,
         y (Variable|list[Variable]): output variables to the program.
         x_init (numpy.array|list[numpy.array]|None): the init value for input x.
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        eps (float): perturbation for finite differences.
         atol (float): absolute tolerance.
         rtol (float): relative tolerance.
         raise_exception (bool): whether to raise an exception if
@@ -868,17 +893,23 @@ def triple_grad_check_for_dygraph(func,
     paddle.disable_static()
     with _test_eager_guard():
-        eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init)
+        eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init,
+                                                  place)
     paddle.enable_static()

     static_triple_grad = get_static_triple_grad(x, y, x_init, y_grads_init,
                                                 place)

+    if len(static_triple_grad) != len(eager_triple_grad):
+        msg = "The output grad tensor's number of static graph is different with dygraph, " \
+            "please check the python api unit test used."
+        raise RuntimeError(msg)
+
     for i in six.moves.xrange(len(static_triple_grad)):
         if not np.allclose(static_triple_grad[i], eager_triple_grad[i], rtol,
                            atol):
-            msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \
-                'and eager double grad %s on %s,\n' \
+            msg = 'Check eager double result fail. Mismatch between static_graph double grad ' \
+                'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \
                 'static:%s\n eager:%s\n' \
-                % (static_triple_grad[i].name, eager_triple_grad[i].name, str(place), static_triple_grad[i], eager_triple_grad[i])
+                % (str(place), i, static_triple_grad[i], eager_triple_grad[i])
             return fail_test(msg)
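
For reference, the eager double/triple-grad helpers above boil down to chained `paddle.grad` calls with `create_graph=True`. The standalone sketch below is illustrative only (values and variable names are not taken from this diff); it simply shows the chaining pattern the checker implements.

```python
import paddle

paddle.disable_static()
x = paddle.to_tensor([0.5, 1.0], dtype='float64', stop_gradient=False)
y = paddle.tanh(x)

# first derivative, kept differentiable so it can be differentiated again
dy = paddle.ones_like(y)
dy.stop_gradient = False
(dx,) = paddle.grad([y], [x], grad_outputs=[dy], create_graph=True)

# second derivative, still differentiable
ddy = paddle.ones_like(dx)
ddy.stop_gradient = False
(ddx,) = paddle.grad([dx], [x], grad_outputs=[ddy], create_graph=True)

# third derivative of tanh w.r.t. x; in eager final-state mode this is
# where the tanh_triple_grad kernel registered by this commit is expected to run
(dddx,) = paddle.grad([ddx], [x], grad_outputs=[paddle.ones_like(ddx)])
print(dddx.numpy())
```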
@@ -52,6 +52,9 @@ class TestSigmoidTripleGradCheck(unittest.TestCase):

 class TestSigmoidDoubleGradCheck(unittest.TestCase):
+    def sigmoid_wrapper(self, x):
+        return fluid.layers.sigmoid(x[0])
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -64,6 +67,8 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.sigmoid_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -75,6 +80,9 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase):

 class TestTanhTripleGradCheck(unittest.TestCase):
+    def tanh_wrapper(self, x):
+        return paddle.tanh(x[0])
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -87,6 +95,8 @@ class TestTanhTripleGradCheck(unittest.TestCase):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.triple_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.triple_grad_check_for_dygraph(
+            self.tanh_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -98,6 +108,9 @@ class TestTanhTripleGradCheck(unittest.TestCase):

 class TestTanhDoubleGradCheck(unittest.TestCase):
+    def tanh_wrapper(self, x):
+        return paddle.tanh(x[0])
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -110,6 +123,8 @@ class TestTanhDoubleGradCheck(unittest.TestCase):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.tanh_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -173,6 +188,9 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):

 class TestELUDoubleGradCheck(unittest.TestCase):
+    def elu_wrapper(self, x):
+        return paddle.nn.functional.elu(x[0], alpha=0.2)
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 4, 4]
@@ -189,6 +207,8 @@ class TestELUDoubleGradCheck(unittest.TestCase):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.elu_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
...
@@ -139,6 +139,9 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase):

 class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
+    def subtract_wrapper(self, x):
+        return paddle.subtract(x[0], x[1])
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -156,6 +159,11 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
         gradient_checker.double_grad_check(
             [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.subtract_wrapper, [x, y],
+            out,
+            x_init=[x_arr, y_arr],
+            place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -195,6 +203,9 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase):

 class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
+    def divide_wrapper(self, x):
+        return paddle.divide(x[0], x[1])
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -213,6 +224,12 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
         gradient_checker.double_grad_check(
             [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.divide_wrapper, [x, y],
+            out,
+            x_init=[x_arr, y_arr],
+            place=place,
+            atol=1e-3)

     def test_grad(self):
         paddle.enable_static()
...
@@ -112,7 +112,10 @@ def elu(x, alpha=1.0, name=None):
             # [ 1. 15.6 ]]
     """

-    if in_dynamic_mode():
+    if in_dygraph_mode():
+        return _C_ops.final_state_elu(x, alpha)
+
+    if _in_legacy_dygraph():
         return _C_ops.elu(x, 'alpha', alpha)

     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
...
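
With the dispatch above, eager mode routes `elu` through the final-state op, and the new `elu_double_grad` entry in backward.yaml makes its first-order grad differentiable. A small hedged sketch follows (illustrative, not taken from this diff; values are arbitrary):

```python
import paddle
import paddle.nn.functional as F

paddle.disable_static()
x = paddle.to_tensor([-1.0, 0.5, 2.0], dtype='float64', stop_gradient=False)
y = F.elu(x, alpha=0.2)

# first-order grad, kept differentiable so a second grad can be taken
(dx,) = paddle.grad([y], [x], create_graph=True)

# second-order grad; this should exercise the elu double-grad support added here
(ddx,) = paddle.grad([dx], [x])
print(ddx.numpy())
```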
@@ -466,6 +466,7 @@
     func : DeformableConvInferMeta
   kernel :
     func : deformable_conv
+    data_type : x
   optional : mask
   backward : deformable_conv_grad
@@ -546,6 +547,7 @@
     func : DropoutInferMeta
   kernel :
     func : dropout
+    data_type : x
   optional : seed_tensor
   backward : dropout_grad
@@ -1065,6 +1067,7 @@
     func : LayerNormInferMeta
   kernel :
     func : layer_norm
+    data_type : x
   backward : layer_norm_grad
   optional : scale, bias
@@ -1608,6 +1611,7 @@
     func : PsroiPoolInferMeta
   kernel :
     func : psroi_pool
+    data_type : x
   optional : boxes_num
   backward : psroi_pool_grad
@@ -1713,6 +1717,7 @@
     func : RoiAlignInferMeta
   kernel :
     func : roi_align
+    data_type : x
   optional : boxes_num
   backward : roi_align_grad
@@ -1723,6 +1728,7 @@
     func : RoiPoolInferMeta
   kernel :
     func : roi_pool
+    data_type : x
   optional : boxes_num
   intermediate : arg_max
   backward : roi_pool_grad
...
@@ -152,6 +152,18 @@
   kernel :
     func : atanh_grad

+- backward_api : batch_norm_double_grad
+  forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias)
+  args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
+  output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [x, scale, x]
+  kernel :
+    func : batch_norm_grad_grad
+    data_type : x
+  optional : out_mean, out_variance
+
 - backward_api : batch_norm_grad
   forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space)
   args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
@@ -163,6 +175,7 @@
     func : batch_norm_grad
     data_type : out_grad
   optional : mean_out, variance_out, reserve_space
+  backward : batch_norm_double_grad

 - backward_api : bce_loss_grad
   forward : bce_loss (Tensor input, Tensor label) -> Tensor(out)
@@ -362,6 +375,7 @@
     func : DeformableConvGradInferMeta
   kernel :
     func : deformable_conv_grad
+    data_type : x
   optional : mask

 - backward_api : depthwise_conv2d_transpose_grad
@@ -414,6 +428,18 @@
   kernel :
     func : dist_grad

+- backward_api : divide_double_grad
+  forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y)
+  args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1)
+  output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [y, grad_x, grad_x]
+  kernel :
+    func : divide_double_grad
+    data_type : out
+  optional : grad_x_grad, grad_y_grad
+
 - backward_api : divide_grad
   forward : divide (Tensor x, Tensor y) -> Tensor(out)
   args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, int axis = -1)
@@ -423,6 +449,7 @@
     param : [x, y]
   kernel :
     func : divide_grad
+  backward : divide_double_grad

 - backward_api : dropout_grad
   forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask)
@@ -455,6 +482,16 @@
   kernel :
     func : elementwise_pow_grad

+- backward_api : elu_double_grad
+  forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x)
+  args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha)
+  output : Tensor(x_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, x]
+  kernel :
+    func : elu_double_grad
+
 - backward_api : elu_grad
   forward : elu (Tensor x, float alpha) -> Tensor(out)
   args : (Tensor x, Tensor out, Tensor out_grad, float alpha)
@@ -464,6 +501,7 @@
     param : [x]
   kernel :
     func : elu_grad
+  backward : elu_double_grad

 - backward_api : erf_grad
   forward : erf (Tensor x) -> Tensor(out)
@@ -633,6 +671,7 @@
     param : [x]
   kernel :
     func : graph_send_recv_grad
+    data_type : out_grad
   optional: out, dst_count

 - backward_api : gumbel_softmax_grad
@@ -1287,6 +1326,7 @@
     param : [x]
   kernel :
     func : psroi_pool_grad
+    data_type : x
   optional : boxes_num

 # output is optional
@@ -1381,6 +1421,7 @@
     param : [x]
   kernel :
     func : roi_align_grad
+    data_type : boxes
   optional : boxes_num

 - backward_api : roi_pool_grad
@@ -1392,6 +1433,7 @@
     param : [x]
   kernel :
     func : roi_pool_grad
+    data_type : x
   optional : boxes_num

 - backward_api : roll_grad
@@ -1498,7 +1540,7 @@
     func : UnchangedInferMeta
     param : [x]
   kernel :
-    func : sigmoid_cross_entropy_with_logits_grad
+    func : sigmoid_cross_entropy_with_logits_grad

 - backward_api : sigmoid_double_grad
   forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x)
@@ -1654,6 +1696,18 @@
     func : strided_slice_grad
   no_need_buffer : x

+- backward_api : subtract_double_grad
+  forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y)
+  args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1)
+  output : Tensor(grad_out_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [grad_out]
+  kernel :
+    func : subtract_double_grad
+  optional : grad_x_grad, grad_y_grad
+  no_need_buffer : y, grad_out
+
 - backward_api : subtract_grad
   forward : subtract (Tensor x, Tensor y) -> Tensor(out)
   args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
@@ -1664,6 +1718,7 @@
   kernel :
     func : subtract_grad
   no_need_buffer : x, y
+  backward : subtract_double_grad

 - backward_api : sum_double_grad
   forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x)
@@ -1720,6 +1775,17 @@
   kernel :
     func : tan_grad

+- backward_api : tanh_double_grad
+  forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor out, Tensor grad_out, Tensor grad_x_grad)
+  output : Tensor(out_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [out, out]
+  kernel :
+    func : tanh_double_grad
+  backward : tanh_triple_grad
+
 - backward_api : tanh_grad
   forward : tanh (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
@@ -1729,6 +1795,7 @@
     param : [out]
   kernel :
     func : tanh_grad
+  backward : tanh_double_grad

 - backward_api : tanh_shrink_grad
   forward : tanh_shrink (Tensor x) -> Tensor(out)
@@ -1740,6 +1807,16 @@
   kernel :
     func : tanh_shrink_grad

+- backward_api : tanh_triple_grad
+  forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad)
+  args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad)
+  output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [out, out, grad_x_grad_forward]
+  kernel :
+    func : tanh_triple_grad
+
 - backward_api : thresholded_relu_grad
   forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, float threshold)
...