From 24ec6ed093f24344622066d9a393992f8c3793df Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 29 Apr 2022 09:48:59 +0800 Subject: [PATCH] Add some double/triple grad kernel yaml file (#42361) * add double yaml * add inline func --- .../final_state_generator/codegen_utils.py | 17 ++-- paddle/phi/api/lib/kernel_dispatch.h | 12 ++- paddle/phi/kernels/activation_grad_kernel.h | 6 +- paddle/phi/kernels/batch_norm_grad_kernel.h | 12 +-- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 12 +-- .../cpu/elementwise_subtract_grad_kernel.cc | 2 +- .../elementwise_subtract_grad_kernel.h | 2 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 12 +-- .../gpu/elementwise_subtract_grad_kernel.cu | 2 +- .../phi/kernels/impl/activation_grad_impl.h | 6 +- paddle/phi/ops/compat/activation_sig.cc | 4 +- paddle/phi/ops/compat/batch_norm_sig.cc | 14 ++-- paddle/phi/ops/compat/elementwise_sig.cc | 2 +- .../fluid/tests/unittests/gradient_checker.py | 61 ++++++++++---- .../unittests/test_activation_nn_grad.py | 20 +++++ .../unittests/test_elementwise_nn_grad.py | 17 ++++ python/paddle/nn/functional/activation.py | 5 +- python/paddle/utils/code_gen/api.yaml | 6 ++ python/paddle/utils/code_gen/backward.yaml | 79 ++++++++++++++++++- 19 files changed, 224 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 7769c5371b..61ed1deb27 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -22,17 +22,12 @@ import os ### Global Variables ### ######################## ops_to_fill_zero_for_empty_grads = set([ - "split_grad", - "rnn_grad", - "matmul_double_grad", - "matmul_triple_grad", - "sigmoid_double_grad", - "sigmoid_triple_grad", - "add_double_grad", - "add_triple_grad", - "multiply_double_grad", - "multiply_triple_grad", - "conv2d_grad_grad", + "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", + "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", + "add_triple_grad", "multiply_double_grad", "multiply_triple_grad", + "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", + "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", + "log_double_grad", "elu_double_grad" ]) # For API dispatch used at python-level diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index be545ac9ce..9f2ad6c62c 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -96,8 +96,7 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend - void operator()(const Tensor& x) { - const phi::TensorBase& tensor = *x.impl(); + inline void AssignKernelKeySet(const phi::TensorBase& tensor) { key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(tensor); // TODO(chenweihang): select multi layout and dtype @@ -110,6 +109,8 @@ struct KernelKeyParser : ArgsIterator { } } + void operator()(const Tensor& x) { AssignKernelKeySet(*x.impl()); } + void operator()(const std::vector& x) { const phi::TensorBase& tensor = *x.at(0).impl(); key_set.backend_set = @@ -119,6 +120,13 @@ struct KernelKeyParser : ArgsIterator { key_set.dtype = tensor.dtype(); } + void operator()(const paddle::optional x) { + if (x.get_ptr() != nullptr) { + const 
phi::TensorBase& tensor = *(x.get_ptr()->impl()); + AssignKernelKeySet(tensor); + } + } + // skip other type args, these args don't used in kernel selection template void operator()(const T& x) { diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 065d018852..fd42756ba3 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -82,18 +82,18 @@ void ReluDoubleGradKernel(const Context& dev_ctx, template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout); template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 73752f015c..2cb3b16a02 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -66,16 +66,16 @@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index ae87886b89..bf01c24f4f 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -341,16 +341,16 @@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout_str, diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc index c785eacb9a..b86ead04db 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -38,9 +38,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { 
phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h index 7be91b4b9f..97df769f4d 100644 --- a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -30,9 +30,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index e15b4cc10d..35d36c3287 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -908,16 +908,16 @@ void BatchNormGradKernel(const Context &dev_ctx, template void BatchNormDoubleGradKernel(const Context &ctx, - const DenseTensor &x_grad_grad, - const DenseTensor &scale_grad_grad, - const DenseTensor &bias_grad_grad, - const DenseTensor &y_grad, const DenseTensor &x, const DenseTensor &scale, - const DenseTensor &saved_mean, - const DenseTensor &saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &y_grad, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, float momentum, float epsilon, const std::string &data_layout_str, diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu index 20f3b73e40..017616df27 100644 --- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -46,9 +46,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index bf9b7cdf55..2f35acc095 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -152,8 +152,8 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx, template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout) { if (dout_new) { @@ -171,10 +171,10 @@ void TanhDoubleGradKernel(const Context& dev_ctx, template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 5900b49946..157eaa279d 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -121,13 +121,13 @@ KernelSignature ReluDoubleGradOpArgumentMapping( KernelSignature TanhDoubleGradOpArgumentMapping( const 
ArgumentMappingContext& ctx) { return KernelSignature( - "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); + "tanh_double_grad", {"Out", "DOut", "DDX"}, {}, {"DOutNew", "DDOut"}); } KernelSignature TanhTripleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("tanh_triple_grad", - {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {"Out", "DOut", "DDX", "D_DOut_New", "D_DDOut"}, {}, {"D_OutNew", "D_DOut", "D_DDx"}); } diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index 14affe60b9..1c6b63d70c 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -82,16 +82,16 @@ KernelSignature BatchNormGradOpArgumentMapping( KernelSignature BatchNormGradGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("batch_norm_grad_grad", - {"DDX", - "DDScale", - "DDBias", - "DY", - "X", + {"X", "Scale", + "Mean", + "Variance", "SavedMean", "SavedVariance", - "Mean", - "Variance"}, + "DY", + "DDX", + "DDScale", + "DDBias"}, {"momentum", "epsilon", "data_layout", diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 19110eb0e0..13a5a6fd4a 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -133,7 +133,7 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); + "subtract_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"}); } KernelSignature ElementwiseDivGradOpArgumentMapping( diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 32a7e442ea..defbffe8f2 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -560,7 +560,10 @@ def get_static_double_grad(x, # so, they are also the input of second-order backward. x += y_grads x_init += dy_init - y = dx + + # filter None in dx for DX/DY may be None in kernel + filted_dx = [dxi for dxi in dx if dxi is not None] + y = filted_dx # check input arguments x = _as_list(x) @@ -619,6 +622,7 @@ def get_static_double_grad(x, def get_eager_double_grad(func, x_init=None, dy_init=None, + place=None, return_mid_result=False): """ Get Double Grad result of dygraph. @@ -627,6 +631,7 @@ def get_eager_double_grad(func, func: A wrapped dygraph function that its logic is equal to static program x_init (numpy.array|list[numpy.array]|None): the init value for input x. dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. return_mid_result (bool): A flag that controls the return content. Returns: If 'return_mid_result' set True. @@ -635,6 +640,10 @@ def get_eager_double_grad(func, If 'return_mid_result' set False. A list of numpy array that stores second derivative result calulated by dygraph. 
""" + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") inputs = [] dys = [] for x in x_init: @@ -648,7 +657,12 @@ def get_eager_double_grad(func, # calculate first derivative outputs = func(inputs) d_inputs = paddle.grad( - outputs=outputs, inputs=inputs, grad_outputs=dys, create_graph=True) + outputs=outputs, + inputs=inputs, + grad_outputs=dys, + create_graph=True, + allow_unused=True) + d_inputs = [d_input for d_input in d_inputs if d_input is not None] # calcluate second derivative inputs = inputs + dys @@ -663,15 +677,20 @@ def get_eager_double_grad(func, ddy = paddle.ones(shape=d_input.shape, dtype=d_input.dtype) ddy.stop_gradient = False ddys.append(ddy) + dd_inputs = paddle.grad( outputs=d_inputs, inputs=inputs, grad_outputs=ddys, - create_graph=create_graph) + create_graph=create_graph, + allow_unused=True) + if return_mid_result: return dd_inputs, inputs + ddys else: - return [dd_input.numpy() for dd_input in dd_inputs] + return [ + dd_input.numpy() for dd_input in dd_inputs if dd_input is not None + ] def double_grad_check_for_dygraph(func, @@ -693,7 +712,6 @@ def double_grad_check_for_dygraph(func, y (Variable|list[Variable]): output variables to the program. x_init (numpy.array|list[numpy.array]|None): the init value for input x. place (fluid.CPUPlace or fluid.CUDAPlace): the device. - eps (float): perturbation for finite differences. atol (float): absolute tolerance. rtol (float): relative tolerance. raise_exception (bool): whether to raise an exception if @@ -722,19 +740,25 @@ def double_grad_check_for_dygraph(func, paddle.disable_static() with _test_eager_guard(): - eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init) + eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init, + place) paddle.enable_static() static_double_grad = get_static_double_grad(x, y, x_init, y_grads_init, place) + if len(static_double_grad) != len(eager_double_grad): + msg = "The output grad tensor's number of static graph is different with dygraph, " \ + "please check the python api unit test used." + raise RuntimeError(msg) + for i in six.moves.xrange(len(static_double_grad)): if not np.allclose(static_double_grad[i], eager_double_grad[i], rtol, atol): - msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \ - 'and eager double grad %s on %s,\n' \ + msg = 'Check eager double result fail. Mismatch between static_graph double grad ' \ + 'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \ 'static:%s\n eager:%s\n' \ - % (static_double_grad[i].name, eager_double_grad[i].name, str(place), static_double_grad[i], eager_double_grad[i]) + % (str(place), i, static_double_grad[i], eager_double_grad[i]) return fail_test(msg) @@ -794,6 +818,7 @@ def get_static_triple_grad(x, def get_eager_triple_grad(func, x_init=None, dy_init=None, + place=None, return_mid_result=False): """ Get triple Grad result of dygraph. @@ -802,12 +827,13 @@ def get_eager_triple_grad(func, func: A wrapped dygraph function that its logic is equal to static program x_init (numpy.array|list[numpy.array]|None): the init value for input x. dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. 
return_mid_result (list[Tensor], list[Tensor]): If set True, the Returns: A list of numpy array that stores second derivative result calulated by dygraph """ dd_y, dd_x = get_eager_double_grad( - func, x_init, dy_init, return_mid_result=True) + func, x_init, dy_init, place, return_mid_result=True) # calcluate third derivative dddys = [] @@ -839,7 +865,6 @@ def triple_grad_check_for_dygraph(func, y (Variable|list[Variable]): output variables to the program. x_init (numpy.array|list[numpy.array]|None): the init value for input x. place (fluid.CPUPlace or fluid.CUDAPlace): the device. - eps (float): perturbation for finite differences. atol (float): absolute tolerance. rtol (float): relative tolerance. raise_exception (bool): whether to raise an exception if @@ -868,17 +893,23 @@ def triple_grad_check_for_dygraph(func, paddle.disable_static() with _test_eager_guard(): - eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init) + eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init, + place) paddle.enable_static() static_triple_grad = get_static_triple_grad(x, y, x_init, y_grads_init, place) + if len(static_triple_grad) != len(eager_triple_grad): + msg = "The output grad tensor's number of static graph is different with dygraph, " \ + "please check the python api unit test used." + raise RuntimeError(msg) + for i in six.moves.xrange(len(static_triple_grad)): if not np.allclose(static_triple_grad[i], eager_triple_grad[i], rtol, atol): - msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \ - 'and eager double grad %s on %s,\n' \ + msg = 'Check eager double result fail. Mismatch between static_graph double grad ' \ + 'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \ 'static:%s\n eager:%s\n' \ - % (static_triple_grad[i].name, eager_triple_grad[i].name, str(place), static_triple_grad[i], eager_triple_grad[i]) + % (str(place), i, static_triple_grad[i], eager_triple_grad[i]) return fail_test(msg) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 72240be41d..9fcb386418 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -52,6 +52,9 @@ class TestSigmoidTripleGradCheck(unittest.TestCase): class TestSigmoidDoubleGradCheck(unittest.TestCase): + def sigmoid_wrapper(self, x): + return fluid.layers.sigmoid(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -64,6 +67,8 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sigmoid_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -75,6 +80,9 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase): class TestTanhTripleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -87,6 +95,8 @@ class TestTanhTripleGradCheck(unittest.TestCase): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.triple_grad_check_for_dygraph( + self.tanh_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -98,6 +108,9 @@ class 
TestTanhTripleGradCheck(unittest.TestCase): class TestTanhDoubleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -110,6 +123,8 @@ class TestTanhDoubleGradCheck(unittest.TestCase): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tanh_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -173,6 +188,9 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase): class TestELUDoubleGradCheck(unittest.TestCase): + def elu_wrapper(self, x): + return paddle.nn.functional.elu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -189,6 +207,8 @@ class TestELUDoubleGradCheck(unittest.TestCase): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.elu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 8f6f9851c7..ccfed61185 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -139,6 +139,9 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseSubDoubleGradCheck(unittest.TestCase): + def subtract_wrapper(self, x): + return paddle.subtract(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -156,6 +159,11 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.subtract_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place) def test_grad(self): paddle.enable_static() @@ -195,6 +203,9 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseDivDoubleGradCheck(unittest.TestCase): + def divide_wrapper(self, x): + return paddle.divide(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -213,6 +224,12 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3) + gradient_checker.double_grad_check_for_dygraph( + self.divide_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place, + atol=1e-3) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 34acbfbf75..e64efda7b3 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -112,7 +112,10 @@ def elu(x, alpha=1.0, name=None): # [ 1. 
15.6 ]] """ - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_elu(x, alpha) + + if _in_legacy_dygraph(): return _C_ops.elu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index d401e7c519..35976b6f87 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -466,6 +466,7 @@ func : DeformableConvInferMeta kernel : func : deformable_conv + data_type : x optional : mask backward : deformable_conv_grad @@ -546,6 +547,7 @@ func : DropoutInferMeta kernel : func : dropout + data_type : x optional : seed_tensor backward : dropout_grad @@ -1065,6 +1067,7 @@ func : LayerNormInferMeta kernel : func : layer_norm + data_type : x backward : layer_norm_grad optional : scale, bias @@ -1608,6 +1611,7 @@ func : PsroiPoolInferMeta kernel : func : psroi_pool + data_type : x optional : boxes_num backward : psroi_pool_grad @@ -1713,6 +1717,7 @@ func : RoiAlignInferMeta kernel : func : roi_align + data_type : x optional : boxes_num backward : roi_align_grad @@ -1723,6 +1728,7 @@ func : RoiPoolInferMeta kernel : func : roi_pool + data_type : x optional : boxes_num intermediate : arg_max backward : roi_pool_grad diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3b47470139..c875162dcd 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -152,6 +152,18 @@ kernel : func : atanh_grad +- backward_api : batch_norm_double_grad + forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) + args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, x] + kernel : + func : batch_norm_grad_grad + data_type : x + optional : out_mean, out_variance + - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) @@ -163,6 +175,7 @@ func : batch_norm_grad data_type : out_grad optional : mean_out, variance_out, reserve_space + backward : batch_norm_double_grad - backward_api : bce_loss_grad forward : bce_loss (Tensor input, Tensor label) -> Tensor(out) @@ -362,6 +375,7 @@ func : DeformableConvGradInferMeta 
kernel : func : deformable_conv_grad + data_type : x optional : mask - backward_api : depthwise_conv2d_transpose_grad @@ -414,6 +428,18 @@ kernel : func : dist_grad +- backward_api : divide_double_grad + forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [y, grad_x, grad_x] + kernel : + func : divide_double_grad + data_type : out + optional : grad_x_grad, grad_y_grad + - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, int axis = -1) @@ -423,6 +449,7 @@ param : [x, y] kernel : func : divide_grad + backward : divide_double_grad - backward_api : dropout_grad forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) @@ -455,6 +482,16 @@ kernel : func : elementwise_pow_grad +- backward_api : elu_double_grad + forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : elu_double_grad + - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float alpha) @@ -464,6 +501,7 @@ param : [x] kernel : func : elu_grad + backward : elu_double_grad - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -633,6 +671,7 @@ param : [x] kernel : func : graph_send_recv_grad + data_type : out_grad optional: out, dst_count - backward_api : gumbel_softmax_grad @@ -1287,6 +1326,7 @@ param : [x] kernel : func : psroi_pool_grad + data_type : x optional : boxes_num # output is optional @@ -1381,6 +1421,7 @@ param : [x] kernel : func : roi_align_grad + data_type : boxes optional : boxes_num - backward_api : roi_pool_grad @@ -1392,6 +1433,7 @@ param : [x] kernel : func : roi_pool_grad + data_type : x optional : boxes_num - backward_api : roll_grad @@ -1498,7 +1540,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : sigmoid_cross_entropy_with_logits_grad + func : sigmoid_cross_entropy_with_logits_grad - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1654,6 +1696,18 @@ func : strided_slice_grad no_need_buffer : x +- backward_api : subtract_double_grad + forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : subtract_double_grad + optional : grad_x_grad, grad_y_grad + no_need_buffer : y, grad_out + - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -1664,6 +1718,7 @@ kernel : func : subtract_grad no_need_buffer : x, y + backward : subtract_double_grad - backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -1720,6 +1775,17 
@@ kernel : func : tan_grad +- backward_api : tanh_double_grad + forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : tanh_double_grad + backward : tanh_triple_grad + - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1729,6 +1795,7 @@ param : [out] kernel : func : tanh_grad + backward : tanh_double_grad - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1740,6 +1807,16 @@ kernel : func : tanh_shrink_grad +- backward_api : tanh_triple_grad + forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) + args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) + output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [out, out, grad_x_grad_forward] + kernel : + func : tanh_triple_grad + - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) -- GitLab
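The backward.yaml entries added above (tanh_double_grad, tanh_triple_grad, subtract_double_grad, divide_double_grad, batch_norm_double_grad, elu_double_grad) are what let the eager (final-state) dygraph autograd reach the existing double/triple grad kernels, and the new get_eager_double_grad / get_eager_triple_grad helpers in gradient_checker.py drive them through the public paddle.grad API. Below is a minimal, self-contained sketch of that computation for tanh, using only public APIs (paddle.grad with create_graph=True); it assumes a Paddle build that includes this patch, and the claim that the calls route through the tanh_double_grad / tanh_triple_grad kernels registered above is an inference from the yaml wiring, not something stated in the patch itself.

import numpy as np
import paddle

paddle.disable_static()

x = paddle.to_tensor(
    np.random.uniform(-1.0, 1.0, [2, 3]), dtype='float64', stop_gradient=False)
y = paddle.tanh(x)

# First derivative, kept on the graph so it can be differentiated again.
(dx,) = paddle.grad(
    [y], [x], grad_outputs=[paddle.ones_like(y)], create_graph=True)

# Second derivative (double grad), analogous to get_eager_double_grad.
(ddx,) = paddle.grad(
    [dx], [x], grad_outputs=[paddle.ones_like(dx)], create_graph=True)

# Third derivative (triple grad), analogous to get_eager_triple_grad.
(dddx,) = paddle.grad([ddx], [x], grad_outputs=[paddle.ones_like(ddx)])

# Analytic references for elementwise tanh:
#   tanh''(x)  = -2*y*(1 - y**2)
#   tanh'''(x) = (6*y**2 - 2)*(1 - y**2)
y_np = np.tanh(x.numpy())
assert np.allclose(ddx.numpy(), -2.0 * y_np * (1.0 - y_np**2))
assert np.allclose(dddx.numpy(), (6.0 * y_np**2 - 2.0) * (1.0 - y_np**2))

This is the same first-derivative-with-create_graph, then differentiate-again pattern that double_grad_check_for_dygraph and triple_grad_check_for_dygraph compare against the static-graph results in the unit tests touched by this patch, with the place argument selecting whether the CPU or GPU kernel registrations above are exercised.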