Unverified commit 0211a833
Authored on May 23, 2022 by YuanRisheng; committed via GitHub on May 23, 2022.
Add double grad yaml for celu/sqrt/rsqrt/square op (#42895)
* add double grad yaml
* fix bugs when compile infrt
Parent commit: e5ebd347
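For context, "double grad" is a second-order derivative: the gradient of a gradient. A minimal sketch of what this commit enables in eager mode (an illustration only, not part of the commit; it assumes a build where the new double-grad kernels are registered):

```python
import paddle

x = paddle.rand([2, 3]) + 1.0  # keep inputs positive so sqrt is smooth
x.stop_gradient = False

y = paddle.sqrt(x)
# First-order gradient; create_graph=True keeps the autograd graph alive so
# the gradient itself can be differentiated again.
dx = paddle.grad(y, x, create_graph=True)[0]
# Second-order gradient; this is the path served by sqrt_double_grad.
d2x = paddle.grad(dx, x)[0]
print(d2x)  # elementwise -1 / (4 * x**1.5)
```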
Showing 8 changed files with 105 additions and 6 deletions (+105 −6).
- paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py (+2 −1)
- paddle/phi/kernels/activation_kernel.h (+1 −1)
- python/paddle/fluid/tests/unittests/test_activation_nn_grad.py (+20 −0)
- python/paddle/fluid/tests/unittests/test_activation_op.py (+7 −1)
- python/paddle/nn/functional/activation.py (+3 −1)
- python/paddle/tensor/layer_function_generator.py (+8 −2)
- python/paddle/utils/code_gen/api.yaml (+10 −0)
- python/paddle/utils/code_gen/backward.yaml (+54 −0)
paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py

```diff
@@ -28,7 +28,8 @@ ops_to_fill_zero_for_empty_grads = set([
     "multiply_triple_grad", "conv2d_grad_grad", "batch_norm_double_grad",
     "tanh_double_grad", "tanh_triple_grad", "subtract_double_grad",
     "divide_double_grad", "log_double_grad", "elu_double_grad",
-    "leaky_relu_double_grad"
+    "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad",
+    "square_double_grad", "celu_double_grad"
 ])

 # For API dispatch used at python-level
```
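Ops listed in `ops_to_fill_zero_for_empty_grads` ask the eager code generator to substitute zero tensors for gradient inputs that arrive empty in higher-order graphs. A generic sketch of that pattern (an illustration of the idea, not Paddle's actual generated code):

```python
import paddle

def fill_zero_if_empty(grad, reference):
    # Double-grad kernels such as celu_double_grad expect a concrete tensor,
    # so a missing gradient is replaced by zeros of the reference's shape.
    return paddle.zeros_like(reference) if grad is None else grad
```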
paddle/phi/kernels/activation_kernel.h

```diff
@@ -78,7 +78,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(celu, alpha)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha)

 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
```
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py

```diff
@@ -253,6 +253,9 @@ class TestELUDoubleGradCheck(unittest.TestCase):


 class TestCELUDoubleGradCheck(unittest.TestCase):
+    def celu_wrapper(self, x):
+        return paddle.nn.functional.celu(x[0], alpha=0.2)
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 4, 4]
@@ -269,6 +272,8 @@ class TestCELUDoubleGradCheck(unittest.TestCase):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.celu_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -280,6 +285,9 @@ class TestCELUDoubleGradCheck(unittest.TestCase):


 class TestSqrtDoubleGradCheck(unittest.TestCase):
+    def sqrt_wrapper(self, x):
+        return paddle.sqrt(x[0])
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -294,6 +302,8 @@ class TestSqrtDoubleGradCheck(unittest.TestCase):
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.sqrt_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -305,6 +315,9 @@ class TestSqrtDoubleGradCheck(unittest.TestCase):


 class TestRsqrtDoubleGradCheck(unittest.TestCase):
+    def rsqrt_wrapper(self, x):
+        return paddle.rsqrt(x[0])
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -319,6 +332,8 @@ class TestRsqrtDoubleGradCheck(unittest.TestCase):
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
@@ -330,6 +345,9 @@ class TestRsqrtDoubleGradCheck(unittest.TestCase):


 class TestSquareDoubleGradCheck(unittest.TestCase):
+    def square_wrapper(self, x):
+        return paddle.square(x[0])
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -344,6 +362,8 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(
+            self.square_wrapper, [x], y, x_init=x_arr, place=place)

     def test_grad(self):
         paddle.enable_static()
```
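The added `double_grad_check_for_dygraph` calls verify that dygraph second-order gradients match the statically computed ones. The core idea behind such checks, shown as a self-contained numpy sketch (not the actual `gradient_checker` implementation), is to compare an analytic second derivative with a finite difference of the first derivative:

```python
import numpy as np

def sqrt_first(x):
    return 0.5 / np.sqrt(x)       # d/dx sqrt(x)

def sqrt_second(x):
    return -1.0 / (4.0 * x**1.5)  # d^2/dx^2 sqrt(x)

x = np.random.uniform(1.0, 2.0, size=[4])
eps = 1e-4
# A central finite difference of the first derivative approximates the second.
numeric = (sqrt_first(x + eps) - sqrt_first(x - eps)) / (2 * eps)
assert np.allclose(numeric, sqrt_second(x), rtol=1e-4)
```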
python/paddle/fluid/tests/unittests/test_activation_op.py

```diff
@@ -2003,6 +2003,7 @@ class TestCELU(TestActivation):
         self.op_type = "celu"
         self.init_dtype()

+        self.python_api = paddle.nn.functional.celu
         np.random.seed(1024)
         x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype)
         alpha = 1.5
@@ -2014,7 +2015,7 @@ class TestCELU(TestActivation):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=True)


 class TestCELUAPI(unittest.TestCase):
@@ -2080,6 +2081,11 @@ class TestCELUAPI(unittest.TestCase):
                 name='x_fp16', shape=[10, 12], dtype='float16')
             self.celu(x_fp16)

+    def test_api_eager_dygraph(self):
+        with _test_eager_guard():
+            self.test_dygraph_api()
+            self.test_errors()
+

 class TestReciprocal(TestActivation):
     def setUp(self):
```
python/paddle/nn/functional/activation.py

```diff
@@ -63,8 +63,10 @@ def celu(x, alpha=1.0, name=None):
     if alpha == 0:
         raise ZeroDivisionError("alpha cannot be 0 for celu")

-    if in_dynamic_mode():
+    if _in_legacy_dygraph():
         return _C_ops.celu(x, 'alpha', alpha)
+    if in_dygraph_mode():
+        return _C_ops.final_state_celu(x, alpha)

     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu')
     helper = LayerHelper("celu", **locals())
```
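A brief usage sketch of the public API this dispatch serves (the expected values are computed from the CELU formula and are approximate):

```python
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor([-1.0, 6.0])
# Eager mode now routes to _C_ops.final_state_celu(x, alpha); legacy dygraph
# keeps the attribute-style call _C_ops.celu(x, 'alpha', alpha).
out = F.celu(x, alpha=0.2)  # ≈ [-0.1987, 6.0]
```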
python/paddle/tensor/layer_function_generator.py

```diff
@@ -21,7 +21,7 @@ import string
 from six.moves import cStringIO
 from ..static import Variable
 from ..fluid.proto import framework_pb2
-from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_
+from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode
 from ..framework import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype
 import paddle
@@ -256,7 +256,13 @@ def generate_activation_fn(op_type):
     op_proto = OpProtoHolder.instance().get_op_proto(op_type)

     def func(x, name=None):
-        if paddle.in_dynamic_mode():
+        final_state_op_type = "final_state_%s" % op_type
+        if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type):
+            op = getattr(_C_ops, final_state_op_type)
+            return op(x)
+        # TODO(dev): Because some ops' yaml has not been migrated.
+        # Replace it with _in_legacy_dygraph while all yaml work is done.
+        if _non_static_mode():
             op = getattr(_C_ops, op_type)
             return op(x)
```
python/paddle/utils/code_gen/api.yaml

```diff
@@ -319,6 +319,16 @@
     func : ceil
   backward : ceil_grad

+- api : celu
+  args : (Tensor x, float alpha)
+  output : Tensor(out)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : celu
+  backward : celu_grad
+
 # cholesky
 - api : cholesky
   args : (Tensor x, bool upper)
```
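For reference, the forward function this new entry wires up is the standard CELU activation (the formula below comes from the CELU definition, not from the diff):

$$\operatorname{celu}(x) = \max(0, x) + \min\bigl(0,\ \alpha\,(e^{x/\alpha} - 1)\bigr)$$

which is why `alpha = 0` must be rejected in `paddle.nn.functional.celu`: the definition divides by $\alpha$.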
python/paddle/utils/code_gen/backward.yaml

```diff
@@ -232,6 +232,27 @@
   kernel :
     func : ceil_grad

+- backward_api : celu_double_grad
+  forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x)
+  args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha)
+  output : Tensor(x_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, x]
+  kernel :
+    func : celu_double_grad
+
+- backward_api : celu_grad
+  forward : celu(Tensor x, float alpha) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, float alpha)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : celu_grad
+  backward : celu_double_grad
+
 - backward_api : cholesky_grad
   forward : cholesky (Tensor x, bool upper) -> Tensor(out)
   args : (Tensor out, Tensor out_grad, bool upper)
@@ -1544,6 +1565,16 @@
   kernel :
     func : round_grad

+- backward_api : rsqrt_double_grad
+  forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor out, Tensor grad_x, Tensor grad_x_grad)
+  output : Tensor(out_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [out, out]
+  kernel :
+    func : rsqrt_double_grad
+
 - backward_api : rsqrt_grad
   forward : rsqrt (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
@@ -1553,6 +1584,7 @@
     param : [out]
   kernel :
     func : rsqrt_grad
+  backward : rsqrt_double_grad

 - backward_api : scale_double_grad
   forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x)
@@ -1731,6 +1763,16 @@
   invoke : concat( out_grad, axis)

 # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future.
+- backward_api : sqrt_double_grad
+  forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor out, Tensor grad_x, Tensor grad_x_grad)
+  output : Tensor(out_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [out, out]
+  kernel :
+    func : sqrt_double_grad
+
 - backward_api : sqrt_grad
   forward : sqrt (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
@@ -1740,6 +1782,17 @@
     param : [out]
   kernel :
     func : sqrt_grad
+  backward : sqrt_double_grad
+
+- backward_api : square_double_grad
+  forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x)
+  args : (Tensor x, Tensor grad_out, Tensor grad_x_grad)
+  output : Tensor(x_grad), Tensor(grad_out_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, x]
+  kernel :
+    func : square_double_grad

 - backward_api : square_grad
   forward : square (Tensor x) -> Tensor(out)
@@ -1750,6 +1803,7 @@
     param : [x]
   kernel :
     func : square_grad
+  backward : square_double_grad

 - backward_api : squeeze_grad
   forward : squeeze(Tensor x, int[] axes) -> Tensor(out), Tensor(xshape)
```
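A note on the argument choices above (the algebra is standard calculus; the mapping to the yaml fields is my reading of the diff): `sqrt_double_grad` and `rsqrt_double_grad` take `out` rather than `x` because both derivatives are expressible purely in the forward output, while `square_double_grad` and `celu_double_grad` still need `x`:

$$y = \sqrt{x}:\quad \frac{dy}{dx} = \frac{1}{2y}, \qquad \frac{d^2y}{dx^2} = -\frac{1}{4y^3}$$

$$y = x^{-1/2}:\quad \frac{dy}{dx} = -\frac{y^3}{2}, \qquad \frac{d^2y}{dx^2} = \frac{3y^5}{4}$$

whereas $d(x^2)/dx = 2x$ and $d^2(x^2)/dx^2 = 2$ are naturally written in terms of $x$.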