Commit 2add31f4 authored by cxxly, committed by Xiaoxu Chen

[prim] add gelu vjp rule

Parent 325fdf1d
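For context (not part of the commit), the vjp rule added below decomposes the standard GELU gradient. Writing u = sqrt(2/pi) * (x + 0.044715 x^3) for the tanh branch, the two cases the composite computes are:

```math
\frac{\partial\,\mathrm{gelu}(x)}{\partial x}
  = \tfrac{1}{2}\Bigl(1 + \operatorname{erf}\bigl(\tfrac{x}{\sqrt{2}}\bigr)\Bigr)
    + \frac{x}{\sqrt{2\pi}}\, e^{-x^2/2}
  \quad\text{(exact, approximate = false)}
```

```math
\frac{\partial\,\mathrm{gelu}(x)}{\partial x}
  \approx \tfrac{1}{2}\bigl(1 + \tanh u\bigr)
    + \tfrac{1}{2}\,x\,\bigl(1 - \tanh^{2} u\bigr)\,\sqrt{\tfrac{2}{\pi}}\,\bigl(1 + 3\cdot 0.044715\,x^{2}\bigr)
  \quad\text{(tanh approximation, approximate = true)}
```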
......@@ -46,3 +46,5 @@
- where
- reshape
- split
- erf
- tanh
......@@ -22,8 +22,10 @@
#include "paddle/fluid/prim/api/all.h"
#include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace prim {
using Tensor = paddle::Tensor;
......@@ -1176,11 +1178,11 @@ void dropout_grad(const Tensor& mask,
} else {
if (mode == "upscale_in_train") {
if (p.to<float>() == 1.0f) {
set_output<T>(out_grad * 0.0, x_grad);
set_output<T>(scale<T>(out_grad, 0.0), x_grad);
} else {
set_output<T>(
out_grad * cast<T>(mask, out_grad.dtype()) / (1.0 - p.to<float>()),
x_grad);
set_output<T>(scale<T>(out_grad * cast<T>(mask, out_grad.dtype()),
1.0 / (1.0 - p.to<float>())),
x_grad);
}
} else {
set_output<T>(out_grad * cast<T>(mask, out_grad.dtype()), x_grad);
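A minimal sketch (not part of the diff) of the identity this dropout_grad rewrite relies on: Paddle's scale op computes scale * x + bias, so multiplying or dividing the gradient by a constant folds into a single scale op instead of fill_constant plus elementwise_mul/div, which is also why the expected op lists in the unit tests below change. The scale_op helper is a hypothetical numpy stand-in.

```python
import numpy as np

def scale_op(x, scale=1.0, bias=0.0):
    # numpy stand-in for paddle's scale op: scale * x + bias (bias applied after scaling)
    return scale * x + bias

out_grad = np.random.rand(4).astype("float32")
mask = (np.random.rand(4) > 0.5).astype("float32")
p = 0.3  # dropout probability

old = out_grad * mask / (1.0 - p)                 # fill_constant + elementwise_div
new = scale_op(out_grad * mask, 1.0 / (1.0 - p))  # single scale op
assert np.allclose(old, new)
```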
......@@ -1362,5 +1364,78 @@ void batch_norm_grad(const Tensor& x,
}
}
template <typename T>
void gelu_grad(const Tensor& x,
const Tensor& out_grad,
bool approximate,
Tensor* x_grad) {
if (!x_grad) return;
// Promote to fp32 when the input type is fp16 or bf16, to stay consistent
// with the phi kernel
if (x.dtype() == phi::DataType::FLOAT16 ||
x.dtype() == phi::DataType::BFLOAT16) {
auto promoted_x = cast<T>(x, phi::DataType::FLOAT32);
auto promoted_out_grad = cast<T>(out_grad, phi::DataType::FLOAT32);
if (approximate) {
float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5;
float kkappa = 0.044715;
auto x_sq = promoted_x * promoted_x;
auto x_cube = x_sq * promoted_x;
auto inner = kbeta * (promoted_x + kkappa * x_cube);
auto tanh_inner = tanh<T>(inner);
auto left = scale<T>(promoted_x, 0.5);
auto right = scale<T>(tanh_inner, 1., 1.);
auto left_derivative = scale<T>(right, 0.5);
auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
auto inner_derivative = kbeta * (scale<T>(3 * kkappa * x_sq, 1., 1.));
auto right_derivative = left * tanh_derivative * inner_derivative;
set_output<T>(
cast<T>(promoted_out_grad * (left_derivative + right_derivative),
x.type()),
x_grad);
} else {
float kalpha = M_SQRT1_2;
float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
auto cdf = scale<T>(scale<T>(erf<T>(kalpha * promoted_x), 1., 1.), 0.5);
auto pdf = kbeta * exp<T>(scale<T>(promoted_x * promoted_x, -0.5));
set_output<T>(
cast<T>(promoted_out_grad * (cdf + promoted_x * pdf), x.type()),
x_grad);
}
} else {
// scale only supports an fp32 attr in static graph mode; use elementwise ops
// when the precision exceeds fp32.
if (approximate) {
auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
auto kKappa = 0.044715;
auto x_sq = x * x;
auto x_cube = x_sq * x;
auto inner = kBeta * (x + kKappa * x_cube);
auto tanh_inner = tanh<T>(inner);
auto left = scale<T>(x, 0.5);
auto right = scale<T>(tanh_inner, 1., 1.);
auto left_derivative = scale<T>(right, 0.5);
auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
auto inner_derivative = kBeta * (scale<T>(3 * kKappa * x_sq, 1., 1.));
auto right_derivative = left * tanh_derivative * inner_derivative;
set_output<T>(out_grad * (left_derivative + right_derivative), x_grad);
} else {
auto kAlpha = M_SQRT1_2;
auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
auto cdf = scale<T>(scale<T>(erf<T>(kAlpha * x), 1., 1.), 0.5);
auto pdf = kBeta * exp<T>(scale<T>(x * x, -0.5));
set_output<T>(out_grad * (cdf + x * pdf), x_grad);
}
}
}
} // namespace prim
} // namespace paddle
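As a quick sanity check (not part of the commit), the composite formulas above can be compared against a central finite-difference gradient of gelu. The snippet below is an illustrative numpy-only sketch; gelu and gelu_grad are re-derived by hand here rather than taken from Paddle.

```python
import math

import numpy as np

erf = np.vectorize(math.erf)  # elementwise erf for numpy arrays

def gelu(x, approximate=False):
    if approximate:
        u = math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)
        return 0.5 * x * (1.0 + np.tanh(u))
    return 0.5 * x * (1.0 + erf(x / math.sqrt(2.0)))

def gelu_grad(x, out_grad, approximate=False):
    # numpy transcription of the composite rule above
    if approximate:
        kbeta = math.sqrt(2.0 / math.pi)  # M_SQRT2 * M_2_SQRTPI * 0.5
        kkappa = 0.044715
        t = np.tanh(kbeta * (x + kkappa * x**3))
        left_derivative = 0.5 * (t + 1.0)
        right_derivative = 0.5 * x * (1.0 - t * t) * kbeta * (1.0 + 3.0 * kkappa * x**2)
        return out_grad * (left_derivative + right_derivative)
    cdf = 0.5 * (1.0 + erf(x / math.sqrt(2.0)))
    pdf = math.sqrt(0.5 / math.pi) * np.exp(-0.5 * x * x)  # exp(-x^2/2) / sqrt(2*pi)
    return out_grad * (cdf + x * pdf)

x = np.linspace(-3.0, 3.0, 101)
eps = 1e-4
for approximate in (False, True):
    fd = (gelu(x + eps, approximate) - gelu(x - eps, approximate)) / (2.0 * eps)
    assert np.allclose(gelu_grad(x, np.ones_like(x), approximate), fd, atol=1e-5)
```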
......@@ -635,6 +635,7 @@
param: [x]
kernel :
func : gelu_grad
composite: gelu_grad(x, out_grad, approximate, x_grad)
- backward_op : grid_sample_grad
forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out)
......
......@@ -44,8 +44,7 @@ class TestCustomVJP(unittest.TestCase):
'fill_any_like',
'cast',
'elementwise_mul',
'fill_constant',
'elementwise_div',
'scale',
)
self.ops_all_enable = (
'uniform_random',
......@@ -59,8 +58,7 @@ class TestCustomVJP(unittest.TestCase):
'fill_any_like',
'cast',
'elementwise_mul',
'fill_constant',
'elementwise_div',
'scale',
)
def test_enable_prim_fwd(self):
......
......@@ -2031,6 +2031,10 @@ class TestGeluApproximate(TestActivation):
self.outputs = {'Out': out}
self.attrs = {"approximate": approximate}
# The backward decomposition of gelu is inconsistent with the raw kernel,
# so relax the tolerance to 1e-5 to let the unit test pass.
self.rev_comp_rtol = 1e-5
def test_check_output(self):
self.check_output(check_prim=True)
......@@ -2057,6 +2061,9 @@ class TestGelu(TestActivation):
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {"approximate": approximate}
# The backward decomposition of gelu is inconsistent with the raw kernel,
# so relax the tolerance to 1e-5 to let the unit test pass.
self.rev_comp_rtol = 1e-5
def if_enable_cinn(self):
self.enable_cinn = False
......@@ -2088,6 +2095,11 @@ class TestGELUAPI(unittest.TestCase):
if paddle.is_compiled_with_cuda()
else paddle.CPUPlace()
)
self.enable_cinn = False
# The backward decomposition of gelu is inconsistent with the raw kernel,
# so relax the tolerance to 1e-5 to let the unit test pass.
self.rev_comp_rtol = 1e-5
def test_static_api(self):
with paddle_static_guard():
......@@ -3910,7 +3922,7 @@ create_test_act_fp16_class(TestAsinh, grad_atol=0.85)
create_test_act_fp16_class(TestAtanh, grad_atol=0.85)
create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu, check_prim=True)
create_test_act_fp16_class(TestGelu, check_prim=True)
create_test_act_fp16_class(TestGelu, check_prim=True, enable_cinn=False)
create_test_act_fp16_class(TestBRelu)
create_test_act_fp16_class(TestRelu6)
create_test_act_fp16_class(TestSoftRelu, grad_atol=0.85)
......