[cherry pick] Some optimizations of elementwise_add, gelu and dropout for AMP (#30152)

* Improve performance of elementwise_add grad op (#29187)
* pass stop_gradient for cast op
* improve performance of elementwise_add grad
* use tensor copy async
* dygraph branch
* fix dygraph branch
* add ut
* make gelu fp16 computing more robust (#29484)
* Add fast path for dropout when p == 0 (#29553)
* add fast path for p == 0 in dropout
* add ut

[cherry pick] Some optimizations of elementwise_add, gelu and dropout for AMP (#30152)
* Improve performance of elementwise_add grad op (#29187)
* pass stop_gradient for cast op
* improve performance of elementwise_add grad
* use tensor copy async
* dygraph branch
* fix dygraph branch
* add ut
* make gelu fp16 computing more robust (#29484)
* Add fast path for dropout when p == 0 (#29553)
* add fast path for p == 0 in dropout
* add ut
07f68fad · Leo Chen · GitHub · 44b81e63 · 07f68fad · 07f68fad
9 changed files
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -144,7 +144,20 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
    // skip out
    auto *out = dout;

-    if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
+    // Special case when dy is not needed and dx doesn't reduce
+    if (dx != nullptr && dy == nullptr && dx->dims() == dout->dims()) {
+      VLOG(4) << "Special case when dy is not needed and dx doesn't "
+                 "reduce";
+      framework::TensorCopy(
+          *dout, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), dx);
+    } else if (dx == nullptr && dy != nullptr && dy->dims() == dout->dims()) {
+      VLOG(4) << "Special case when dx is not needed and dy doesn't "
+                 "reduce";
+      framework::TensorCopy(
+          *dout, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), dy);
+    } else if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
    } else {
      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,

--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -36,10 +36,22 @@ struct GeluFunctor {
  void operator()(Device d, X x, Out out, bool approximate) const {
    if (approximate) {
      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
+      if (std::is_same<T, platform::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto temp =
+            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
+             (casted_x + static_cast<float>(0.044715) * casted_x.cube()))
+                .tanh();
+        out.device(d) = (casted_x * static_cast<float>(0.5) *
+                         (static_cast<float>(1) + temp))
+                            .template cast<T>();
+      } else {
        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
                     (x + static_cast<T>(0.044715) * x.cube()))
                        .tanh();
        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+      }
    } else {
 #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
@@ -60,8 +72,17 @@ struct GeluFunctor {
      }
 #else
      // gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
+      if (std::is_same<T, platform::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
+        out.device(d) = (casted_x * static_cast<float>(0.5) *
+                         (static_cast<float>(1) + temp))
+                            .template cast<T>();
+      } else {
        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+      }
 #endif
    }
  }
@@ -72,6 +93,24 @@ struct GeluGradFunctor {
  template <typename Device, typename X, typename dOut, typename dX>
  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
    if (approximate) {
+      if (std::is_same<T, platform::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto casted_dout = dout.template cast<float>();
+
+        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
+        const float kBeta =
+            kAlpha * static_cast<float>(0.044715) * static_cast<float>(3);
+        const auto y =
+            (kAlpha *
+             ((static_cast<float>(0.044715) * casted_x.cube()) + casted_x))
+                .tanh();
+        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
+                        (static_cast<float>(1) + y +
+                         (casted_x - casted_x * y.square()) *
+                             (kAlpha + kBeta * casted_x.square())))
+                           .template cast<T>();
+      } else {
        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
        const T kBeta = kAlpha * static_cast<T>(0.044715) * static_cast<T>(3);
        const auto y =
@@ -79,6 +118,7 @@ struct GeluGradFunctor {
        dx.device(d) = static_cast<T>(0.5) * dout *
                       (static_cast<T>(1) + y +
                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
+      }
    } else {
 #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
@@ -117,6 +157,18 @@ struct GeluGradFunctor {
 #else
      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
      // exp(- x^2 / 2)
+      if (std::is_same<T, platform::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto casted_dout = dout.template cast<float>();
+        auto first = static_cast<float>(0.5) *
+                     (static_cast<float>(1) +
+                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
+        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
+                      casted_x *
+                      (-static_cast<float>(0.5) * casted_x.square()).exp();
+        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
+      } else {
        auto first =
            static_cast<T>(0.5) *
            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
@@ -124,6 +176,7 @@ struct GeluGradFunctor {
        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
                      (-static_cast<T>(0.5) * x.square()).exp();
        dx.device(d) = dout * (first + second);
+      }
 #endif
    }
  }

--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -1476,6 +1476,9 @@ class Dropout(layers.Layer):
        self._is_test = is_test

    def forward(self, input):
+        # fast return for p == 0
+        if self._dropout_prob == 0:
+            return input
        prog = default_main_program()
        if (self._seed is None or self._seed == 0) and prog.random_seed != 0:
            self._seed = prog.random_seed

--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -179,6 +179,7 @@ def monkey_patch_variable():
            outputs={"Out": [out]},
            attrs={"in_dtype": self.dtype,
                   "out_dtype": out.dtype})
+        out.stop_gradient = self.stop_gradient
        return out

    def _scalar_op_(var, scale, bias):

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1018,6 +1018,9 @@ def dropout(x,
            x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32")
            dropped = fluid.layers.dropout(x, dropout_prob=0.5)
    """
+    # fast return for p == 0
+    if dropout_prob == 0:
+        return x

    def get_attrs(prog, dropout_prob, is_test, seed):
        if (seed is None or seed == 0) and prog.random_seed != 0:

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -224,6 +224,11 @@ def cast(x, dtype):
            x = paddle.to_tensor([2, 3, 4], 'float64')
            y = paddle.cast(x, 'uint8')
    """
+    if in_dygraph_mode():
+        if not isinstance(dtype, core.VarDesc.VarType):
+            dtype = convert_np_dtype_to_dtype_(dtype)
+        out = core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+
    check_variable_and_dtype(
        x, 'x',
        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
@@ -234,7 +239,8 @@ def cast(x, dtype):
    ], 'cast')

    helper = LayerHelper('cast', **locals())
-    out = helper.create_variable_for_type_inference(dtype=dtype)
+    out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=x.stop_gradient)
    helper.append_op(
        type='cast',
        inputs={'X': [x]},

--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -302,13 +302,16 @@ class TestDropoutFAPI(unittest.TestCase):
                training=False,
                mode='downscale_in_infer')
            res10 = paddle.nn.functional.dropout(x=input, p=1., training=True)
+            res11 = paddle.fluid.layers.dropout(x=input, dropout_prob=0.)

            in_np = np.random.random([40, 40]).astype("float32")
            res_np = in_np
            res_np2 = np.zeros_like(in_np)

            exe = fluid.Executor(place)
-            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            res_list = [
+                res1, res2, res3, res4, res5, res6, res7, res8, res9, res11
+            ]
            for res in res_list:
                fetches = exe.run(fluid.default_main_program(),
                                  feed={"input": in_np},
@@ -383,8 +386,12 @@ class TestDropoutFAPI(unittest.TestCase):
                    mode='downscale_in_infer')
                res10 = paddle.nn.functional.dropout(
                    x=input, p=1., training=True)
+                dropout = paddle.fluid.dygraph.Dropout(p=0, )
+                res11 = dropout(input)

-            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            res_list = [
+                res1, res2, res3, res4, res5, res6, res7, res8, res9, res11
+            ]
            for res in res_list:
                self.assertTrue(np.allclose(res.numpy(), res_np))
            self.assertTrue(np.allclose(res10.numpy(), res_np2))

--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -257,6 +257,19 @@ class TestMathOpPatches(unittest.TestCase):
                       fetch_list=[b])
        self.assertTrue(numpy.allclose(-a_np, b_np))

+    @prog_scope()
+    def test_astype(self):
+        a = fluid.layers.data(name="a", shape=[10, 1])
+        b = a.astype('float32')
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.uniform(-1, 1, size=[10, 1]).astype('float64')
+
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np.astype('float32'), b_np))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -887,6 +887,10 @@ def dropout(x,
            print(y_01)

    """
+    # fast return for p == 0
+    if p == 0:
+        return x
+
    if not isinstance(p, (float, int)):
        raise TypeError("p argument should be a number")
    if p < 0 or p > 1: