[AMP OP&Test] Support fp16/bf16 for cumsum (#51694)

* add fp16 unittest
* support bf16 and add unittest
* fix according to review

[AMP OP&Test] Support fp16/bf16 for cumsum (#51694)
* add fp16 unittest
* support bf16 and add unittest
* fix according to review
01eeba5e · Siming Dai · GitHub · 9c238d2b · 01eeba5e · 01eeba5e
3 changed files
--- a/paddle/phi/kernels/gpu/cum_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu
@@ -29,6 +29,7 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -82,5 +83,6 @@ PD_REGISTER_KERNEL(cumsum_grad,
                   int16_t,
                   int,
                   int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif
--- a/paddle/phi/kernels/gpu/cum_kernel.cu
+++ b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -28,6 +28,7 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -217,7 +218,8 @@ __global__ void BlockScanKernel(T* d_out,
 }
 template <typename Context, typename T>
-typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value>::type
+typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value &&
+                        !std::is_same<T, phi::dtype::bfloat16>::value>::type
 ThrustCumsumKernel(const Context& dev_ctx,
                   const T* in_data,
                   T* out_data,
@@ -261,6 +263,15 @@ ThrustCumsumKernel(const Context& dev_ctx,
                   bool reverse,
                   bool exclusive) {}
+template <typename Context, typename T>
+typename std::enable_if<std::is_same<T, phi::dtype::bfloat16>::value>::type
+ThrustCumsumKernel(const Context& dev_ctx,
+                   const phi::dtype::bfloat16* in_data,
+                   phi::dtype::bfloat16* out_data,
+                   int64_t size,
+                   bool reverse,
+                   bool exclusive) {}
 template <typename T, typename Context, typename Op>
 void ScanKernel(const Context& dev_ctx,
                const DenseTensor& x,
@@ -301,6 +312,7 @@ void ScanKernel(const Context& dev_ctx,
  // Use thrust for parallel acceleration when the input size is equal to the
  // length of the ‘axis’ dimension.
  if (!std::is_same<T, phi::dtype::float16>::value &&
+      !std::is_same<T, phi::dtype::bfloat16>::value &&
      std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
    ThrustCumsumKernel<Context, T>(
        dev_ctx, in_data, out_data, size, reverse, exclusive);
@@ -440,7 +452,8 @@ PD_REGISTER_KERNEL(cumsum,
                   int16_t,
                   int,
                   int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(logcumsumexp,
                   GPU,

--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -17,7 +17,7 @@ import tempfile
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle
 import paddle.fluid as fluid
@@ -117,10 +117,15 @@ class TestSumOp1(OpTest):
        self.op_type = "cumsum"
        self.prim_op_type = "prim"
        self.python_api = paddle.cumsum
-        self.enable_cinn = True
+        self.set_enable_cinn()
-        self.attrs = {'axis': 2}
+        self.init_dtype()
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.set_attrs_input_output()
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)}
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
    def test_check_output(self):
        self.check_output()
@@ -128,109 +133,56 @@ class TestSumOp1(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
-class TestSumOp2(OpTest):
+    def set_enable_cinn(self):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
        self.enable_cinn = True
-        self.attrs = {'axis': -1, 'reverse': True}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {
-            'Out': np.flip(
-                np.flip(self.inputs['X'], axis=2).cumsum(axis=2), axis=2
-            )
-        }
-    def test_check_output(self):
+    def set_attrs_input_output(self):
-        self.check_output()
+        self.attrs = {'axis': 2}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
-    def test_check_grad(self):
+        self.out = self.x.cumsum(axis=2)
-        self.check_grad(['X'], 'Out', check_prim=True)
-class TestSumOp3(OpTest):
+class TestSumOp2(TestSumOp1):
-    def setUp(self):
+    def set_attrs_input_output(self):
-        self.op_type = "cumsum"
+        self.attrs = {'axis': -1, 'reverse': True}
-        self.prim_op_type = "prim"
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
-        self.python_api = paddle.cumsum
+        self.out = np.flip(np.flip(self.x, axis=2).cumsum(axis=2), axis=2)
-        self.enable_cinn = True
-        self.attrs = {'axis': 1}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestSumOp3(TestSumOp1):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 1}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=1)
-class TestSumOp4(OpTest):
+class TestSumOp4(TestSumOp1):
-    def setUp(self):
+    def set_attrs_input_output(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
        self.attrs = {'axis': 0}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
+        self.out = self.x.cumsum(axis=0)
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestSumOp5(TestSumOp1):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def set_attrs_input_output(self):
+        self.x = np.random.random((5, 20)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=1)
-class TestSumOp5(OpTest):
+class TestSumOp6(TestSumOp1):
-    def setUp(self):
+    def set_attrs_input_output(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.inputs = {'X': np.random.random((5, 20)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-class TestSumOp6(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
        self.attrs = {'axis': -1, 'flatten': True}
-        self.inputs = {'X': np.random.random((5, 6, 5)).astype("float64")}
+        self.x = np.random.random((5, 6, 5)).astype(self.dtype_)
-        self.outputs = {'Out': self.inputs['X'].cumsum()}
+        self.out = self.x.cumsum()
-        self.enable_cinn = False
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-class TestSumOp7(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.inputs = {'X': np.random.random((100)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestSumOp7(TestSumOp1):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def set_attrs_input_output(self):
+        self.x = np.random.random((100)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=0)
 class TestCumsumFP16(unittest.TestCase):
@@ -263,19 +215,15 @@ class TestSumOpExclusive1(OpTest):
        self.op_type = "cumsum"
        self.prim_op_type = "prim"
        self.python_api = paddle.cumsum
-        self.enable_cinn = True
+        self.set_enable_cinn()
-        self.attrs = {'axis': 2, "exclusive": True}
+        self.init_dtype()
-        a = np.random.random((4, 5, 20)).astype("float64")
+        self.set_attrs_input_output()
-        self.inputs = {'X': a}
+        if self.dtype == np.uint16:
-        self.outputs = {
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
-            'Out': np.concatenate(
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
-                (
+        else:
-                    np.zeros((4, 5, 1), dtype=np.float64),
+            self.inputs = {'X': self.x}
-                    a[:, :, :-1].cumsum(axis=2),
+            self.outputs = {'Out': self.out}
-                ),
-                axis=2,
-            )
-        }
    def test_check_output(self):
        self.check_output()
@@ -283,103 +231,98 @@ class TestSumOpExclusive1(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
-class TestSumOpExclusive2(OpTest):
+    def set_enable_cinn(self):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((1, 1, 100)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((1, 1, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-    def test_check_output(self):
+    def set_attrs_input_output(self):
-        self.check_output()
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
-    def test_check_grad(self):
+        self.out = np.concatenate(
-        self.check_grad(['X'], 'Out', check_prim=True)
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
-class TestSumOpExclusive3(OpTest):
+class TestSumOpExclusive2(TestSumOpExclusive1):
-    def setUp(self):
+    def set_attrs_input_output(self):
-        self.op_type = "cumsum"
+        self.attrs = {'axis': 2, 'exclusive': True}
-        self.prim_op_type = "prim"
+        self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
-        self.python_api = paddle.cumsum
+        self.out = np.concatenate(
-        self.enable_cinn = True
+            (
-        self.attrs = {'axis': 2, "exclusive": True}
+                np.zeros((1, 1, 1), dtype=self.dtype_),
-        a = np.random.random((4, 5, 20)).astype("float64")
+                self.x[:, :, :-1].cumsum(axis=2),
-        self.inputs = {'X': a}
+            ),
-        self.outputs = {
+            axis=2,
-            'Out': np.concatenate(
+        )
-                (
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestSumOpExclusive3(TestSumOpExclusive1):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
-class TestSumOpExclusive4(OpTest):
+class TestSumOpExclusive4(TestSumOpExclusive1):
-    def setUp(self):
+    def set_attrs_input_output(self):
-        self.op_type = "cumsum"
+        self.attrs = {'axis': 2, 'exclusive': True}
-        self.prim_op_type = "prim"
+        self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
-        self.python_api = paddle.cumsum
+        self.out = np.concatenate(
-        self.enable_cinn = True
+            (
-        self.attrs = {'axis': 2, "exclusive": True}
+                np.zeros((1, 1, 1), dtype=self.dtype_),
-        a = np.random.random((1, 1, 100)).astype("float64")
+                self.x[:, :, :-1].cumsum(axis=2),
-        self.inputs = {'X': a}
+            ),
-        self.outputs = {
+            axis=2,
-            'Out': np.concatenate(
+        )
-                (
-                    np.zeros((1, 1, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestSumOpExclusive5(TestSumOpExclusive1):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 40)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
-class TestSumOpExclusive5(OpTest):
+class TestSumOpExclusiveFP16(OpTest):
    def setUp(self):
        self.op_type = "cumsum"
        self.prim_op_type = "prim"
        self.python_api = paddle.cumsum
-        self.enable_cinn = True
+        self.init_dtype()
+        self.enable_cinn = False
        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((4, 5, 40)).astype("float64")
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype)
-        self.inputs = {'X': a}
+        self.out = np.concatenate(
-        self.outputs = {
+            (
-            'Out': np.concatenate(
+                np.zeros((4, 5, 1), dtype=self.dtype),
-                (
+                self.x[:, :, :-1].cumsum(axis=2),
-                    np.zeros((4, 5, 1), dtype=np.float64),
+            ),
-                    a[:, :, :-1].cumsum(axis=2),
+            axis=2,
-                ),
+        )
-                axis=2,
+        if self.dtype == np.uint16:
-            )
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
-        }
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
    def test_check_output(self):
        self.check_output()
@@ -387,25 +330,37 @@ class TestSumOpExclusive5(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = np.float16
-class TestSumOpExclusiveFP16(OpTest):
+class TestSumOpReverseExclusive(OpTest):
    def setUp(self):
        self.op_type = "cumsum"
        self.prim_op_type = "prim"
        self.python_api = paddle.cumsum
-        self.enable_cinn = False
+        self.set_enable_cinn()
-        self.attrs = {'axis': 2, "exclusive": True}
+        self.init_dtype()
-        a = np.random.random((4, 5, 20)).astype("float16")
+        self.attrs = {
-        self.inputs = {'X': a}
+            'axis': 2,
-        self.outputs = {
+            'reverse': True,
-            'Out': np.concatenate(
+            'exclusive': True,
-                (
-                    np.zeros((4, 5, 1), dtype=np.float16),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
        }
+        self.x = np.random.random((4, 5, 6)).astype(self.dtype_)
+        a = np.flip(self.x, axis=2)
+        self.out = np.concatenate(
+            (
+                np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+            ),
+            axis=2,
+        )
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
    def test_check_output(self):
        self.check_output()
@@ -413,32 +368,89 @@ class TestSumOpExclusiveFP16(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
-class TestSumOpReverseExclusive(OpTest):
+    def set_enable_cinn(self):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
        self.enable_cinn = True
-        self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
-        a = np.random.random((4, 5, 6)).astype("float64")
-        self.inputs = {'X': a}
-        a = np.flip(a, axis=2)
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                ),
-                axis=2,
-            )
-        }
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+def create_test_fp16_class(parent, max_relative_error=1e-2):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    class TestCumsumFP16Op(parent):
+        def init_dtype(self):
+            self.dtype = self.dtype_ = np.float16
+        def set_enable_cinn(self):
+            self.enable_cinn = False
+        def test_check_output(self):
+            self.check_output()
+        def test_check_grad(self):
+            self.check_grad(
+                ['X'],
+                'Out',
+                check_prim=True,
+            )
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestCumsumFP16Op.__name__ = cls_name
+    globals()[cls_name] = TestCumsumFP16Op
+create_test_fp16_class(TestSumOp1)
+create_test_fp16_class(TestSumOp2)
+create_test_fp16_class(TestSumOp3)
+create_test_fp16_class(TestSumOp4)
+create_test_fp16_class(TestSumOp5)
+create_test_fp16_class(TestSumOp6)
+create_test_fp16_class(TestSumOpExclusive1)
+create_test_fp16_class(TestSumOpExclusive2)
+create_test_fp16_class(TestSumOpExclusive3)
+create_test_fp16_class(TestSumOpExclusive4)
+create_test_fp16_class(TestSumOpExclusive5)
+create_test_fp16_class(TestSumOpReverseExclusive)
+def create_test_bf16_class(parent):
+    @unittest.skipIf(
+        not core.is_compiled_with_cuda()
+        or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+        "core is not compiled with CUDA or not support bfloat16",
+    )
+    class TestCumsumBF16Op(parent):
+        def init_dtype(self):
+            self.dtype = np.uint16
+            self.dtype_ = np.float32
+        def set_enable_cinn(self):
+            self.enable_cinn = False
+        def test_check_output(self):
+            place = paddle.CUDAPlace(0)
+            self.check_output_with_place(place, check_prim=True)
+        def test_check_grad(self):
+            place = paddle.CUDAPlace(0)
+            self.check_grad_with_place(place, ["X"], "Out", check_prim=True)
+    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
+    TestCumsumBF16Op.__name__ = cls_name
+    globals()[cls_name] = TestCumsumBF16Op
+create_test_bf16_class(TestSumOp1)
+create_test_bf16_class(TestSumOp2)
+create_test_bf16_class(TestSumOp3)
+create_test_bf16_class(TestSumOp4)
+create_test_bf16_class(TestSumOp5)
+create_test_bf16_class(TestSumOp6)
+create_test_bf16_class(TestSumOpExclusive1)
+create_test_bf16_class(TestSumOpExclusive2)
+create_test_bf16_class(TestSumOpExclusive3)
+create_test_bf16_class(TestSumOpExclusive4)
+create_test_bf16_class(TestSumOpExclusive5)
+create_test_bf16_class(TestSumOpReverseExclusive)
 class BadInputTest(unittest.TestCase):