Unverified commit 1fbf423a authored by Yuang Liu, committed by GitHub

[AMP OP&Test] Support bf16/fp16 for roll op and add ut. (#51565)

Parent 8fc9a19f
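Not part of the original commit: a minimal usage sketch of what this change enables, assuming a CUDA build of Paddle where the GPU roll kernels registered below are available. Tensor values and shapes are illustrative only.

```python
import numpy as np
import paddle

# Minimal sketch (not from the commit), assuming a CUDA build of Paddle:
# after this change, paddle.roll dispatches to the newly registered
# float16/bfloat16 GPU kernels instead of rejecting the dtype.
paddle.set_device("gpu")

x_fp16 = paddle.to_tensor(np.arange(12, dtype=np.float16).reshape(3, 4))
y_fp16 = paddle.roll(x_fp16, shifts=1, axis=0)

x_bf16 = paddle.cast(x_fp16, "bfloat16")
y_bf16 = paddle.roll(x_bf16, shifts=[1, -1], axis=[0, 1])
```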
@@ -14,7 +14,9 @@
#include "paddle/phi/kernels/roll_grad_kernel.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
@@ -81,6 +83,7 @@ PD_REGISTER_KERNEL(roll_grad,
ALL_LAYOUT,
phi::RollGradKernel,
phi::dtype::float16,
phi::dtype::bfloat16,
float,
double,
int,
......
@@ -14,7 +14,9 @@
#include "paddle/phi/kernels/roll_kernel.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/array.h"
#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
@@ -83,6 +85,7 @@ PD_REGISTER_KERNEL(roll,
ALL_LAYOUT,
phi::RollKernel,
phi::dtype::float16,
phi::dtype::bfloat16,
float,
double,
int,
......
@@ -15,10 +15,11 @@
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
@@ -27,13 +28,17 @@ class TestRollOp(OpTest):
self.python_api = paddle.roll
self.op_type = "roll"
self.init_dtype_type()
self.inputs = {'X': np.random.random(self.x_shape).astype(self.dtype)}
self.attrs = {'shifts': self.shifts, 'axis': self.axis}
self.outputs = {
'Out': np.roll(
self.inputs['X'], self.attrs['shifts'], self.attrs['axis']
)
}
bf16_ut = self.dtype == np.uint16
x = np.random.random(self.x_shape).astype(
np.float32 if bf16_ut else self.dtype
)
out = np.roll(x, self.attrs['shifts'], self.attrs['axis'])
if bf16_ut:
x = convert_float_to_uint16(x)
out = convert_float_to_uint16(out)
self.inputs = {'X': x}
self.outputs = {'Out': out}
def init_dtype_type(self):
self.dtype = np.float64
@@ -56,6 +61,62 @@ class TestRollOpCase2(TestRollOp):
self.axis = [-1, -2]
class TestRollFP16OP(TestRollOp):
def init_dtype_type(self):
self.dtype = np.float16
self.x_shape = (100, 4, 5)
self.shifts = [101, -1]
self.axis = [0, -2]
class TestRollFP16OpCase2(TestRollOp):
def init_dtype_type(self):
self.dtype = np.float16
self.x_shape = (100, 10, 5)
self.shifts = [8, -1]
self.axis = [-1, -2]
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not complied with CUDA and not support the bfloat16",
)
class TestRollBF16OP(TestRollOp):
def init_dtype_type(self):
self.dtype = np.uint16
self.x_shape = (10, 4, 5)
self.shifts = [101, -1]
self.axis = [0, -2]
self.place = core.CUDAPlace(0)
def test_check_output(self):
self.check_output_with_place(self.place, check_eager=True)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not complied with CUDA and not support the bfloat16",
)
class TestRollBF16OpCase2(TestRollOp):
def init_dtype_type(self):
self.dtype = np.uint16
self.x_shape = (10, 5, 5)
self.shifts = [8, -1]
self.axis = [-1, -2]
self.place = core.CUDAPlace(0)
def test_check_output(self):
self.check_output_with_place(self.place, check_eager=True)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
class TestRollAPI(unittest.TestCase):
def input_data(self):
self.data_x = np.array(
......
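The bf16 tests above build float32 reference data, compute np.roll on it, and only then convert both input and output with convert_float_to_uint16. The snippet below is a hypothetical re-implementation of that conversion, for illustration only (the real helper lives in op_test); it shows why the bf16 OpTest inputs are carried as uint16 arrays.

```python
import numpy as np

def convert_float_to_uint16_sketch(arr):
    # Hypothetical stand-in for op_test.convert_float_to_uint16: bfloat16 keeps
    # the upper 16 bits of an IEEE float32, stored here as a uint16 array.
    return np.right_shift(arr.astype(np.float32).view(np.uint32), 16).astype(np.uint16)

x = np.random.random((10, 4, 5)).astype(np.float32)
out = np.roll(x, [101, -1], axis=[0, -2])
x_bf16, out_bf16 = (convert_float_to_uint16_sketch(a) for a in (x, out))
```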
@@ -1710,6 +1710,21 @@ def roll(x, shifts, axis=None, name=None):
if in_dygraph_mode():
return _C_ops.roll(x, shifts, axis)
else:
check_variable_and_dtype(
x,
'dtype',
[
'float16',
'float32',
'uint16',
'float64',
'int32',
'int64',
'complex64',
'complex128',
],
'roll',
)
helper = LayerHelper("roll", **locals())
check_type(axis, 'axis', (list, tuple), 'roll')
......
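Not from the commit itself: a minimal static-graph sketch, assuming paddle.enable_static() is available, showing the effect of adding 'float16' and 'uint16' (bfloat16) to the dtype whitelist above: half-precision inputs now pass the check_variable_and_dtype guard instead of failing at graph-build time.

```python
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # With 'float16' now whitelisted, building this program no longer raises
    # a dtype TypeError from the roll check.
    x = paddle.static.data(name="x", shape=[3, 4], dtype="float16")
    y = paddle.roll(x, shifts=1, axis=0)
```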