diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h
index 99ef7515e2c999fb09608beebee21fc3bb307bf3..d3aa8798008a9a10f13a3337dbc8e28450c9488a 100644
--- a/paddle/phi/kernels/gpu/roll_kernel_impl.h
+++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h
@@ -39,7 +39,7 @@ __global__ void RollCudaKernel(const T* input,
 
 #pragma unroll
   for (size_t i = 0; i < Rank; i++) {
-    new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
+    new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i];
     if (new_dim_idx >= sizes[i]) {
       output_idx += (shifts[i] - sizes[i]) * strides[i];
     } else {
diff --git a/python/paddle/fluid/tests/unittests/test_roll_op.py b/python/paddle/fluid/tests/unittests/test_roll_op.py
index 10fdfb25825a99200907b8ad4a684b08e17ec4cb..d28d3482cc94b8097a842f969af8da93fcf6cb72 100644
--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -61,6 +61,14 @@ class TestRollOpCase2(TestRollOp):
         self.axis = [-1, -2]
 
 
+class TestRollOpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+
+
 class TestRollFP16OP(TestRollOp):
     def init_dtype_type(self):
         self.dtype = np.float16
@@ -77,6 +85,14 @@ class TestRollFP16OpCase2(TestRollOp):
         self.axis = [-1, -2]
 
 
+class TestRollFP16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+
+
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
@@ -117,6 +133,26 @@ class TestRollBF16OpCase2(TestRollOp):
         self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestRollBF16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.uint16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_eager=True)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
+
+
 class TestRollAPI(unittest.TestCase):
     def input_data(self):
         self.data_x = np.array(