diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h
index 99ef7515e2c999fb09608beebee21fc3bb307bf3..d3aa8798008a9a10f13a3337dbc8e28450c9488a 100644
--- a/paddle/phi/kernels/gpu/roll_kernel_impl.h
+++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h
@@ -39,7 +39,7 @@ __global__ void RollCudaKernel(const T* input,
 
 #pragma unroll
   for (size_t i = 0; i < Rank; i++) {
-    new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
+    new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i];
     if (new_dim_idx >= sizes[i]) {
       output_idx += (shifts[i] - sizes[i]) * strides[i];
     } else {
diff --git a/python/paddle/fluid/tests/unittests/test_roll_op.py b/python/paddle/fluid/tests/unittests/test_roll_op.py
index 10fdfb25825a99200907b8ad4a684b08e17ec4cb..d28d3482cc94b8097a842f969af8da93fcf6cb72 100644
--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -61,6 +61,14 @@ class TestRollOpCase2(TestRollOp):
         self.axis = [-1, -2]
 
 
+class TestRollOpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+
+
 class TestRollFP16OP(TestRollOp):
     def init_dtype_type(self):
         self.dtype = np.float16
@@ -77,6 +85,14 @@ class TestRollFP16OpCase2(TestRollOp):
         self.axis = [-1, -2]
 
 
+class TestRollFP16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+
+
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
@@ -117,6 +133,26 @@ class TestRollBF16OpCase2(TestRollOp):
         self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestRollBF16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.uint16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_eager=True)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
+
+
 class TestRollAPI(unittest.TestCase):
     def input_data(self):
         self.data_x = np.array(