diff --git a/paddle/phi/backends/gpu/gpu_utils.h b/paddle/phi/backends/gpu/gpu_utils.h
index ea97a086afc36e4e679b81723ece01529cc21884..0bb0aef7be1f13ccd93e32ce83064de97291493d 100644
--- a/paddle/phi/backends/gpu/gpu_utils.h
+++ b/paddle/phi/backends/gpu/gpu_utils.h
@@ -86,7 +86,8 @@ template <typename IndexType = int>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType
 FlatTensorIndex(const Index3& index, const Dim3& dims) {
   IndexType flat_index = index[0];
-  for (int i = 1; i < 3; i++) {
+#pragma unroll
+  for (int i = 1; i < 3; ++i) {
     flat_index = flat_index * dims[i] + index[i];
   }
   return flat_index;
@@ -97,7 +98,8 @@ template <typename IndexType = int>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index3
 ConvertTensorIndex(IndexType index, const Dim3& dims) {
   Index3 tensor_index;
-  for (int i = 2; i >= 0; i--) {
+#pragma unroll
+  for (int i = 2; i >= 0; --i) {
     IndexType new_index = index / dims[i];
     tensor_index[i] = static_cast<int>(index - dims[i] * new_index);
     index = new_index;
diff --git a/paddle/phi/kernels/funcs/transpose_function.cu.h b/paddle/phi/kernels/funcs/transpose_function.cu.h
index f49a3785622dda215d356493c9ea60faa8989633..00a9b528f6f741655dd69a4f591d3d379c5fc2bc 100644
--- a/paddle/phi/kernels/funcs/transpose_function.cu.h
+++ b/paddle/phi/kernels/funcs/transpose_function.cu.h
@@ -153,7 +153,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input,
   if (x < in_effective_thread_num) {
     // Read a tile from input using block.
     int x_i = x / TileY;
-    int x_j = x % TileY;
+    int x_j = x - x_i * TileY;
     IndexType input_ind =
         input_origin_block_flat_index + x_i * input_dims[2] + x_j;
     IndexType input_inc = BlockReadRows * input_dims[2];