diff --git a/paddle/phi/backends/gpu/gpu_utils.h b/paddle/phi/backends/gpu/gpu_utils.h index ea97a086afc36e4e679b81723ece01529cc21884..0bb0aef7be1f13ccd93e32ce83064de97291493d 100644 --- a/paddle/phi/backends/gpu/gpu_utils.h +++ b/paddle/phi/backends/gpu/gpu_utils.h @@ -86,7 +86,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType FlatTensorIndex(const Index3& index, const Dim3& dims) { IndexType flat_index = index[0]; - for (int i = 1; i < 3; i++) { +#pragma unroll + for (int i = 1; i < 3; ++i) { flat_index = flat_index * dims[i] + index[i]; } return flat_index; @@ -97,7 +98,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index3 ConvertTensorIndex(IndexType index, const Dim3& dims) { Index3 tensor_index; - for (int i = 2; i >= 0; i--) { +#pragma unroll + for (int i = 2; i >= 0; --i) { IndexType new_index = index / dims[i]; tensor_index[i] = static_cast(index - dims[i] * new_index); index = new_index; diff --git a/paddle/phi/kernels/funcs/transpose_function.cu.h b/paddle/phi/kernels/funcs/transpose_function.cu.h index f49a3785622dda215d356493c9ea60faa8989633..00a9b528f6f741655dd69a4f591d3d379c5fc2bc 100644 --- a/paddle/phi/kernels/funcs/transpose_function.cu.h +++ b/paddle/phi/kernels/funcs/transpose_function.cu.h @@ -153,7 +153,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, if (x < in_effective_thread_num) { // Read a tile from input using block. int x_i = x / TileY; - int x_j = x % TileY; + int x_j = x - x_i * TileY; IndexType input_ind = input_origin_block_flat_index + x_i * input_dims[2] + x_j; IndexType input_inc = BlockReadRows * input_dims[2];