first commit (#51683)

50df0170 · limingshu · GitHub · 0b1a8a83 · 50df0170 · 50df0170
显示空白变更内容
内联并排

Showing with 5 addition and 3 deletion

paddle/phi/backends/gpu/gpu_utils.h paddle/phi/backends/gpu/gpu_utils.h +4 -2

paddle/phi/kernels/funcs/transpose_function.cu.h paddle/phi/kernels/funcs/transpose_function.cu.h +1 -1

未找到文件。
--- a/paddle/phi/backends/gpu/gpu_utils.h
+++ b/paddle/phi/backends/gpu/gpu_utils.h
@@ -86,7 +86,8 @@ template <typename IndexType = int>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType
 FlatTensorIndex(const Index3& index, const Dim3& dims) {
  IndexType flat_index = index[0];
-  for (int i = 1; i < 3; i++) {
+#pragma unroll
+  for (int i = 1; i < 3; ++i) {
    flat_index = flat_index * dims[i] + index[i];
  }
  return flat_index;
@@ -97,7 +98,8 @@ template <typename IndexType = int>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index3
 ConvertTensorIndex(IndexType index, const Dim3& dims) {
  Index3 tensor_index;
-  for (int i = 2; i >= 0; i--) {
+#pragma unroll
+  for (int i = 2; i >= 0; --i) {
    IndexType new_index = index / dims[i];
    tensor_index[i] = static_cast<int>(index - dims[i] * new_index);
    index = new_index;

--- a/paddle/phi/kernels/funcs/transpose_function.cu.h
+++ b/paddle/phi/kernels/funcs/transpose_function.cu.h
@@ -153,7 +153,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input,
  if (x < in_effective_thread_num) {
    // Read a tile from input using block.
    int x_i = x / TileY;
-    int x_j = x % TileY;
+    int x_j = x - x_i * TileY;
    IndexType input_ind =
        input_origin_block_flat_index + x_i * input_dims[2] + x_j;
    IndexType input_inc = BlockReadRows * input_dims[2];