Unverified · Commit 9c5d5665 · authored by niuliling123, committed by GitHub

Modify reduce_op.op.h for xpu2 with kernel primitive api (#36904)

* Modify reduce_op.op.h for xpu2 with kernel primitive api
Parent: d08753df
@@ -360,12 +360,12 @@ __device__ __forceinline__ void ReadDataBc(
  * reduce_last_dim: Used to indicate whether the dimension of reduce contains
  * the lowest dimension.
  */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          typename IndexCal, bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank,
+          typename IndexCal, typename Functor, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataReduce(
-    T* dst, const T* __restrict__ src, int block_offset,
+    Ty* dst, const Tx* __restrict__ src, int block_offset,
     const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, bool reduce_last_dim) {
+    int stride_ny, Functor func, bool reduce_last_dim) {
   int thread_offset = 0;
   int left_idx = 0;
   if (reduce_last_dim) {
@@ -385,7 +385,7 @@ __device__ __forceinline__ void ReadDataReduce(
       }
     }
     uint32_t index_src = index_cal(thread_offset + block_offset);
-    dst[ny] = src[index_src];
+    dst[ny] = static_cast<Ty>(func(src[index_src]));
     thread_offset += stride_ny;
   }
 } else {
@@ -400,7 +400,7 @@ __device__ __forceinline__ void ReadDataReduce(
       }
     }
     uint32_t index_src = index_cal(thread_offset + block_offset);
-    dst[nx + ny * NX] = src[index_src];
+    dst[nx + ny * NX] = static_cast<Ty>(func(src[index_src]));
     thread_offset += stride_ny;
   }
 }
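Note (not part of the patch): the point of this signature change is that ReadDataReduce now applies a transform functor to each gathered element instead of copying raw values, so callers no longer need a separate transform pass. A host-side sketch of that contract, with a hypothetical Identity/Square functor pair and a lambda standing in for the index calculator (names and sizes are made up for the demo):

#include <cstdio>

// Hypothetical stand-ins for the transformer functors used by the reduce ops.
struct Identity {
  float operator()(float x) const { return x; }
};
struct Square {
  float operator()(float x) const { return x * x; }
};

// CPU model of the new ReadDataReduce contract: the functor is applied to each
// gathered element before it is written into the per-thread register buffer.
template <typename Tx, typename Ty, typename IndexCal, typename Functor>
void ReadDataReduceModel(Ty* dst, const Tx* src, int block_offset,
                         const IndexCal& index_cal, int ny_count, int stride_ny,
                         Functor func) {
  int thread_offset = 0;
  for (int ny = 0; ny < ny_count; ++ny) {
    int index_src = index_cal(thread_offset + block_offset);
    dst[ny] = static_cast<Ty>(func(src[index_src]));  // transform fused into the load
    thread_offset += stride_ny;
  }
}

int main() {
  float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float dst[4];
  auto stride_cal = [](int i) { return i * 2; };  // like the OneDimIndexCal(stride) used later in this diff
  ReadDataReduceModel<float, float>(dst, src, 0, stride_cal, 4, 1, Square());
  for (float v : dst) printf("%g ", v);  // 1 9 25 49
  printf("\n");
}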
...
@@ -17,64 +17,49 @@
 namespace paddle {
 namespace operators {
 namespace kernel_primitives {
-namespace details {
-static __device__ __forceinline__ platform::float16 ExpFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-static __device__ __forceinline__ float ExpFunctor(float x) { return expf(x); }
-static __device__ __forceinline__ double ExpFunctor(double x) { return exp(x); }
-static __device__ __forceinline__ platform::float16 LogFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
-static __device__ __forceinline__ float LogFunctor(float x) { return logf(x); }
-static __device__ __forceinline__ double LogFunctor(double x) { return log(x); }
-
-/*************************** Compute Functor****************************/
-// for margin_cross_entropy
-template <typename Tx, typename Ty = Tx>
-struct ExpLogitTransformer {
-  HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(details::ExpFunctor(x[0]));
-  }
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(details::ExpFunctor(x));
-  }
-};
-
-// Post processing function for sum, max, min, prod, any
-template <typename Tx, typename Ty = Tx>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(x[0]);
-  }
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(x);
-  }
-};
-
-// Post processing function for mean
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-  HOSTDEVICE inline T operator()(const T* x) const { return x[0] * n_inv; }
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
- private:
-  T n_inv;
-};
-
-}  // namespace details
+#ifdef PADDLE_WITH_XPU2
+struct dim3 {
+  int x;
+  int y;
+  int z;
+
+  explicit inline dim3(int split_x, int split_y = 1, int split_z = 1) {
+    x = split_x;
+    y = split_y;
+    z = split_z;
+  }
+};
+#endif
+
+struct DimConfig {
+  int split_num_x;
+  int split_num_y;
+  int split_num_z;
+  int deal_size_x;
+  int deal_size_y;
+  int deal_size_z;
+  int rem_x;
+  int rem_y;
+  int rem_z;
+
+  HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
+                                       int size_x, int size_y, int size_z) {
+    split_num_x = split_x;
+    split_num_y = split_y;
+    split_num_z = split_z;
+    deal_size_x = size_x;
+    deal_size_y = size_y;
+    deal_size_z = size_z;
+  }
+
+  HOSTDEVICE void SetRem(int rem_nx, int rem_ny, int rem_nz) {
+    rem_x = rem_nx;
+    rem_y = rem_ny;
+    rem_z = rem_nz;
+  }
+};
 }  // namespace kernel_primitives
 }  // namespace operators
 }  // namespace paddle
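Note (not part of the patch): DimConfig is how the host launcher now hands the launch geometry (split counts, per-block deal sizes, remainders) to kernels that cannot read blockDim/gridDim on XPU2. A standalone host-only sketch of how it is filled, mirroring the DimConfig/SetRem calls in LaunchReduceKernel further down in this diff; the struct is a trimmed copy so the snippet compiles on its own:

#include <cstdio>

#define HOSTDEVICE  // host-only stand-in for the Paddle macro in this sketch

// Trimmed copy of the DimConfig added above, for a self-contained demo.
struct DimConfig {
  int split_num_x, split_num_y, split_num_z;
  int deal_size_x, deal_size_y, deal_size_z;
  int rem_x, rem_y, rem_z;
  HOSTDEVICE explicit DimConfig(int sx, int sy, int sz, int dx, int dy, int dz)
      : split_num_x(sx), split_num_y(sy), split_num_z(sz),
        deal_size_x(dx), deal_size_y(dy), deal_size_z(dz),
        rem_x(0), rem_y(0), rem_z(0) {}
  HOSTDEVICE void SetRem(int rx, int ry, int rz) { rem_x = rx; rem_y = ry; rem_z = rz; }
};

int main() {
  // Mirrors how LaunchReduceKernel builds its DimConfig below:
  // splits = grid dims, deal sizes = block dims, rem_x = reduce_num % block.x.
  int grid_x = 64, grid_y = 4, grid_z = 1, block_x = 256, block_y = 1;
  int reduce_num = 1000;
  DimConfig dim(grid_x, grid_y, grid_z, block_x, block_y, 0);
  dim.SetRem(reduce_num % block_x, 0, 0);
  printf("rem_x = %d\n", dim.rem_x);  // 1000 % 256 = 232
}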
@@ -13,11 +13,45 @@
 // limitations under the License.
 #pragma once
+#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
+#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#ifdef PADDLE_WITH_XPU2
+#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
+#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
+#define THREAD_ID_X core_id()
+#define THREAD_ID_Y 0
+#define THREAD_ID_Z 0
+#define BLOCK_NUM_X core_num()
+#define BLOCK_NUM_Y 0
+#define BLOCK_NUM_Z 0
+#define BLOCK_ID_X cluster_id()
+#define BLOCK_ID_Y 0
+#define BLOCK_ID_Z 0
+#define GRID_NUM_X cluster_num()
+#define GRID_NUM_Y 0
+#define GRID_NUM_Z 0
+#else
 #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
 #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define THREAD_ID_Z threadIdx.z
+#define BLOCK_NUM_X blockDim.x
+#define BLOCK_NUM_Y blockDim.y
+#define BLOCK_NUM_Z blockDim.z
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_ID_Z blockIdx.z
+#define GRID_NUM_X gridDim.x
+#define GRID_NUM_Y gridDim.y
+#define GRID_NUM_Z gridDim.z
+#endif
 namespace paddle {
 namespace operators {
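Note (not part of the patch): these macros let one kernel body compute thread/block indices on both backends; on CUDA they expand to threadIdx/blockIdx/blockDim/gridDim, on XPU2 to core_id()/cluster_id()/core_num()/cluster_num(). A host-only sketch with the macros pinned to made-up constants, just to show the indexing pattern kernels write against them:

#include <cstdio>

// Host-only demo values; on device these would be the backend intrinsics above.
#define THREAD_ID_X 3
#define BLOCK_ID_X 2
#define BLOCK_NUM_X 128
#define GRID_NUM_X 8

int main() {
  // The global linear id and grid-stride step written against the macros work
  // unchanged for either backend; only the macro definitions differ.
  int global_id = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
  int stride = BLOCK_NUM_X * GRID_NUM_X;
  printf("global_id = %d, grid stride = %d\n", global_id, stride);  // 259, 1024
}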
...
@@ -130,7 +130,7 @@ __global__ void AddMarginToPositiveLogitsKernel(
 template <typename Tx, typename Ty = Tx>
 struct ExpAndSum {
-  using Transformer = kpds::ExpLogitTransformer<Tx>;
+  using Transformer = kps::ExpFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -159,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
                                         const int64_t N, const int64_t D) {
   CUDA_KERNEL_LOOP(i, N * D) {
     auto row = i / D;
-    logits[i] -= kpds::LogFunctor(logits_sum_per_row[row]);
+    logits[i] -= kps::details::Log(logits_sum_per_row[row]);
   }
 }
@@ -174,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
     if ((col + start_index) == labels[row]) {
       auto softmax = log_softmax[i];
       loss[row] = -softmax;
-      log_softmax[i] = kpds::ExpFunctor(softmax);
+      log_softmax[i] = kps::details::Exp(softmax);
     } else {
-      log_softmax[i] = kpds::ExpFunctor(log_softmax[i]);
+      log_softmax[i] = kps::details::Exp(log_softmax[i]);
     }
   }
 }
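Note (not part of the patch): these call sites now use the type-overloaded Exp/Log helpers under kps::details and the kps::ExpFunctor transformer; their definitions live in functor_primitives.h, which this diff does not show. An illustrative host-side stand-in modeled on the removed kpds overloads above, not the actual Paddle implementation:

#include <cmath>
#include <cstdio>

// Sketch of type-dispatched helpers in the style of the removed kpds::ExpFunctor
// and kpds::LogFunctor overloads; treat these as assumptions, not the real API.
static inline float Exp(float x) { return expf(x); }
static inline double Exp(double x) { return exp(x); }
static inline float Log(float x) { return logf(x); }
static inline double Log(double x) { return log(x); }

int main() {
  printf("%f %f\n", Exp(1.0f), Log(2.718281828459045));  // ~2.718282 ~1.000000
}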
...
@@ -24,11 +24,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-namespace kpds = paddle::operators::kernel_primitives::details;
+namespace kps = paddle::operators::kernel_primitives;

 template <typename Tx, typename Ty = Tx>
 struct CustomMin {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::max());
@@ -41,7 +41,7 @@ struct CustomMin {
 template <typename Tx, typename Ty = Tx>
 struct CustomMax {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
@@ -55,7 +55,7 @@ struct CustomMax {
 // for cub::Reduce
 template <typename Tx, typename Ty = Tx>
 struct CustomSum {
-  using Transformer = kpds::IdentityFunctor<Tx, Ty>;
+  using Transformer = kps::IdentityFunctor<Tx, Ty>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -66,7 +66,7 @@ struct CustomSum {
 template <typename Tx, typename Ty = Tx>
 struct CustomMean {
-  using Transformer = kpds::DivideFunctor<Tx>;
+  using Transformer = kps::DivideFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -77,7 +77,7 @@ struct CustomMean {
 template <typename Tx, typename Ty = Tx>
 struct CustomMul {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(1.0f); }
@@ -88,7 +88,7 @@ struct CustomMul {
 template <typename Tx, typename Ty = Tx>
 struct CustomLogicalOr {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(false); }
@@ -99,7 +99,7 @@ struct CustomLogicalOr {
 template <typename Tx, typename Ty = Tx>
 struct CustomLogicalAnd {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(true); }
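Note (not part of the patch): each CustomXxx op pairs an initial value with a Transformer that is applied per element before the binary reduction; mean, for example, is a sum whose Transformer divides by the reduce length up front. A host-side sketch of that composition, with DivideFunctor as a local stand-in for kps::DivideFunctor:

#include <cstdio>

// Local stand-in for kps::DivideFunctor: pre-scales each element by 1/n.
template <typename T>
struct DivideFunctor {
  explicit DivideFunctor(int n) : n_inv(static_cast<T>(1.0 / n)) {}
  T operator()(const T& x) const { return x * n_inv; }
  T n_inv;
};

int main() {
  float data[4] = {2.f, 4.f, 6.f, 8.f};
  int reduce_num = 4;
  DivideFunctor<float> transformer(reduce_num);  // CustomMean's Transformer
  float acc = 0.0f;                              // CustomMean::initial()
  for (float x : data) acc += transformer(x);    // reducer(acc, transformer(x))
  printf("mean = %g\n", acc);                    // 5
}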
...
@@ -165,10 +165,93 @@ struct IndexCalculator {
   framework::Array<platform::FastDivMod, kMaxRank> divmoders;
 };

+template <bool ReduceLastDim = false>
+struct ReduceIndexMapping {
+  const kps::DimConfig dim;
+  HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
+      : dim(dims) {}
+  __device__ __forceinline__ int BlockIdX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    } else {
+      return cluster_id() % dim.split_num_x;
+    }
+#else
+    return blockIdx.x;
+#endif
+  }
+  __device__ __forceinline__ int BlockIdY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() % dim.split_num_x);
+    } else {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    }
+#else
+    return blockIdx.y;
+#endif
+  }
+  __device__ __forceinline__ int BlockDimX() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_x;
+#else
+    return blockDim.x;
+#endif
+  }
+  __device__ __forceinline__ int BlockDimY() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_y;
+#else
+    return blockDim.y;
+#endif
+  }
+  __device__ __forceinline__ int GridDimX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_y;
+    } else {
+      return dim.split_num_x;
+    }
+#else
+    return gridDim.x;
+#endif
+  }
+  __device__ __forceinline__ int GridDimY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_x;
+    } else {
+      return dim.split_num_y;
+    }
+#else
+    return gridDim.y;
+#endif
+  }
+  __device__ __forceinline__ int GetLoopSize() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.deal_size_y;
+    } else {
+      return dim.deal_size_x;
+    }
+#else
+    return 1;
+#endif
+  }
+};
+
 // when reduce_type == kReduceLastDim this struct will be used
 // for higher performance
-struct LastDimIndexCal {
-  explicit LastDimIndexCal(int num) : stride(num) {}
+struct OneDimIndexCal {
+  explicit OneDimIndexCal(int num) : stride(num) {}

   __device__ inline int operator()(int index) const { return index * stride; }

   int stride;
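Note (not part of the patch): on XPU2 there is only a linear cluster_id(), so ReduceIndexMapping rebuilds the logical (x, y) block coordinates from it; on CUDA the methods just forward to blockIdx/blockDim/gridDim. A CPU model of the BlockIdX/BlockIdY arithmetic above, with illustrative split counts:

#include <cstdio>

int main() {
  int split_num_x = 4, split_num_y = 2;   // dim.split_num_x / dim.split_num_y
  bool reduce_last_dim = false;
  for (int cluster_id = 0; cluster_id < split_num_x * split_num_y; ++cluster_id) {
    int bx, by;
    if (reduce_last_dim) {
      bx = cluster_id / split_num_x % split_num_y;
      by = cluster_id % split_num_x;
    } else {
      bx = cluster_id % split_num_x;
      by = cluster_id / split_num_x % split_num_y;
    }
    printf("cluster %d -> block (%d, %d)\n", cluster_id, bx, by);
  }
}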
@@ -331,8 +414,16 @@ struct ReduceConfig {
     if (rank == reduce_rank || is_last_dim) {
       reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
     } else if (reduce_rank == 1) {
       // ReduceFirstDim and reduceSecondDim
+#ifdef PADDLE_WITH_XPU2
+      if (reduce_dim[0] == 0) {
+        reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+      } else {
+        reduce_type = static_cast<int>(ReduceType::kReduceAny);
+      }
+#else
       reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+#endif
     } else {
       reduce_type = static_cast<int>(ReduceType::kReduceAny);
     }
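Note (not part of the patch): a worked example of the branch above as a small host function; SelectTypeXpu2 is a simplified stand-in for ReduceConfig::SetReduceType, covering only the cases visible in this hunk:

#include <cstdio>
#include <vector>

enum ReduceType { kReduceLastDim = 1, kReduceHigherDim = 2, kReduceAny = 3 };

// With a single reduce axis, XPU2 only takes the kReduceHigherDim path when
// that axis is the outermost one; any other single axis falls back to kReduceAny.
int SelectTypeXpu2(const std::vector<int>& x_dim, const std::vector<int>& reduce_dim) {
  bool is_last_dim = reduce_dim.size() == 1 &&
                     reduce_dim[0] == static_cast<int>(x_dim.size()) - 1;
  if (reduce_dim.size() == x_dim.size() || is_last_dim) return kReduceLastDim;
  if (reduce_dim.size() == 1)
    return reduce_dim[0] == 0 ? kReduceHigherDim : kReduceAny;
  return kReduceAny;
}

int main() {
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {0}));  // 2: outermost dim -> kReduceHigherDim
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {1}));  // 3: middle dim -> kReduceAny
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {2}));  // 1: last dim -> kReduceLastDim
}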
@@ -408,59 +499,61 @@ struct ReduceConfig {
   // for ReduceHigherDim: if block is enough -> splite reduce_num
   //                      else init block(32, 1) grid(block_num, 1)
   // for others: block(block_num, 1) , grid(left_num, 1)
-  void SetBlockDim() {
-    // init
-    int block_num = details::GetBlockDim(reduce_num);
-    should_reduce_again = false;
-    dim3 block_dim(block_num, 1);
-    dim3 grid_dim(left_num, 1);
-    blocking_size = reduce_num;
-    if (reduce_type == ReduceType::kReduceHigherDim) {
+  void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
     int last_dim_num = x_dim.back();
     // update left_num
     int grid_z = left_num / last_dim_num;
     left_num = last_dim_num;
-      block_dim.z = 1;
-      grid_dim.z = grid_z;
+    grid_dim->z = grid_z;
     int device_id = platform::GetCurrentDeviceId();
     int max_mp = platform::GetCUDAMultiProcessors(device_id);
     int max_threads_per_mp =
         platform::GetCUDAMaxThreadsPerMultiProcessor(device_id);
     int max_threads = max_threads_per_mp * max_mp;
     // init
     int num_block = (max_threads / left_num);
+    block_dim->x = details::GetBlockDim(left_num);
+    grid_dim->x = details::AlignUp(left_num, block_dim->x);
+    blocking_size = reduce_num;
     if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
       blocking_size = details::GetLastPow2(reduce_num / num_block);
       if (blocking_size <= 1) {
         blocking_size = details::GetLastPow2(sqrt(reduce_num));
       } else if (blocking_size * 2 < reduce_num) {
         blocking_size *= 2;
       }
       should_reduce_again = true;
+      grid_dim->y = details::AlignUp(reduce_num, blocking_size);
+    }
+  }
-      block_dim.x = details::GetBlockDim(left_num);
-      block_dim.y = 1;
-      grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
-      grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size;
+  void SetBlockDim() {
+    // init
+    int block_num = details::GetBlockDim(reduce_num);
+    should_reduce_again = false;
+    dim3 block_dim(block_num, 1, 1);
-    } else {
-      block_dim.x = details::GetBlockDim(left_num);
-      block_dim.y = 1;
+    dim3 grid_dim(left_num, 1, 1);
     blocking_size = reduce_num;
-      grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
+#ifdef PADDLE_WITH_XPU2
+    if (reduce_last_dim) {
+      block_dim.x = 128;
+      block_dim.y = reduce_num;
+      grid_dim.x = 8;
+      grid_dim.y = 1;
+    } else {
+      block_dim.x = 128;
+      block_dim.y = left_num;
+      grid_dim.x = 8;
       grid_dim.y = 1;
     }
+#else
+    if (reduce_type == ReduceType::kReduceHigherDim) {
+      SetBlockDimForHigher(&block_dim, &grid_dim);
     } else {
       SetBlockDimForReduceAny(&block_dim, &grid_dim);
     }
+#endif
     block = block_dim;
     grid = grid_dim;
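Note (not part of the patch): a worked example of the SetBlockDimForHigher sizing with made-up device limits; GetLastPow2 and AlignUp are re-implemented locally and assumed to match the details:: helpers used above:

#include <algorithm>
#include <cstdio>

static int GetLastPow2(int n) {
  n |= (n >> 1); n |= (n >> 2); n |= (n >> 4); n |= (n >> 8); n |= (n >> 16);
  return std::max(1, n - (n >> 1));
}
static int AlignUp(int a, int b) { return (a + b - 1) / b; }  // ceil division

int main() {
  int left_num = 1024, reduce_num = 8192;
  int max_threads = 2048 * 80;                // max_threads_per_mp * max_mp (illustrative)
  int num_block = max_threads / left_num;     // 160 blocks fit, so split reduce_num
  int blocking_size = GetLastPow2(reduce_num / num_block);   // GetLastPow2(51) = 32
  if (blocking_size * 2 < reduce_num) blocking_size *= 2;    // 64 rows handled per block
  int grid_y = AlignUp(reduce_num, blocking_size);           // 128 partial results per column
  printf("blocking_size=%d grid.y=%d\n", blocking_size, grid_y);
}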
@@ -487,72 +580,6 @@
   dim3 block;
   dim3 grid;
 };
-/* size : how many colonms left have to be reduced
- * loop : how many rows data have to be reduced
- * block_size: max rows this block to reduce
- */
-template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
-          typename TransformOp, bool IsBoundary = false>
-__device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
-                                     TransformOp transformer, MPType init,
-                                     int reduce_num, int left_num,
-                                     int block_size) {
-  const int NY = 1;
-  int idx = blockIdx.x * blockDim.x;
-  int idy = blockIdx.y * block_size;
-  // block_offset of rows
-  Tx reduce_input[NY];
-  MPType reduce_compute[NY];
-  MPType result = init;
-  // the offset of this block
-  int block_offset = idy * left_num + idx + blockIdx.z * reduce_num * left_num;
-  const Tx* input = x + block_offset;
-  int store_offset =
-      blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num + idx;
-  // how many columns left
-  int size = left_num - idx;
-  // how many rows have to be reduced
-  int loop = reduce_num - idy;
-  loop = loop > block_size ? block_size : loop;
-  for (int loop_index = 0; loop_index < loop; loop_index += NY) {
-    kps::ReadData<Tx, Tx, 1, NY, 1, IsBoundary>(
-        &reduce_input[0], input + loop_index * left_num, size, NY, 1, left_num);
-    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-        &reduce_compute[0], &reduce_input[0], transformer);
-    kps::Reduce<MPType, NY, 1, 1, ReduceOp,
-                kps::details::ReduceMode::kLocalMode>(
-        &result, &reduce_compute[0], reducer, false);
-  }
-  Ty temp_data = static_cast<Ty>(result);
-  kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
-}
-
-template <typename Tx, typename MPType, typename ReduceOp, typename TransformOp,
-          typename Calculator, bool IsBoundary>
-__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
-                                    ReduceOp reducer, TransformOp transformer,
-                                    MPType init, int reduce_num, int input_idx,
-                                    bool reduce_last_dim,
-                                    const Calculator& reduce_index_calculator,
-                                    int stride, int num) {
-  Tx input_reg[REDUCE_VEC_SIZE];
-  MPType input_compute[REDUCE_VEC_SIZE];
-  MPType input_transform[REDUCE_VEC_SIZE];
-  kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
-  kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, IsBoundary>(
-      &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
-      1, stride, reduce_last_dim);
-  kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-      &input_transform[0], &input_reg[0], transformer);
-  kps::Init<MPType, REDUCE_VEC_SIZE, IsBoundary>(input_compute, input_transform,
-                                                 num);
-  kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
-              kps::details::ReduceMode::kLocalMode>(
-      reduce_var, &input_compute[0], reducer, reduce_last_dim);
-}
-
 // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
 // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
@@ -564,54 +591,76 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
                                 int reduce_num, int left_num,
                                 bool reduce_last_dim,
                                 const Calculator reduce_index_calculator,
-                                const Calculator left_index_calculator) {
+                                const Calculator left_index_calculator,
+                                const kps::DimConfig dim) {
   int input_idx, left_idx, stride;
   int block_size = 0;
   bool need_store = true;
+  int loop_left = 0;
   int tid = 0;
   // the last dim gets involved in reduction
+  int store_offset = 0;
+  int stride_left = 0;
   if (reduce_last_dim) {
-    input_idx = blockIdx.y * blockDim.x;
-    left_idx = blockIdx.x * blockDim.y + threadIdx.y;
-    stride = gridDim.y * blockDim.x;
-    block_size = blockDim.x;
-    need_store = (threadIdx.x == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping<true>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimX();
+    left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y;
+    stride = block.GridDimY() * block.BlockDimX();
+    block_size = block.BlockDimX();
+    need_store = (THREAD_ID_X == 0) && (left_idx < left_num);
+    store_offset = block.BlockIdY() * left_num + left_idx;
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = 1;
     tid = threadIdx.x;
   } else {
-    input_idx = blockIdx.y * blockDim.y;
-    left_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    stride = gridDim.y * blockDim.y;
-    block_size = blockDim.y;
-    need_store = (threadIdx.y == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping<false>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimY();
+    left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X;
+    stride = block.GridDimY() * block.BlockDimY();
+    block_size = block.BlockDimY();
+    need_store = (THREAD_ID_Y == 0) && (left_idx < left_num);
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = block.BlockDimX() * block.GridDimX();
+    store_offset = block.BlockIdY() * left_num + left_idx;
     tid = threadIdx.y;
   }
-  int store_offset = blockIdx.y * left_num + left_idx;
   // calculate the offset, means the addr where each thread really start.
-  int input_offset = left_index_calculator(left_idx);
+  // 1. reduce for each thread
+  MPType input_compute[REDUCE_VEC_SIZE];
+  Tx input_reg[REDUCE_VEC_SIZE];
+  for (int i = 0; i < loop_left; i += stride_left) {
+    int input_offset = left_index_calculator(left_idx + i);
     const Tx* input = x + input_offset;
     MPType reduce_var = init;
-  Ty store_data;
-  // 1. reduce for each thread
-  if (left_idx < left_num) {
     // load REDUCE_VEC_SIZE data once, and then compute
     int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
     for (; input_idx + block_size < bound;
          input_idx += REDUCE_VEC_SIZE * stride) {
-      ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, false>(
-          input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
-          reduce_last_dim, reduce_index_calculator, stride, reduce_num);
+      kps::ReadDataReduce<Tx, Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator,
+                          kps::IdentityFunctor<Tx>, false>(
+          &input_reg[0], input, input_idx, reduce_index_calculator, 1,
+          reduce_num, 1, stride, kps::IdentityFunctor<Tx>(), reduce_last_dim);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &input_compute[0], &input_reg[0], transformer);
+      kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &input_compute[0], reducer, reduce_last_dim);
     }
-    int num = (reduce_num - input_idx - tid + stride - 1) / stride;
-    ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, true>(
-        input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
-        input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
-  }
+    kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
+    kps::ReadDataReduce<Tx, MPType, 1, REDUCE_VEC_SIZE, 1, 1, Calculator,
+                        TransformOp, true>(
+        &input_compute[0], input, input_idx, reduce_index_calculator, 1,
+        reduce_num - input_idx, 1, stride, transformer, reduce_last_dim);
+    kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
+                kps::details::ReduceMode::kLocalMode>(
+        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
     kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
     if (need_store) {
-      y[store_offset] = static_cast<Ty>(reduce_var);
+      y[store_offset + i] = static_cast<Ty>(reduce_var);
+    }
   }
 }
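Note (not part of the patch): a CPU model of the per-thread loop structure in ReduceAnyKernel above — full REDUCE_VEC_SIZE chunks are accumulated first, then a bounded tail (the IsBoundary=true ReadDataReduce call) finishes the column. Strides are collapsed to 1 so the demo stays a plain serial sum:

#include <cstdio>

constexpr int REDUCE_VEC_SIZE = 4;  // vector width used by the kernel (value assumed here)

int main() {
  float col[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int reduce_num = 10;
  float reduce_var = 0.f;  // init
  int idx = 0;
  // full vectors
  for (; idx + REDUCE_VEC_SIZE <= reduce_num; idx += REDUCE_VEC_SIZE) {
    for (int k = 0; k < REDUCE_VEC_SIZE; ++k) reduce_var += col[idx + k];
  }
  // boundary tail
  for (int k = idx; k < reduce_num; ++k) reduce_var += col[k];
  printf("sum = %g\n", reduce_var);  // 55
}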
@@ -620,21 +669,55 @@ template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
 __global__ void ReduceHigherDimKernel(const Tx* x, Ty* y, ReduceOp reducer,
                                       TransformOp transformer, MPType init,
                                       int reduce_num, int left_num,
-                                      int blocking_size) {
+                                      int blocking_size,
+                                      const kps::DimConfig dim) {
   // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
   // function will be used
-  // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
-  // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx /
-  // 32
-  // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
-  int idx = blockIdx.x * blockDim.x;
-  int size = left_num - idx;
-  if (size >= blockDim.x) {  // complete segment
-    HigherDimDealSegment<Tx, Ty, MPType, ReduceOp, TransformOp>(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
-  } else {
-    HigherDimDealSegment<Tx, Ty, MPType, ReduceOp, TransformOp, true>(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
+  auto block = ReduceIndexMapping<false>(dim);
+  int idy = block.BlockIdY() * blocking_size;
+  int idx = block.BlockIdX() * block.BlockDimX();
+  int idz = BLOCK_ID_Z * left_num;
+  int stride = dim.split_num_x * dim.deal_size_x;
+  int size = left_num - dim.rem_x;
+  int loop_size = min(reduce_num - idy, blocking_size);
+  int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
+  int block_offset = idy * left_num + idz * reduce_num;
+  const Tx* input = x + block_offset;
+  Tx reduce_input;
+  for (; idx < size; idx += stride) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
+                                            input + loop_idx * left_num + idx,
+                                            block.BlockDimX(), 1, 1, left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType, 1, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, false>(y + store_offset + idx, &result,
+                                       block.BlockDimX());
+  }
+  if (idx < left_num) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
+                                           input + loop_idx * left_num + idx,
+                                           dim.rem_x, 1, 1, left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType, 1, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, true>(y + store_offset + idx, &result,
+                                      dim.rem_x);
   }
 }
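Note (not part of the patch): a CPU rendering of the addressing used by ReduceHigherDimKernel for an input laid out as {nz, ny, nx} and reduced over ny; each (z, x) column accumulates reduce_num = ny elements spaced left_num = nx apart. Shape values are illustrative:

#include <cstdio>

int main() {
  const int nz = 2, ny = 3, nx = 4;          // reduce_num = ny, left_num = nx
  float x[nz * ny * nx], y[nz * nx];
  for (int i = 0; i < nz * ny * nx; ++i) x[i] = 1.0f * i;
  for (int idz = 0; idz < nz; ++idz) {
    for (int idx = 0; idx < nx; ++idx) {
      int block_offset = idz * ny * nx;      // z slice start, as in idz * reduce_num * left_num
      float acc = 0.f;
      for (int idy = 0; idy < ny; ++idy)
        acc += x[block_offset + idy * nx + idx];  // stride between reduced rows = left_num
      y[idz * nx + idx] = acc;
    }
  }
  printf("y[0]=%g y[nx]=%g\n", y[0], y[nx]);  // 12 and 48
}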
@@ -648,14 +731,27 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
     int stride_reduce = 1;
     int stride_left = config.reduce_num;
     // for higher performance
-    auto reduce_index_calculator = LastDimIndexCal(stride_reduce);
-    auto left_index_calculator = LastDimIndexCal(stride_left);
+    auto reduce_index_calculator = OneDimIndexCal(stride_reduce);
+    auto left_index_calculator = OneDimIndexCal(stride_left);
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.block.y, 0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
+                    OneDimIndexCal><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        init, config.reduce_num, config.left_num, config.reduce_last_dim,
+        reduce_index_calculator, left_index_calculator, dim);
+#else
     ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
-                    LastDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         init, config.reduce_num, config.left_num, config.reduce_last_dim,
-        reduce_index_calculator, left_index_calculator);
+        reduce_index_calculator, left_index_calculator, dim);
+#endif
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -665,11 +761,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
                               config.x_strides);
     auto left_index_calculator = IndexCalculator(
         left_rank, config.left_dim, config.left_strides, config.x_strides);
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.block.y, 0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
+                    IndexCalculator><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        init, config.reduce_num, config.left_num, config.reduce_last_dim,
+        reduce_index_calculator, left_index_calculator, dim);
+#else
     ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
                     IndexCalculator><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         init, config.reduce_num, config.left_num, config.reduce_last_dim,
-        reduce_index_calculator, left_index_calculator);
+        reduce_index_calculator, left_index_calculator, dim);
+#endif
   }

   if (config.should_reduce_again) {
@@ -683,12 +793,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
       grid = dim3(config.grid.x, 1, config.grid.z);
     }
+    auto last_index = OneDimIndexCal(1);
+    auto first_index = OneDimIndexCal(config.left_num);
+    kps::DimConfig dim =
+        kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+    dim.SetRem(config.left_num % block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel<Ty, Ty, MPType, ReduceOp,
+                          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+        config.output_data, y_data, reducer,
+        kps::IdentityFunctor<Ty, MPType>(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#else
     ReduceHigherDimKernel<
         Ty, Ty, MPType, ReduceOp,
-        kps::details::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+        kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
-        kps::details::IdentityFunctor<Ty, MPType>(config.grid.y), init,
-        config.grid.y, config.left_num, config.grid.y);
+        kps::IdentityFunctor<Ty, MPType>(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#endif
   }
 }
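Note (not part of the patch): a CPU model of the should_reduce_again path above — pass one writes config.grid.y partial rows per output element, pass two folds those partials with an identity transformer. Sizes are illustrative:

#include <cstdio>

int main() {
  const int left_num = 3, reduce_num = 8, grid_y = 2, blocking_size = 4;
  float x[reduce_num * left_num], partial[grid_y * left_num], y[left_num];
  for (int i = 0; i < reduce_num * left_num; ++i) x[i] = 1.0f;
  // pass 1: each "block row" reduces blocking_size rows into one partial row
  for (int by = 0; by < grid_y; ++by)
    for (int j = 0; j < left_num; ++j) {
      float acc = 0.f;
      for (int r = 0; r < blocking_size; ++r)
        acc += x[(by * blocking_size + r) * left_num + j];
      partial[by * left_num + j] = acc;
    }
  // pass 2: reduce the grid_y partials per column (identity transform)
  for (int j = 0; j < left_num; ++j) {
    float acc = 0.f;
    for (int by = 0; by < grid_y; ++by) acc += partial[by * left_num + j];
    y[j] = acc;
  }
  printf("y = {%g, %g, %g}\n", y[0], y[1], y[2]);  // 8 8 8
}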
@@ -699,7 +822,7 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
                              gpuStream_t stream) {
   auto x_dim = framework::vectorize<int>(x.dims());
   auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
-  config.Run();  // get the parameters of LaunchReduceKernel
+  config.Run();
   int numel = x.numel();
   // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
@@ -759,23 +882,49 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
   if (config.reduce_type == ReduceType::kReduceHigherDim) {
     using TransformOp = typename ReduceOp<Tx, MPType>::Transformer;
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.blocking_size, 0);
+    dim.SetRem(config.left_num % config.block.x,
+               config.reduce_num % config.blocking_size, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel<Tx, Ty, MPType, ReduceOp<Tx, MPType>,
+                          TransformOp><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        reducer.initial(), config.reduce_num, config.left_num,
+        config.blocking_size, dim);
+#else
     ReduceHigherDimKernel<
         Tx, Ty, MPType, ReduceOp<Tx, MPType>,
         TransformOp><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         reducer.initial(), config.reduce_num, config.left_num,
-        config.blocking_size);
+        config.blocking_size, dim);
+#endif
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
       dim3 grid = dim3(config.grid.x, 1, config.grid.z);
-      ReduceHigherDimKernel<Ty, Ty, MPType, ReduceOp<Tx, MPType>,
-                            kps::details::IdentityFunctor<
-                                Ty, MPType>><<<grid, block, 0, stream>>>(
-          config.output_data, y_data, reducer,
-          kps::details::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(), config.grid.y, config.left_num, config.grid.y);
+      kps::DimConfig dim2 =
+          kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+      dim2.SetRem(config.left_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp<Tx, MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+          config.output_data, y_data, reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#else
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp<Tx, MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+          config.output_data, y_data, reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#endif
     }
     return;
   }
...