PaddlePaddle / Paddle · Commit 1a0cd447
Unverified commit, authored on Aug 23, 2022 by niuliling123 and committed by GitHub on Aug 23, 2022.
Delete the template parameter BLockSize in Kernel Primitive API (#45220)
Parent: 3a7b1810
Showing 13 changed files with 136 additions and 295 deletions (+136, -295)
paddle/fluid/operators/dropout_impl.cu.h                   +18 -18
paddle/fluid/operators/fused/attn_bias_add.cu.h            +5  -6
paddle/phi/kernels/funcs/broadcast_function.h              +2  -2
paddle/phi/kernels/funcs/distribution_helper.h             +3  -4
paddle/phi/kernels/funcs/elementwise_base.h                +9  -9
paddle/phi/kernels/funcs/index_impl.cu.h                   +6  -6
paddle/phi/kernels/funcs/reduce_function.h                 +20 -35
paddle/phi/kernels/funcs/select_impl.cu.h                  +17 -17
paddle/phi/kernels/gpudnn/softmax_gpudnn.h                 +20 -24
paddle/phi/kernels/primitive/compute_primitives.h          +11 -63
paddle/phi/kernels/primitive/compute_primitives_xpu2.h     +7  -51
paddle/phi/kernels/primitive/datamover_primitives.h        +9  -30
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h   +9  -30
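The change is the same at every call site: the BlockSize template argument (always instantiated as 1) is dropped from each Kernel Primitive API (`kps`) instantiation, and each primitive's template declaration loses the corresponding `int` parameter. A minimal before/after sketch of the call-site pattern; the argument names are taken from the dropout diff below, so the snippet is illustrative rather than a verbatim quote of any one file:

    // Before #45220: BlockSize ("1") is the fourth template argument.
    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);

    // After #45220: only T, NX, NY and IsBoundary remain.
    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);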
paddle/fluid/operators/dropout_impl.cu.h

@@ -112,17 +112,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
   auto dst_functor =
       DstMaskFunctor<T, float>(1.0f - dropout_prob, is_upscale_in_train);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
+    kps::WriteData<T, kCount, 1, false>(dst + fix, &dst_mask[0], deal_size);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -130,17 +130,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
+    kps::WriteData<T, kCount, 1, true>(dst + fix, &dst_mask[0], remainder);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
         mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
@@ -233,17 +233,17 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   auto mask_functor = MaskFunctor<T, float>(1.0f - dropout_prob);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
         &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -251,16 +251,16 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
        &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
        &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
        &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
        mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
paddle/fluid/operators/fused/attn_bias_add.cu.h

@@ -73,24 +73,23 @@ __global__ void BroadcastKernelBinary(
   // load in0
   if (use_broadcast[0]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg0, in0, fix, configlists[0], numel);
   } else {
-    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
+    kernel_primitives::ReadData<InT, VecSize, 1>(arg0, in0 + fix, num);
   }
   // load in1
   if (use_broadcast[1]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg1, in1, fix, configlists[1], numel);
   } else {
-    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
+    kernel_primitives::ReadData<InT, VecSize, 1>(arg1, in1 + fix, num);
   }
   // compute
-  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
       result, arg0, arg1, func);
   // store
-  kernel_primitives::WriteData<OutT, VecSize, 1, 1, true>(out + fix, result, num);
+  kernel_primitives::WriteData<OutT, VecSize, 1, true>(out + fix, result, num);
 }
 // bias add forward impl for "[m, n] + [n] = [m, n]"
paddle/phi/kernels/funcs/broadcast_function.h

@@ -266,10 +266,10 @@ __device__ __forceinline__ void LoadData(
   // numel : whole num of output
   // num: how many data will be deal with in this time
   if (need_broadcast) {
-    kps::ReadDataBc<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadDataBc<T, VecSize, 1, IsBoundary>(
         dst, src, block_offset, config, numel, read_lens);
   } else {
-    kps::ReadData<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadData<T, VecSize, 1, IsBoundary>(
         dst, src + block_offset, num, read_lens);
   }
 }
paddle/phi/kernels/funcs/distribution_helper.h

@@ -278,11 +278,10 @@ __global__ void DistributionKernel(size_t size,
   MT args[kCount];
   T result[kCount];
   for (size_t i = idx; i < size; i += total_thread * kCount) {
-    kps::ElementwiseRandom<SType, MT, kCount, 1, DistOp>(&args[0], dist, &state);
-    kps::ElementwiseUnary<MT, T, kCount, 1, 1, TransformOp>(
+    kps::ElementwiseRandom<SType, MT, kCount, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<MT, T, kCount, 1, TransformOp>(
         &result[0], &args[0], trans);
-    kps::WriteData<T, T, kCount, 1, 1, true>(
+    kps::WriteData<T, T, kCount, 1, true>(
         out_data + i, &result[0], size - i, 1, stride, 1);
     __syncthreads();
   }
paddle/phi/kernels/funcs/elementwise_base.h

@@ -519,13 +519,13 @@ struct Loader {
     kps::Init<Type, ArgsT, Index, VecSize>(
         args, static_cast<Type>(1.0f), read_lens);
     if (is_boundary) {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, true>(
          args,
          reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
          num,
          read_lens);
     } else {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, false>(
          args,
          reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
          num,
@@ -595,7 +595,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity, true> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseAny<InT, OutT, VecSize, 1, 1, Arity, Functor>(
+    kps::ElementwiseAny<InT, OutT, VecSize, 1, Arity, Functor>(
        result, args, func);
   }
 };
@@ -606,7 +606,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 0, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseConstant<InT, OutT, VecSize, 1, 1, Functor>(result, func);
+    kps::ElementwiseConstant<InT, OutT, VecSize, 1, Functor>(result, func);
   }
 };
@@ -616,7 +616,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 1, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseUnary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], func);
   }
 };
@@ -627,7 +627,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 2, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], func, read_lens);
   }
 };
@@ -638,7 +638,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseTernary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseTernary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], args[2], func);
   }
 };
@@ -703,7 +703,7 @@ struct ElementwiseWriteDataCallerBc {
     }
 #pragma unroll
     for (int i = 0; i < NumOuts; ++i) {
-      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+      kps::WriteData<OutT, VecSize, 1, IsBoundary>(
          outs[i] + block_offset, dst[i], num, read_lens);
     }
   }
@@ -716,7 +716,7 @@ struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
                  kps::IndexType block_offset,
                  int num,
                  int read_lens) {
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(
        outs[0] + block_offset, src, num, read_lens);
   }
 };
paddle/phi/kernels/funcs/index_impl.cu.h

@@ -36,18 +36,18 @@ __global__ void VectorizedIndexKernel(T *out,
   size_t args[VecSize];
   T result[VecSize];
   for (; data_offset < main_offset; data_offset += stride) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, false>(
+    kps::WriteData<T, VecSize, 1, false>(
         out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
   }
   size_t num = numel - data_offset;
   if (num > 0) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
+    kps::WriteData<T, VecSize, 1, true>(out + data_offset, &result[0], num);
   }
 }
paddle/phi/kernels/funcs/reduce_function.h

@@ -712,7 +712,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                   1,
                   REDUCE_VEC_SIZE,
-                  1,
                   1,
                   Calculator,
                   kps::IdentityFunctor<Tx>,
                   false>(&input_reg[0],
@@ -725,12 +724,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
         stride,
         kps::IdentityFunctor<Tx>(),
         reduce_last_dim);
-    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, TransformOp>(
        &input_compute[0], &input_reg[0], transformer);
     kps::Reduce<MPType,
                 REDUCE_VEC_SIZE,
                 1,
-                1,
                 ReduceOp,
                 kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
@@ -742,7 +740,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                   1,
                   REDUCE_VEC_SIZE,
-                  1,
                   1,
                   Calculator,
                   TransformOp,
                   true>(&input_compute[0],
@@ -758,12 +755,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
     kps::Reduce<MPType,
                 REDUCE_VEC_SIZE,
                 1,
-                1,
                 ReduceOp,
                 kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::kGlobalMode>(
        &reduce_var, &reduce_var, reducer, reduce_last_dim);
     if (is_mean) {
       reduce_var = reduce_var / static_cast<MPType>(reduce_num);
@@ -807,27 +803,22 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
   MPType reduce_var = init;
   MPType reduce_compute = init;
   for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-    kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
-                                          input + loop_idx * left_num + idx,
-                                          block.BlockDimX(),
-                                          1,
-                                          1,
-                                          left_num);
-    kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+    kps::ReadData<Tx, Tx, 1, 1, false>(&reduce_input,
+                                       input + loop_idx * left_num + idx,
+                                       block.BlockDimX(),
+                                       1,
+                                       1,
+                                       left_num);
+    kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
        &reduce_compute, &reduce_input, transformer);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &reduce_compute, reducer, false);
   }
   if (is_mean) {
     reduce_var = reduce_var / static_cast<MPType>(mean_div);
   }
   Ty result = static_cast<Ty>(reduce_var);
-  kps::WriteData<Ty, 1, 1, 1, false>(
+  kps::WriteData<Ty, 1, 1, false>(
      y + store_offset + idx, &result, block.BlockDimX());
 }
@@ -835,20 +826,15 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
   MPType reduce_var = init;
   MPType reduce_compute = init;
   for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-    kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
-                                         input + loop_idx * left_num + idx,
-                                         dim.rem_x,
-                                         1,
-                                         1,
-                                         left_num);
-    kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+    kps::ReadData<Tx, Tx, 1, 1, true>(&reduce_input,
+                                      input + loop_idx * left_num + idx,
+                                      dim.rem_x,
+                                      1,
+                                      1,
+                                      left_num);
+    kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
        &reduce_compute, &reduce_input, transformer);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &reduce_compute, reducer, false);
   }
@@ -856,8 +842,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
     reduce_var = reduce_var / static_cast<MPType>(mean_div);
   }
   Ty result = static_cast<Ty>(reduce_var);
-  kps::WriteData<Ty, 1, 1, 1, true>(
-      y + store_offset + idx, &result, dim.rem_x);
+  kps::WriteData<Ty, 1, 1, true>(y + store_offset + idx, &result, dim.rem_x);
  }
 }
paddle/phi/kernels/funcs/select_impl.cu.h

@@ -71,21 +71,21 @@ __device__ void GetBlockCountImpl(const InT *in,
   int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
   kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
-  kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
-  kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
+  kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
+  kps::ElementwiseUnary<InT, OutT, VecSize, 1, Cast>(
       &temp[0], &in_data[0], Cast());
-  kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<OutT, VecSize, 1, Add, Mode::kLocalMode>(
      &result, &temp[0], Add(), true);
-  kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
+  kps::Reduce<OutT, 1, 1, Add, Mode::kGlobalMode>(
      &result, &result, Add(), true);
   if (store_fix == 0) {
     // first block's fix_size = 0;
     OutT tmp = static_cast<OutT>(0.0f);
-    kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
+    kps::WriteData<OutT, 1, 1, true>(out + store_fix, &tmp, 1);
   }
   // store num of this block
-  kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
+  kps::WriteData<OutT, 1, 1, true>(out + store_fix + 1, &result, 1);
 }
 // Count how many data is not zero in current block
@@ -132,12 +132,12 @@ __device__ void CumsumImpl(
   // set pre_cumsum
   kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
   // load data to arg
-  kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
+  kps::ReadData<InT, InT, VecSize, 1, IsBoundary>(
      &arg[0], in, num, 1, BLOCK_NUM_X, 1);
   // block cumsum
-  kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
+  kps::Cumsum<InT, OutT, Functor>(&result[0], &arg[0], func);
   // result = cumsum_result + pre_cumsum
-  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
+  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, Functor>(
      &result[0], &result[0], &temp[0], func);
   // get the last prefix sum
   if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
@@ -146,7 +146,7 @@ __device__ void CumsumImpl(
   __syncthreads();
   // update pre_cumsum
   *pre_cumsum = max_thread_data;
-  kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
+  kps::WriteData<OutT, OutT, VecSize, 1, IsBoundary>(
      out, &result[0], num, 1, BLOCK_NUM_X, 1);
 }
@@ -189,7 +189,7 @@ struct SelectCaller {
     int64_t in_data[VecSize];
     OutT store_data[VecSize * phi::DDim::kMaxRank];
     // set index
-    kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
+    kps::InitWithDataIndex<int64_t, VecSize, 1>(&in_data[0], data_offset);
     // Get store data according to mask_idt
     kps::OperatorTernary<MT, int64_t, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -215,7 +215,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
                              int num) {
     InT in_data[VecSize];
     OutT store_data[VecSize * phi::DDim::kMaxRank];
-    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
+    kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
     // Get store data according to mask_idt
     kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -244,7 +244,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
     kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
     kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(out, &store_data[0], num);
   }
 };
@@ -285,16 +285,16 @@ __device__ void SelectKernelImpl(OutT *out,
   kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
   kps::Init<MT, VecSize>(&mask_data[0], init_mask);
   // Load mask
-  kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
+  kps::ReadData<MT, VecSize, 1, IsBoundary>(&mask_data[0], mask, num);
   // Cast from MT to int
-  kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
+  kps::ElementwiseUnary<MT, IdT, VecSize, 1, Cast>(
      &mask_idt[0], &mask_data[0], Cast());
   // Get the num of thread only num_thread[1] has data
-  kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<IdT, VecSize, 1, Add, Mode::kLocalMode>(
     &num_thread[0], &mask_idt[0], Add(), true);
   // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
   // thread_fix
-  kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
+  kps::Cumsum<IdT, IdT, Add>(&cumsum_thread[0], &num_thread[0], Add());
   // get thread_fix
   int thread_fix =
      (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
paddle/phi/kernels/gpudnn/softmax_gpudnn.h

@@ -311,9 +311,9 @@ __global__ void WarpSoftmaxForward(T* softmax,
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
-    kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
+    kps::ElementwiseUnary<T, AccT, kVItem, 1, DataTransFunctor<T, AccT>>(
        &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
   }
@@ -321,7 +321,6 @@ __global__ void WarpSoftmaxForward(T* softmax,
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               ReduceMaxFunctor<AccT>,
               kMode::kLocalMode>(
      &max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
@@ -330,15 +329,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
   // compute sum
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
        &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(
        &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
   }
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               kps::AddFunctor<AccT>,
               kMode::kLocalMode>(
      &sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -351,15 +349,15 @@ __global__ void WarpSoftmaxForward(T* softmax,
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnarySubFunctor<AccT>>(
          &out_tmp[i][0][0],
         &sub_data[i][0][0],
         UnarySubFunctor<AccT>(std::log(sum[i])));
     } else {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnaryDivFunctor<AccT>>(
         &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
     }
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -417,9 +415,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
     int ptr = (first_batch + i) * stride;
     const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
     const VecT* grad_v = reinterpret_cast<const VecT*>(&grad[ptr]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &src_reg[i][0], &src_v[0], idx_max_v[i], 0, kWarpSize, flag);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &grad_reg[i][0], &grad_v[0], idx_max_v[i], 0, kWarpSize, flag);
   }
@@ -430,9 +428,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
   const T* grad_ptr = reinterpret_cast<const T*>(&grad_reg[0][0]);
   constexpr int kStep = kBatchSize * kLoopsV * kVSize;
   constexpr int kVItem = kLoopsV * kVSize;
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
     &src_tmp[0][0][0], &src_ptr[0], DataTransFunctor<T, AccT>());
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
     &grad_tmp[0][0][0], &grad_ptr[0], DataTransFunctor<T, AccT>());
   // compute sum
@@ -444,17 +442,15 @@ __global__ void WarpSoftmaxBackward(T* dst,
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
        &sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
   } else {
-    kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+    kps::ElementwiseBinary<AccT, AccT, kStep, 1, kps::MulFunctor<AccT>>(
        &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
        &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -470,17 +466,17 @@ __global__ void WarpSoftmaxBackward(T* dst,
     AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
     AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpMulFunctor<AccT>>(
         &out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::SubFunctor<AccT>>(
        &out_tmp[i][0][0], &gradptr[0], &out[i][0][0], kps::SubFunctor<AccT>());
     } else {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
        &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::MulFunctor<AccT>>(
        &out_tmp[i][0][0], &srcptr[0], &out[i][0][0],
@@ -488,7 +484,7 @@ __global__ void WarpSoftmaxBackward(T* dst,
     }
     VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &dst_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -636,7 +632,7 @@ __global__ void NormalSoftmaxForward(
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
        &max_value, &max_value, kps::MaxFunctor<AccT>(), false);
   }
@@ -647,7 +643,7 @@ __global__ void NormalSoftmaxForward(
     sum += std::exp(value - max_value);
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
@@ -695,7 +691,7 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
     }
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
paddle/phi/kernels/primitive/compute_primitives.h

@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT, typename OutT>
@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
  * in: The register pointer of in, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseUnary(OutT* out,
                                                  const InT* in,
                                                  OpFunc compute) {
@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns computed by each thread.
  * NY: The number of data rows computed by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT>
@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(OutT* out,
                                                   const InT* in1,
                                                   const InT* in2,
@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
   }
 }

-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(
     OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
 #pragma unroll
@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
  * in3: The register pointer of third input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseTernary(
     OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Arity: The size of ins.
  * OpFunc: Compute functor which has an operator() as following:
@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
  * ins: A pointers of array consisting of multiple inputs.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Arity,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
 __device__ __forceinline__ void ElementwiseAny(OutT* out,
                                                InT (*ins)[NX * NY],
                                                OpFunc compute) {
@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT, typename OutT>
@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void CycleBinary(OutT* out,
                                             const InT* in1,
                                             const InT* in2,
@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
  * T: The type of data.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * ReduceFunctor: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           class ReduceFunctor,
           details::ReduceMode Mode>
 __device__ __forceinline__ void Reduce(T* out,
@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
  * out: The register pointer of out, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
  * hiprandStatePhilox4_32_10_t.
  * OutT: the type of out register.
  * ReturnsCount: The number of random data generated by OpFunc.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename T>
@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
  * compute: Compute function which was declared like OpFunc<T>().
  */
-template <typename StateType,
-          typename OutT,
-          int ReturnsCount,
-          int BlockSize,
-          class OpFunc>
+template <typename StateType, typename OutT, int ReturnsCount, class OpFunc>
 __device__ __forceinline__ void ElementwiseRandom(OutT* out,
                                                   OpFunc compute,
                                                   StateType* state) {
@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
  * @template paraments
  * InT: the type of input register.
  * OutT: the type of out register.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename T>
@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
  */
 #define SHARED_SIZE_LIMIT 512
-template <typename InT, typename OutT, int BlockSize, class OpFunc>
+template <typename InT, typename OutT, class OpFunc>
 __device__ __forceinline__ void Cumsum(OutT* out,
                                        const InT* in,
                                        OpFunc compute) {
@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
  * @template paraments
  * InT: the type of input register.
  * OutT: the type of out register.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  *
  * @param
@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
 #define SHARED_SIZE_LIMIT 1024
 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
 // larger than blockDim.x * 2
-template <typename InT, typename OutT, int BlockSize>
+template <typename InT, typename OutT>
 __device__ __forceinline__ void Sort(OutT* out,
                                      const InT* in,
                                      int num,
@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
  * InT: The type of input register.
  * OutT: The type of out register.
  * IndexType: The type of index.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  *
  * @param
@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
  * monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
  * sorted in escending.
  */
-template <typename InT, typename OutT, typename IndexType, int BlockSize>
+template <typename InT, typename OutT, typename IndexType>
 __device__ __forceinline__ void Sort(OutT* out,
                                      IndexType* out_index,
                                      const InT* in,
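Seen from a kernel author's side, a compute primitive now takes exactly the template parameters it uses: the element types, the per-thread tile shape NX * NY, and the functor. A self-contained sketch of the post-change shape (a simplified stand-in written for illustration, not the verbatim kps implementation; ElementwiseUnarySketch and SquareFunctor are hypothetical names):

    // Simplified stand-in shaped like the post-#45220 primitive: no BlockSize.
    template <typename InT, typename OutT, int NX, int NY, class OpFunc>
    __device__ __forceinline__ void ElementwiseUnarySketch(OutT* out,
                                                           const InT* in,
                                                           OpFunc compute) {
    #pragma unroll
      for (int idx = 0; idx < NX * NY; idx++) {
        out[idx] = static_cast<OutT>(compute(in[idx]));  // apply functor per element
      }
    }

    struct SquareFunctor {
      __device__ float operator()(const float& x) const { return x * x; }
    };

    __global__ void SquareKernel(float* out, const float* in) {
      // Each thread owns a 4-element register tile.
      float reg_in[4];
      float reg_out[4];
      for (int i = 0; i < 4; ++i) reg_in[i] = in[threadIdx.x * 4 + i];
      ElementwiseUnarySketch<float, float, 4, 1>(reg_out, reg_in, SquareFunctor());
      for (int i = 0; i < 4; ++i) out[threadIdx.x * 4 + i] = reg_out[i];
    }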
paddle/phi/kernels/primitive/compute_primitives_xpu2.h

@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT, typename OutT>
@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
  * in: The register pointer of in, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseUnary(OutT* out,
                                                  const InT* in,
                                                  OpFunc compute) {
@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns computed by each thread.
  * NY: The number of data rows computed by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT>
@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(OutT* out,
                                                   const InT* in1,
                                                   const InT* in2,
@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
   }
 }

-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(
     OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
   for (int idx = 0; idx < read_lens; ++idx) {
@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
  * in3: The register pointer of third input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseTernary(
     OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * Arity: The size of ins
  * OpFunc: Compute functor which has an operator() as following:
@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
  * ins: A pointers of array consisting of multiple inputs.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Arity,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
 __device__ __forceinline__ void ElementwiseAny(OutT* out,
                                                InT (*ins)[NX * NY],
                                                OpFunc compute) {
@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT, typename OutT>
@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void CycleBinary(OutT* out,
                                             const InT* in1,
                                             const InT* in2,
@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
  * T: The type of data.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * ReduceFunctor: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           class ReduceFunctor,
           details::ReduceMode Mode>
 __device__ __forceinline__ void Reduce(T* out,
@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
  * out: The register pointer of out, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
paddle/phi/kernels/primitive/datamover_primitives.h

@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
  * Ty: The type of data that needs to be stored in registers.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename Tx,
-          typename Ty,
-          int NX,
-          int NY,
-          int BlockSize,
-          bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(Ty* dst,
                                          const Tx* __restrict__ src,
                                          int size_nx,
@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
  * T: The type of data.
  * NX: Each thread load NX data from global memory continuously.
  * NY: Each thread need to load NY rows, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
  * When the number of data processed by this block is less than
@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
  * src: The data pointer of the current block.
  * size: The current block needs to load size data continuously.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(T* dst,
                                          const T* __restrict__ src,
                                          int num) {
@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
   }
 }

-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(T* dst,
                                          const T* __restrict__ src,
                                          int num,
@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
  * NY: Each thread need to load NY rows, only NY = 1 was supported.
  * ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
  * Index: The index of data stored in dst.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
  * When the number of data processed by this block is less than
@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           typename ArgsT,
           int Index,
           bool IsBoundary = false>
@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
  * T: The type of data stored in the global memory.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(T* dst,
                                            const T* __restrict__ src,
@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
  * T: The type of data.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -507,7 +496,6 @@ template <typename Tx,
           typename Ty,
           int NX,
           int NY,
-          int BlockSize,
           int Rank,
           typename IndexCal,
           typename Functor,
@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
  * T: The type of data.
  * NX: The number of data continuously writed by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
  * src: The register pointer, the size is NX * NY.
  * size: The current block needs to load size elements continuously.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(T* dst,
                                           T* __restrict__ src,
                                           int num) {
@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
   }
 }

-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(T* dst,
                                           T* __restrict__ src,
                                           int num,
@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
  * Ty: The type of data that stored in the global memory.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename Tx,
-          typename Ty,
-          int NX,
-          int NY,
-          int BlockSize,
-          bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(Ty* dst,
                                           const Tx* __restrict__ src,
                                           int size_nx,
@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
  * T: The type of data stored in the global memory.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
  * coordinate mapping relationship between output data and input data.
  * total_num_output: Total number of original output.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(T* dst,
                                            const T* __restrict__ src,
@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
  * T: Data type of register.
  * NX: Number of data to initialize.
  * NY: Number of data to initialize, NY only can be 1.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  *
  * @param:
  * dst: The register pointer of the thread, the size is NX.
  * init_data: The register pointer of init data, the size is NX.
  */
-template <typename T, int NX, int NY, int BlockSize>
+template <typename T, int NX, int NY>
 __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
   int thread_offset = block_offset + threadIdx.x * NX;
 #pragma unroll
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
1a0cd447
...
...
@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst,
                                    const Tx _global_ptr_* src,
                                    int size_nx,
...
...
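Ignoring the _global_ptr_ address space, core offsets, and the IsBoundary clipping, the strided ReadData walks an NY-by-NX window and converts Tx to Ty element by element. A hedged host model of just that access pattern:

// Hedged model: NY rows of NX columns, stepping stride_nx in the last dim
// and stride_ny in the first, with a Tx -> Ty cast into registers.
template <typename Tx, typename Ty, int NX, int NY>
void StridedReadModel(Ty* dst, const Tx* src, int stride_nx, int stride_ny) {
  for (int idy = 0; idy < NY; ++idy) {
    for (int idx = 0; idx < NX; ++idx) {
      dst[idy * NX + idx] =
          static_cast<Ty>(src[idy * stride_ny + idx * stride_nx]);
    }
  }
}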
@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num) {
...
...
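The IsBoundary flag is what separates full blocks from the tail: with it set, a lane stops copying once its offset reaches num. A hedged model of that branch, with the core offset passed in directly:

// Hedged model of the boundary check in the continuous ReadData.
template <typename T, int NX, bool IsBoundary>
void ContinuousReadModel(T* dst, const T* src, int thread_offset, int num) {
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && thread_offset + idx >= num) {
      break;  // tail block: fewer than NX elements remain for this core
    }
    dst[idx] = src[thread_offset + idx];
  }
}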
@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num,
...
...
@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
template <typename T,
          int NX,
          int NY,
          int BlockSize,
          typename ArgsT,
          int Index,
          bool IsBoundary>
...
...
@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
...
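The call-site impact of every hunk in this file is the same mechanical one: a dummy template argument disappears. A compilable stand-in pair showing the before/after layout (names are invented; the trailing runtime parameters of the real ReadDataBc are elided above, so they are elided here too):

// Hedged illustration with stand-in primitives, not the real kps API.
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
void OldStyle(T*, const T*, unsigned) {}  // BlockSize was never used

template <typename T, int NX, int NY, bool IsBoundary>
void NewStyle(T*, const T*, unsigned) {}

void CallSites(float* dst, const float* src) {
  OldStyle<float, 4, 1, 1, true>(dst, src, 0);  // old: placeholder "1"
  NewStyle<float, 4, 1, true>(dst, src, 0);     // new: argument dropped
}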
@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -662,7 +651,6 @@ template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          int Rank,
          typename IndexCal,
          typename Functor,
...
...
@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
* T: The type of data.
* NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst,
                          const T* src,
                          int num,
...
...
@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  int thread_offset = core_id() * NX;
  mfence_local();
...
...
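On XPU the lane offset comes from core_id() rather than threadIdx.x, and mfence_local() fences local memory before the stores. A hedged host model of the write loop that follows, with the core index made explicit:

// Hedged model of the continuous XPU WriteData shown above.
template <typename T, int NX, bool IsBoundary>
void WriteDataModel(T* dst, const T* src, int num, int core) {
  int thread_offset = core * NX;  // core_id() * NX in the real body
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && thread_offset + idx >= num) {
      break;
    }
    dst[thread_offset + idx] = src[idx];
  }
}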
@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
                                     const Tx* src,
                                     int size_nx,
...
...
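As with the strided read, the Tx/Ty overload's job is the per-element cast from register type to global-memory type. A hedged model with strides and boundary checks stripped out:

// Hedged model of the converting write: Tx registers cast to Ty on the way
// to global memory (e.g. a float accumulator written back as half).
template <typename Tx, typename Ty, int NX>
void ConvertingWriteModel(Ty* dst, const Tx* src) {
  for (int idx = 0; idx < NX; ++idx) {
    dst[idx] = static_cast<Ty>(src[idx]);
  }
}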
@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
...
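read_lens and total_num_output together bound this broadcast read: each lane loads up to read_lens elements and, under IsBoundary, stops at the last real output. A hedged model with an identity map standing in for the broadcast config:

// Hedged model of the tail clipping in the XPU broadcast read.
template <typename T, bool IsBoundary>
void ReadDataBcTailModel(T* dst, const T* src, int thread_offset,
                         int read_lens, int total_num_output) {
  for (int idx = 0; idx < read_lens; ++idx) {
    int index_output = thread_offset + idx;
    if (IsBoundary && index_output >= total_num_output) {
      break;  // fewer than read_lens outputs remain
    }
    // The real primitive maps index_output through a broadcast config;
    // an identity map stands in for it here.
    dst[idx] = src[index_output];
  }
}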
@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
  int thread_offset = block_offset + core_id() * NX;
#pragma unroll
...
...
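The XPU variant's arithmetic is identical to the CUDA one above except that the lane index comes from core_id(). A short check of the offset formula, modeled with plain ints:

// Hedged check of thread_offset = block_offset + core_id() * NX.
#include <cassert>

int FirstIndex(int block_offset, int core, int nx) {
  return block_offset + core * nx;
}

int main() {
  assert(FirstIndex(/*block_offset=*/64, /*core=*/3, /*nx=*/8) == 88);
  return 0;
}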