delete rank switch in broadcast_function.h for compile (#42645)

8501fb00 · niuliling123 · GitHub · 8ffebb5a · 8501fb00 · 8501fb00
5 changed file
--- a/paddle/fluid/operators/fused/attn_bias_add.cu.h
+++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h
@@ -51,8 +51,7 @@ template <typename InT, typename OutT, int ShapeSize, int VecSize,
 __global__ void BroadcastKernelBinary(
    const InT* __restrict__ in0, const InT* __restrict__ in1, OutT* out,
    phi::Array<bool, MAX_INPUT_NUM> use_broadcast, uint32_t numel,
-    phi::Array<kps::details::BroadcastConfig<ShapeSize>, MAX_INPUT_NUM>
+    phi::Array<kps::details::BroadcastConfig, MAX_INPUT_NUM> configlists,
-        configlists,
    int main_tid, int tail_tid, Functor func) {
  int fix = blockIdx.x * blockDim.x * VecSize;
  int num = tail_tid;
@@ -65,14 +64,14 @@ __global__ void BroadcastKernelBinary(
  // load in0
  if (use_broadcast[0]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
        arg0, in0, fix, configlists[0], numel);
  } else {
    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
  }
  // load in1
  if (use_broadcast[1]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
        arg1, in1, fix, configlists[1], numel);
  } else {
    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
@@ -104,7 +103,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
  int main_tid = numel / (data_per_thread * vec_size * threads);
  int tail_tid = numel % (data_per_thread * vec_size * threads);
-  phi::Array<kps::details::BroadcastConfig<2>, MAX_INPUT_NUM> configlists;
+  phi::Array<kps::details::BroadcastConfig, MAX_INPUT_NUM> configlists;
  phi::Array<bool, MAX_INPUT_NUM> use_broadcast;
  use_broadcast[0] = false;
@@ -115,7 +114,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
  // Here, dims are transposed due to the logic in BroadcastConfig.
  std::vector<int64_t> input1_dims = {n, 1};
  std::vector<int64_t> out_dims = {n, m};
-  configlists[1] = kps::details::BroadcastConfig<2>(out_dims, input1_dims, 2);
+  configlists[1] = kps::details::BroadcastConfig(out_dims, input1_dims, 2);
  auto func = AddFunctor<T>();
  auto stream = ctx.stream();

--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -223,31 +223,42 @@ struct DimensionsTransform {
  }
 };
-template <typename T, int VecSize, int Rank, bool IsBoundary = false>
+template <typename InT, typename OutT, int NumOuts = 1>
-__device__ __forceinline__ void LoadData(
+int GetVecsize(const std::vector<const DenseTensor *> &ins,
-    T *dst,
+               std::vector<DenseTensor *> *outs) {
-    const _ptr_ T *src,
+  int in_vec_size = 4;
-    uint32_t block_offset,
+  int out_vec_size = 4;
-    const kps::details::BroadcastConfig<Rank> &config,
+  if (NumOuts > 1) {
-    int numel,
+    for (int i = 0; i < NumOuts; ++i) {
-    int num,
+      PADDLE_ENFORCE_EQ(
-    int need_broadcast) {
+          (*outs)[i]->dims(),
-  // numel : whole num of output
+          (*outs)[0]->dims(),
-  // num: how many data will be deal with in this time
+          phi::errors::InvalidArgument(
-  if (need_broadcast) {
+              "The shape of each output tensor shall be identical yet, but "
-    kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(
+              "%d-th output tensor`s shape is not.",
-        dst, src, block_offset, config, numel);
+              i));
+      out_vec_size = std::min(
+          phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
+    }
  } else {
-    kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num);
+    out_vec_size = phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
+  }
+  for (auto *in : ins) {
+    auto temp_size = phi::GetVectorizedSize<InT>(in->data<InT>());
+    in_vec_size = in->dims() == (*outs)[0]->dims()
+                      ? std::min(temp_size, in_vec_size)
+                      : in_vec_size;
  }
+  return std::min(out_vec_size, in_vec_size);
 }
-template <typename T, int VecSize, int Rank, bool IsBoundary = false>
+template <typename T, int VecSize, bool IsBoundary = false>
 __device__ __forceinline__ void LoadData(
    T *dst,
    const _ptr_ T *src,
    uint32_t block_offset,
-    const kps::details::BroadcastConfig<Rank> &config,
+    const kps::details::BroadcastConfig &config,
    int numel,
    int num,
    int need_broadcast,
@@ -255,7 +266,7 @@ __device__ __forceinline__ void LoadData(
  // numel : whole num of output
  // num: how many data will be deal with in this time
  if (need_broadcast) {
-    kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(
+    kps::ReadDataBc<T, VecSize, 1, 1, IsBoundary>(
        dst, src, block_offset, config, numel, read_lens);
  } else {
    kps::ReadData<T, VecSize, 1, 1, IsBoundary>(
@@ -269,14 +280,13 @@ template <typename InT,
          int Arity,
          int NumOuts,
          int VecSize,
-          int Rank,
          bool IsBoundary = false>
 __device__ void VectorizedBroadcastKernelImpl(
    const phi::Array<const _ptr_ InT *__restrict__, Arity> &ins,
    phi::Array<_ptr_ OutT *, NumOuts> outs,
    const phi::Array<int, Arity> &use_broadcast,
    uint32_t numel,
-    const phi::Array<kps::details::BroadcastConfig<Rank>, Arity> &configs,
+    const phi::Array<kps::details::BroadcastConfig, Arity> &configs,
    int num,
    int block_offset,
    int read_lens,
@@ -287,7 +297,7 @@ __device__ void VectorizedBroadcastKernelImpl(
 #pragma unroll
  for (int i = 0; i < Arity; i++) {
    kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f), read_lens);
-    LoadData<InT, VecSize, Rank, IsBoundary>(args[i],
+    LoadData<InT, VecSize, IsBoundary>(args[i],
                                       ins[i],
                                       block_offset,
                                       configs[i],
@@ -315,14 +325,13 @@ template <typename InT,
          typename Functor,
          int Arity,
          int NumOuts,
-          int VecSize,
+          int VecSize>
-          int Rank>
 __global__ void VectorizedBroadcastKernel(
    phi::Array<const _ptr_ InT *__restrict__, Arity> ins,
    phi::Array<_ptr_ OutT *, NumOuts> outs,
    phi::Array<int, Arity> use_broadcast,
    uint32_t numel,
-    phi::Array<kps::details::BroadcastConfig<Rank>, Arity> configs,
+    phi::Array<kps::details::BroadcastConfig, Arity> configs,
    int main_offset,
    int tail_tid,
    int read_lens,
@@ -338,7 +347,6 @@ __global__ void VectorizedBroadcastKernel(
                                  Arity,
                                  NumOuts,
                                  VecSize,
-                                  Rank,
                                  false>(ins,
                                         outs,
                                         use_broadcast,
@@ -357,7 +365,6 @@ __global__ void VectorizedBroadcastKernel(
                                  Arity,
                                  NumOuts,
                                  VecSize,
-                                  Rank,
                                  true>(ins,
                                        outs,
                                        use_broadcast,
@@ -376,7 +383,6 @@ __global__ void VectorizedBroadcastKernel(
                                  Arity,
                                  NumOuts,
                                  VecSize,
-                                  Rank,
                                  false>(ins,
                                         outs,
                                         use_broadcast,
@@ -393,7 +399,6 @@ __global__ void VectorizedBroadcastKernel(
                                  Arity,
                                  NumOuts,
                                  VecSize,
-                                  Rank,
                                  true>(ins,
                                        outs,
                                        use_broadcast,
@@ -412,15 +417,14 @@ template <typename InT,
          typename Functor,
          int Arity,
          int NumOuts,
-          int VecSize,
+          int VecSize>
-          int Rank>
+void LaunchBroadcastKernel(
-void LaunchBroadcastKernel(const KPDevice &ctx,
+    const KPDevice &ctx,
    const std::vector<const DenseTensor *> &ins,
    std::vector<DenseTensor *> *outs,
    Functor func,
-                           DimensionsTransform merge_dims) {
+    const phi::Array<kps::details::BroadcastConfig, Arity> &configs) {
  int numel = (*outs)[0]->numel();
-  phi::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
  phi::Array<int, Arity> use_broadcast;
  phi::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
  phi::Array<_ptr_ OutT *, NumOuts> outs_data;
@@ -432,132 +436,41 @@ void LaunchBroadcastKernel(const KPDevice &ctx,
  for (int i = 0; i < Arity; i++) {
    use_broadcast[i] = (ins[i]->numel() != numel);
    ins_data[i] = (const _ptr_ InT *)(ins[i]->data<InT>());
-#ifdef PADDLE_WITH_XPU_KP
-    if (i == 0) {
-      configs[i] = kps::details::BroadcastConfig<Rank>(merge_dims.out_dims,
-                                                       merge_dims.in_dims[0],
-                                                       merge_dims.in_dims[1],
-                                                       merge_dims.dim_size);
-    } else if (i == 1) {
-      configs[i] = kps::details::BroadcastConfig<Rank>(merge_dims.out_dims,
-                                                       merge_dims.in_dims[1],
-                                                       merge_dims.in_dims[0],
-                                                       merge_dims.dim_size);
-    }
-#else
-    if (use_broadcast[i]) {
-      // get the broadcast config,
-      // if data shape is[m, n], then you should set data_dim = {n, m}
-      // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
-      configs[i] = kps::details::BroadcastConfig<Rank>(
-          merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size);
-    }
-#endif
  }
 #ifdef PADDLE_WITH_XPU_KP
  const int threads = 64;
  const int blocks = 8;
  int read_lens = configs[0].buf_len;
+  auto stream = ctx.x_context()->xpu_stream;
  int main_offset = (numel / (read_lens * threads)) * read_lens * threads;
  int tail_tid = numel % (read_lens * threads);
-  auto stream = ctx.x_context()->xpu_stream;
-  if (configs[0].cmp_type != kps::details::OptType::CanNotOptimize) {
-    main_offset = numel;
-    VectorizedBroadcastKernel<InT,
-                              OutT,
-                              Functor,
-                              Arity,
-                              NumOuts,
-                              512,
-                              Rank><<<blocks, threads, stream>>>(ins_data,
-                                                                 outs_data,
-                                                                 use_broadcast,
-                                                                 numel,
-                                                                 configs,
-                                                                 main_offset,
-                                                                 tail_tid,
-                                                                 read_lens,
-                                                                 func);
-  } else {
-    VectorizedBroadcastKernel<InT,
-                              OutT,
-                              Functor,
-                              Arity,
-                              NumOuts,
-                              256,
-                              Rank><<<blocks, threads, stream>>>(ins_data,
-                                                                 outs_data,
-                                                                 use_broadcast,
-                                                                 numel,
-                                                                 configs,
-                                                                 main_offset,
-                                                                 tail_tid,
-                                                                 read_lens,
-                                                                 func);
-  }
 #else
-  const int threads = 256;
+  auto gpu_config =
-  int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
+      phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
-  int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
+  int read_lens = VecSize;
-  int tail_tid = numel % (VecSize * threads);
  auto stream = ctx.stream();
+  auto threads = gpu_config.thread_per_block;
+  auto blocks = gpu_config.block_per_grid;
+  int main_offset = (numel / (read_lens * gpu_config.GetBlockSize())) *
+                    read_lens * gpu_config.GetBlockSize();
+  int tail_tid = numel % (read_lens * gpu_config.GetBlockSize());
+#endif
  VectorizedBroadcastKernel<InT,
                            OutT,
                            Functor,
                            Arity,
                            NumOuts,
-                            VecSize,
+                            VecSize><<<blocks, threads, 0, stream>>>(
-                            Rank><<<blocks, threads, 0, stream>>>(ins_data,
+      ins_data,
      outs_data,
      use_broadcast,
      numel,
      configs,
      main_offset,
      tail_tid,
-                                                                  VecSize,
+      read_lens,
      func);
-#endif
-}
-template <typename InT,
-          typename OutT,
-          typename Functor,
-          int Arity,
-          int NumOuts,
-          int VecSize>
-void BroadcastKernelForDifferentDimSize(
-    const KPDevice &ctx,
-    const std::vector<const DenseTensor *> &ins,
-    std::vector<DenseTensor *> *outs,
-    int axis,
-    Functor func) {
-  const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis);
-#define CALL_BROADCAST_FOR_DIM_SIZE(rank)                                     \
-  case rank: {                                                                \
-    LaunchBroadcastKernel<InT, OutT, Functor, Arity, NumOuts, VecSize, rank>( \
-        ctx, ins, outs, func, merge_dims);                                    \
-  } break;
-  switch (merge_dims.dim_size) {
-    CALL_BROADCAST_FOR_DIM_SIZE(1);
-    CALL_BROADCAST_FOR_DIM_SIZE(2);
-    CALL_BROADCAST_FOR_DIM_SIZE(3);
-    CALL_BROADCAST_FOR_DIM_SIZE(4);
-    CALL_BROADCAST_FOR_DIM_SIZE(5);
-    CALL_BROADCAST_FOR_DIM_SIZE(6);
-    CALL_BROADCAST_FOR_DIM_SIZE(7);
-    CALL_BROADCAST_FOR_DIM_SIZE(8);
-    default: {
-      PADDLE_THROW(phi::errors::InvalidArgument(
-          "The maximum dimension of input tensor is expected to be less than "
-          "%d, but received %d.",
-          merge_dims.dim_size,
-          phi::DDim::kMaxRank));
-    }
-  }
-#undef CALL_BROADCAST_FOR_DIM_SIZE
 }
 template <ElementwiseType ET,
@@ -574,79 +487,82 @@ void BroadcastKernelForDifferentVecSize(
  using Traits = paddle::platform::FunctionTraits<Functor>;
  const int kArity =
      Traits::has_pointer_args ? static_cast<int>(ET) : Traits::arity;
-  PADDLE_ENFORCE_EQ(ins.size(),
+  PADDLE_ENFORCE_EQ(
+      ins.size(),
      kArity,
-                    phi::errors::InvalidArgument(
+      phi::errors::InvalidArgument("The number of inputs is expected to be "
-                        "The number of inputs is expected to be equal to the "
+                                   "equal to the "
-                        "arity of functor. But received: the number of inputs "
+                                   "arity of functor. But recieved: the "
+                                   "number of inputs "
                                   "is %d, the arity of functor is %d.",
                                   ins.size(),
                                   kArity));
-  PADDLE_ENFORCE_LE(kArity,
+  PADDLE_ENFORCE_LE(
+      kArity,
      3,
-                    phi::errors::InvalidArgument(
+      phi::errors::InvalidArgument("Currently only broadcast of ternary is "
-                        "Currently only broadcast of ternary is supported "
+                                   "supported "
                                   "and verified, but received %d.",
                                   kArity));
-  PADDLE_ENFORCE_EQ(outs->size(),
+  PADDLE_ENFORCE_EQ(
+      outs->size(),
      NumOuts,
-                    phi::errors::InvalidArgument(
+      phi::errors::InvalidArgument("Number of outputs shall equal to number "
-                        "Number of outputs shall equal to number of functions, "
+                                   "of functions, "
-                        "but number of outputs is %d, of functions is %d.",
+                                   "but number of outputs is %d, of "
+                                   "functions is %d.",
                                   outs->size(),
                                   NumOuts));
-  int in_vec_size = 4;
-  int out_vec_size = 4;
+  // mergedim and get vec_size
-  if (NumOuts > 1) {
+  const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis);
-    for (int i = 0; i < NumOuts; ++i) {
+  phi::Array<kps::details::BroadcastConfig, kArity> configs;
+// get vec_size
+#ifdef PADDLE_WITH_XPU_KP
  PADDLE_ENFORCE_EQ(
-          (*outs)[i]->dims(),
+      ins.size(),
-          (*outs)[0]->dims(),
+      2,
      phi::errors::InvalidArgument(
-              "The shape of each output tensor shall be identical yet, but "
+          "XPU only support inputs is 2, but received %d", ins.size()));
-              "%d-th output tensor`s shape is not.",
+  configs[0] = kps::details::BroadcastConfig(merge_dims.out_dims,
-              i));
+                                             merge_dims.in_dims[0],
-      out_vec_size = std::min(
+                                             merge_dims.in_dims[1],
-          phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
+                                             merge_dims.dim_size);
-    }
+  configs[1] = kps::details::BroadcastConfig(merge_dims.out_dims,
-  } else {
+                                             merge_dims.in_dims[1],
-    out_vec_size = phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
+                                             merge_dims.in_dims[0],
+                                             merge_dims.dim_size);
+  auto type = kps::details::OptType::CanNotOptimize;
+  bool is_optimize = configs[0].cmp_type != type;
+  int vec_size = is_optimize ? VecSizeL : VecSizeM;
+#else
+  for (int i = 0; i < kArity; i++) {
+    // get the broadcast config,
+    // if data shape is[m, n], then you should set data_dim = {n, m}
+    // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
+    if (ins[i]->numel()) {
+      configs[i] = kps::details::BroadcastConfig(
+          merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size);
    }
-  for (auto *in : ins) {
-    auto temp_size = phi::GetVectorizedSize<InT>(in->data<InT>());
-    in_vec_size = in->dims() == (*outs)[0]->dims()
-                      ? std::min(temp_size, in_vec_size)
-                      : in_vec_size;
  }
-  int vec_size = std::min(out_vec_size, in_vec_size);
+  int vec_size = GetVecsize<InT, OutT, NumOuts>(ins, outs);
+#endif
  switch (vec_size) {
-    case 4: {
+    case VecSizeL: {
-      BroadcastKernelForDifferentDimSize<InT,
+      LaunchBroadcastKernel<InT, OutT, Functor, kArity, NumOuts, VecSizeL>(
-                                         OutT,
+          ctx, ins, outs, func, configs);
-                                         Functor,
-                                         kArity,
-                                         NumOuts,
-                                         4>(ctx, ins, outs, axis, func);
      break;
    }
-    case 2: {
+    case VecSizeM: {
-      BroadcastKernelForDifferentDimSize<InT,
+      LaunchBroadcastKernel<InT, OutT, Functor, kArity, NumOuts, VecSizeM>(
-                                         OutT,
+          ctx, ins, outs, func, configs);
-                                         Functor,
-                                         kArity,
-                                         NumOuts,
-                                         2>(ctx, ins, outs, axis, func);
      break;
    }
-    case 1: {
+    case VecSizeS: {
-      BroadcastKernelForDifferentDimSize<InT,
+      LaunchBroadcastKernel<InT, OutT, Functor, kArity, NumOuts, VecSizeS>(
-                                         OutT,
+          ctx, ins, outs, func, configs);
-                                         Functor,
-                                         kArity,
-                                         NumOuts,
-                                         1>(ctx, ins, outs, axis, func);
      break;
    }
    default: {

--- a/paddle/phi/kernels/primitive/datamover_primitives.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives.h
@@ -82,10 +82,10 @@ struct FastDivMod {
 * index of the output data. if input or output shape is [dim0, dim1] then dims
 * must be [dim1, dim0].
 */
-template <int kDims>
 struct BroadcastConfig {
-  FastDivMod divmoders[kDims];
+  FastDivMod divmoders[phi::DDim::kMaxRank];
  uint32_t strides[phi::DDim::kMaxRank];
+  int kDims;
  HOSTDEVICE BroadcastConfig() {}
  HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
@@ -109,7 +109,7 @@ struct BroadcastConfig {
                                            std::multiplies<int64_t>())
                          : strides_in[i];
    }
+    kDims = dim_size;
    memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t));
    memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod));
  }
@@ -436,17 +436,12 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename T,
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-          int NX,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(
    T* dst,
    const T* __restrict__ src,
    uint32_t block_offset,
-    details::BroadcastConfig<Rank> config,
+    const details::BroadcastConfig& config,
    int total_num_output,
    int stride_nx,
    int stride_ny) {
@@ -465,7 +460,8 @@ __device__ __forceinline__ void ReadDataBc(
        }
      }
 #pragma unroll
-      for (int i = 0; i < Rank; ++i) {
+      for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
+        if (i >= config.kDims) break;
        auto fast_divmoder = config.divmoders[i].Divmod(index_output);
        index_output = fast_divmoder.val[0];
        index_src += fast_divmoder.val[1] * config.strides[i];
@@ -785,53 +781,14 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
 * coordinate mapping relationship between output data and input data.
 * total_num_output: Total number of original output.
 */
-template <typename T,
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-          int NX,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst,
-    const T* __restrict__ src,
-    uint32_t block_offset,
-    details::BroadcastConfig<Rank> config,
-    int total_num_output) {
-  uint32_t thread_offset = block_offset + threadIdx.x * NX;
-  uint32_t index_src = 0;
-#pragma unroll
-  for (uint32_t nx = 0; nx < NX; ++nx) {
-    uint32_t index_output = thread_offset + nx;
-    index_src = 0;
-    if (IsBoundary) {
-      if (index_output >= total_num_output) {
-        break;
-      }
-    }
-#pragma unroll
-    for (int i = 0; i < Rank; ++i) {
-      auto fast_divmoder = config.divmoders[i].Divmod(index_output);
-      index_output = fast_divmoder.val[0];
-      index_src += fast_divmoder.val[1] * config.strides[i];
-    }
-    dst[nx] = src[index_src];
-  }
-}
-template <typename T,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(
    T* dst,
    const T* __restrict__ src,
    uint32_t block_offset,
-    details::BroadcastConfig<Rank> config,
+    const details::BroadcastConfig& config,
    int total_num_output,
-    int read_lens) {
+    int read_lens = NX) {
  uint32_t thread_offset = block_offset + threadIdx.x * NX;
  uint32_t index_src = 0;
@@ -845,7 +802,8 @@ __device__ __forceinline__ void ReadDataBc(
      }
    }
 #pragma unroll
-    for (int i = 0; i < Rank; ++i) {
+    for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
+      if (i >= config.kDims) break;
      auto fast_divmoder = config.divmoders[i].Divmod(index_output);
      index_output = fast_divmoder.val[0];
      index_src += fast_divmoder.val[1] * config.strides[i];
@@ -853,6 +811,7 @@ __device__ __forceinline__ void ReadDataBc(
    dst[nx] = src[index_src];
  }
 }
 /**
 * @brief Initialize register with data index.
 *

--- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
@@ -65,7 +65,6 @@ struct alignas(sizeof(T) * VecSize) VectorType {
 * must be [dim1, dim0].
 */
 #pragma pack(4)
-template <int kDims>
 struct BroadcastConfig {
  int strides_in[phi::DDim::kMaxRank];
  int strides_out[phi::DDim::kMaxRank];
@@ -78,7 +77,7 @@ struct BroadcastConfig {
  int n = 1;
  int k = 1;
  int buf_len = 0;
+  int kDims;
  HOSTDEVICE BroadcastConfig() {}
  HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
@@ -99,7 +98,7 @@ struct BroadcastConfig {
    for (int i = 0; i < dim_size; i++) {
      dim_tmp[i] = in_dims[i];
    }
+    kDims = dim_size;
    memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int));
    memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int));
    memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int));
@@ -551,7 +550,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For xpu,
 * core_id() is used as the index.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x core_num(), boundary judgment is required to avoid memory access
@@ -567,16 +565,11 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
 * stride_nx: Each read one element stride stride_nx elements in the last dim.
 * stride_ny: Each read one element stride stride_ny elements in the first dim.
 */
-template <typename T,
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-          int NX,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
 __device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
-                                      details::BroadcastConfig<Rank> config,
+                                      const details::BroadcastConfig& config,
                                      int total_num_output,
                                      int stride_nx,
                                      int stride_ny) {
@@ -882,60 +875,6 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
  }
 }
-/**
- * @brief Read 1D data from global memory to register with broadcast form.
- *
- * @template paraments
- * T: The type of data stored in the global memory.
- * NX: The number of data continuously loaded by each thread.
- * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
- * IsBoundary: Indicates whether to perform block access storage out-of-bounds
- * judgment. When the number of data processed by the block is less than
- * NX x NY x core_num(), boundary judgment is required to avoid memory access
- * crossing the boundary.
- *
- * @param：
- * dst: The register pointer of the thread, the size is NX * NY.
- * src: The original input data pointer of kernel.
- * block_offset: The data offset of this block, core_num() * blockIdx.x * NX;
- * config: Calculation configuration of broadcast. It is used to calculate the
- * coordinate mapping relationship between output data and input data.
- * total_num_output: Total number of original output.
- */
-template <typename T,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadDataBc(
-    T* dst,
-    const T _global_ptr_* src,
-    uint32_t block_offset,
-    const details::BroadcastConfig<Rank>& config,
-    int total_num_output) {
-  int thread_offset = block_offset + core_id() * NX;
-  int index_src = 0;
-  __local__ T in_temp;
-#pragma unroll
-  for (int nx = 0; nx < NX; ++nx) {
-    int index_output = thread_offset + nx;
-    index_src = 0;
-    if (IsBoundary) {
-      if (index_output >= total_num_output) {
-        break;
-      }
-    }
-    index_src = config(index_output);
-    GM2LM(src + index_src, &in_temp, sizeof(T));
-    dst[nx] = in_temp;
-  }
-}
 /**
 * @brief Read data from global memory to local memory with broadcast
 * {m, 1, k}-> {m, n, k} form.
@@ -952,12 +891,12 @@ __device__ __inline__ void ReadDataBc(
 * coordinate mapping relationship between output data and input data.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank>
+template <typename T>
 __device__ __inline__ void ReadDataBcM1kMnk(
    T* dst,
    const T _global_ptr_* src,
    int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+    const details::BroadcastConfig& config,
    int read_lens) {
  int index_output = thread_offset;
  int index_base = config(index_output);
@@ -999,12 +938,12 @@ __device__ __inline__ void ReadDataBcM1kMnk(
 * coordinate mapping relationship between output data and input data.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank>
+template <typename T>
 __device__ __inline__ void ReadDataBcM1Mn(
    T* dst,
    const T _global_ptr_* src,
    int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+    const details::BroadcastConfig& config,
    int read_lens) {
  int index_output = thread_offset;
  int index_base = config(index_output);
@@ -1027,7 +966,6 @@ __device__ __inline__ void ReadDataBcM1Mn(
 *
 * @template paraments
 * T: Data type of register.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
 *
 * @param：
 * dst: The register pointer of the thread, the size is NX.
@@ -1037,12 +975,12 @@ __device__ __inline__ void ReadDataBcM1Mn(
 * coordinate mapping relationship between output data and input data.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank>
+template <typename T>
 __device__ __inline__ void ReadDataBc1NMn(
    T* dst,
    const T _global_ptr_* src,
    int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+    const details::BroadcastConfig& config,
    int read_lens) {
  int index_output = thread_offset;
  int index_base = config(index_output);
@@ -1075,7 +1013,6 @@ __device__ __inline__ void ReadDataBc1NMn(
 *
 * @template paraments
 * T: Data type of register.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
 *
 * @param：
 * dst: The register pointer of the thread, the size is NX.
@@ -1085,12 +1022,12 @@ __device__ __inline__ void ReadDataBc1NMn(
 * coordinate mapping relationship between output data and input data.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank>
+template <typename T>
 __device__ __inline__ void ReadDataBc1N1Mnk(
    T* dst,
    const T _global_ptr_* src,
    int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+    const details::BroadcastConfig& config,
    int read_lens) {
  int index_output = thread_offset;
  int index_base = config(index_output);
@@ -1130,7 +1067,6 @@ __device__ __inline__ void ReadDataBc1N1Mnk(
 *
 * @template paraments
 * T: Data type of register.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
 *
 * @param：
 * dst: The register pointer of the thread, the size is NX.
@@ -1140,12 +1076,11 @@ __device__ __inline__ void ReadDataBc1N1Mnk(
 * coordinate mapping relationship between output data and input data.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank>
+template <typename T>
-__device__ __inline__ void ReadDataBc1N(
+__device__ __inline__ void ReadDataBc1N(T* dst,
-    T* dst,
                                        const T _global_ptr_* src,
                                        int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+                                        const details::BroadcastConfig& config,
                                        int read_lens) {
  int index_output = thread_offset;
  int index_base = config(index_output);
@@ -1174,12 +1109,12 @@ __device__ __inline__ void ReadDataBc1N(
 * total_num_output: Total number of original output.
 * read_lens: The number of data continuously loaded by each thread.
 */
-template <typename T, int Rank, bool IsBoundary = false>
+template <typename T, bool IsBoundary = false>
 __device__ __inline__ void ReadDataBcCanNotCmp(
    T* dst,
    const T _global_ptr_* src,
    int thread_offset,
-    const details::BroadcastConfig<Rank>& config,
+    const details::BroadcastConfig& config,
    int total_num_output,
    int read_lens) {
  int index_output = thread_offset;
@@ -1215,7 +1150,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
 * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
 * BlockSize: Identifies the current device thread index method. For xpu,
 * core_id() is used as the index.
- * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x core_num(), boundary judgment is required to avoid memory access
@@ -1230,33 +1164,27 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
 * read_lens: The number of data continuously loaded by each thread.
 * total_num_output: Total number of original output.
 */
-template <typename T,
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-          int NX,
+__device__ __inline__ void ReadDataBc(T* dst,
-          int NY,
-          int BlockSize,
-          int Rank,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadDataBc(
-    T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
-    const details::BroadcastConfig<Rank>& config,
+                                      const details::BroadcastConfig& config,
                                      int total_num_output,
                                      int read_lens) {
  int thread_offset = block_offset + core_id() * read_lens;
  if (config.cmp_type == details::OptType::MNK_M1K) {
-    ReadDataBcM1kMnk<T, Rank>(dst, src, thread_offset, config, read_lens);
+    ReadDataBcM1kMnk<T>(dst, src, thread_offset, config, read_lens);
  } else if (config.cmp_type == details::OptType::N_1) {
-    ReadDataBc1N<T, Rank>(dst, src, thread_offset, config, read_lens);
+    ReadDataBc1N<T>(dst, src, thread_offset, config, read_lens);
  } else if (config.cmp_type == details::OptType::MN_M) {
-    ReadDataBcM1Mn<T, Rank>(dst, src, thread_offset, config, read_lens);
+    ReadDataBcM1Mn<T>(dst, src, thread_offset, config, read_lens);
  } else if (config.cmp_type == details::OptType::MN_N) {
-    ReadDataBc1NMn<T, Rank>(dst, src, thread_offset, config, read_lens);
+    ReadDataBc1NMn<T>(dst, src, thread_offset, config, read_lens);
  } else if (config.cmp_type == details::OptType::MNK_1N1) {
-    ReadDataBc1N1Mnk<T, Rank>(dst, src, thread_offset, config, read_lens);
+    ReadDataBc1N1Mnk<T>(dst, src, thread_offset, config, read_lens);
  } else {
-    ReadDataBcCanNotCmp<T, Rank, IsBoundary>(
+    ReadDataBcCanNotCmp<T, IsBoundary>(
        dst, src, thread_offset, config, total_num_output, read_lens);
  }
 }

--- a/paddle/phi/kernels/primitive/kernel_primitives.h
+++ b/paddle/phi/kernels/primitive/kernel_primitives.h
@@ -40,7 +40,9 @@
 #define GRID_NUM_X cluster_num()
 #define GRID_NUM_Y 0
 #define GRID_NUM_Z 0
+#define VecSizeL 512
+#define VecSizeM 256
+#define VecSizeS 128
 #else
 #define KPStream gpuStream_t
@@ -64,6 +66,9 @@
 #define GRID_NUM_Y gridDim.y
 #define GRID_NUM_Z gridDim.z
+#define VecSizeL 4
+#define VecSizeM 2
+#define VecSizeS 1
 #endif
 // include file