Unverified commit 921c0917 authored by niuliling123, committed by GitHub

Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 (#36373)

* Update the implementation of ReduceAnyKernel according to the kernel primitive API
* Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1
Parent 5eb640c6
......@@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData(
// num: how many elements will be dealt with this time
if (need_broadcast) {
kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset,
config, numel, 1, 1);
config, numel);
} else {
kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num);
}
......
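The updated calls above drop the two trailing stride arguments, so they resolve to the 1D ReadDataBc overload added to the datamover header later in this diff. A minimal sketch of such a call site, assuming the kernel primitive headers are included; the LoadBroadcast wrapper name and template values are illustrative, not part of this patch:

namespace kps = paddle::operators::kernel_primitives;

// Illustrative wrapper: without stride_nx / stride_ny the call picks the 1D
// broadcast read, whose per-thread offset is block_offset + threadIdx.x * NX.
template <typename T, int VecSize, int Rank, bool IsBoundary>
__device__ void LoadBroadcast(T* dst, const T* src, uint32_t block_offset,
                              kps::details::BroadcastConfig<Rank> config,
                              int numel) {
  kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset,
                                                      config, numel);
}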
......@@ -72,14 +72,14 @@ __global__ void BroadcastKernelBinary(
// load in0
if (use_broadcast[0]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg0, in0, fix, configlists[0], numel, 1, 1);
arg0, in0, fix, configlists[0], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
}
// load in1
if (use_broadcast[1]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg1, in1, fix, configlists[1], numel, 1, 1);
arg1, in1, fix, configlists[1], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
}
......
......@@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
} // namespace details
/**
* @brief Perform unary calculation according to OpFunc. Size of input and
* @brief Perform unary calculation according to OpFunc. Shape of input and
* output are the same.
*
* @template paraments
* InT: Data type of in.
* OutT: Data type of out.
* InT: The data type of in.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* OpFunc: Compute functor which has an operator() as follows:
* template <typename InT, typename OutT>
* struct XxxFunctor {
......@@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
}
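As an illustration of the functor shape sketched in the comment above, a minimal unary example; it assumes the (out, in, compute) argument order used elsewhere in this diff, a kps alias for paddle::operators::kernel_primitives, and made-up SquareFunctor / SquareRegisters names:

namespace kps = paddle::operators::kernel_primitives;

template <typename InT, typename OutT>
struct SquareFunctor {
  HOSTDEVICE OutT operator()(const InT& a) const {
    return static_cast<OutT>(a) * static_cast<OutT>(a);
  }
};

// out and in are thread-local register arrays of size NX * NY.
template <typename InT, typename OutT, int NX, int NY>
__device__ void SquareRegisters(OutT* out, const InT* in) {
  kps::ElementwiseUnary<InT, OutT, NX, NY, 1, SquareFunctor<InT, OutT>>(
      out, in, SquareFunctor<InT, OutT>());
}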
/**
* @brief Binary calculation according to OpFunc. Size of The input and output
* @brief Binary calculation according to OpFunc. Shape of the input and output
* are the same.
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* OpFunc: Compute functor which has an operator() as follows:
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
* HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
* return ...;
* }
* };
......@@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of first input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
......@@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
}
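A concrete example of the single-template-parameter functor form that the revised comment above describes; the AddFunctor / AddRegisters names and the kps alias are illustrative:

namespace kps = paddle::operators::kernel_primitives;

template <typename InT>
struct AddFunctor {
  HOSTDEVICE InT operator()(const InT& a, const InT& b) const { return a + b; }
};

// out, in1 and in2 are thread-local register arrays of size NX * NY.
template <typename InT, typename OutT, int NX, int NY>
__device__ void AddRegisters(OutT* out, const InT* in1, const InT* in2) {
  kps::ElementwiseBinary<InT, OutT, NX, NY, 1, AddFunctor<InT>>(
      out, in1, in2, AddFunctor<InT>());
}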
/**
* @brief Ternary calculation according to OpFunc. Size of input and output
* @brief Ternary calculation according to OpFunc. Shape of input and output
* are the same.
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* OpFunc: Compute functor which has an operator() as follows
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c)
* HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)
* const {
* return ...;
* }
......@@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in1: The register pointer of first input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
......@@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
}
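The same pattern for the ternary case, as a hedged sketch: a fused multiply-add functor in the single-parameter form, with made-up names and the (out, in1, in2, in3, compute) argument order assumed from the comment above:

namespace kps = paddle::operators::kernel_primitives;

template <typename InT>
struct FmaFunctor {
  HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c) const {
    return a * b + c;
  }
};

template <typename InT, typename OutT, int NX, int NY>
__device__ void FmaRegisters(OutT* out, const InT* in1, const InT* in2,
                             const InT* in3) {
  kps::ElementwiseTernary<InT, OutT, NX, NY, 1, FmaFunctor<InT>>(
      out, in1, in2, in3, FmaFunctor<InT>());
}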
/**
* @brief Multivariate calculation according to OpFunc. Size of input and output
* are the same.
* @brief Multivariate calculation according to OpFunc. Shape of inputs and
* output are the same.
*
* @template paraments
* InT: Data type of in1, in2 and in3.
* OutT: Data type of out.
* InT: The data type of in1, in2 and in3.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* Arity: The size of ins
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* Arity: The size of ins.
* OpFunc: Compute functor which has an operator() as follows:
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT* args) const {
* HOSTDEVICE InT operator()(const InT* args) const {
* return ...;
* }
* };
*
* @param
* out: The register pointer of out, the size is NX * NY.
* ins: An array of pointers consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* ins: A pointer to an array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
class OpFunc>
......@@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* shape is [NY, NX].
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* OpFunc: Compute functor which has an operator() as follows
* template <typename InT, typename OutT>
* struct XxxFunctor {
......@@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* ReduceFunctor: Compute functor which has an operator() as follows
* template <typename InT>
* struct ReduceFunctor {
......
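To illustrate the ReduceFunctor shape above together with the local-mode Reduce call that the reduce kernels later in this diff rely on, a minimal sketch (MaxFunctor and LocalMax are illustrative names; the kps alias is assumed):

namespace kps = paddle::operators::kernel_primitives;

template <typename T>
struct MaxFunctor {
  HOSTDEVICE T operator()(const T& a, const T& b) const {
    return a > b ? a : b;
  }
};

// Reduce the NX register values held by each thread into *reduce_var.
template <typename T, int NX>
__device__ void LocalMax(T* reduce_var, T* local, bool reduce_last_dim) {
  kps::Reduce<T, NX, 1, 1, MaxFunctor<T>,
              kps::details::ReduceMode::kLocalMode>(
      reduce_var, local, MaxFunctor<T>(), reduce_last_dim);
}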
......@@ -118,8 +118,8 @@ struct BroadcastConfig {
} // namespace details
/**
* @brief Read 2D data from global memory to registers according to Tx type, and
* store it as Ty type.
* @brief Read 2D data from global memory according to Tx type, and
* store it into registers as Ty type.
*
* @template paraments
* Tx: The type of data stored in the global memory.
......@@ -127,8 +127,7 @@ struct BroadcastConfig {
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
......@@ -136,20 +135,20 @@ struct BroadcastConfig {
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block.
* size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter
* will be used when IsBoundary = true.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* src: The data pointer of the current block.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. This parameter is only used when IsBoundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. This parameter is only used when IsBoundary = true.
* stride_nx: The stride between two loaded elements in the lowest dimension.
* stride_ny: The stride between two loaded elements in the first dimension.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x * NX;
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
......@@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy >= size_ny) {
if (idy * stride_ny >= size_ny) {
break;
}
}
......@@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= left_size_nx) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
......@@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= left_size_nx) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy >= size_ny) {
if (idy * stride_ny >= size_ny) {
break;
}
}
......@@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
}
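To make the strided 2D ReadData above concrete: for a row-major [rows, cols] block, stride_nx = 1 walks along a row and stride_ny = cols jumps to the next row, and size_nx / size_ny are expressed in the same element units as the strides. A minimal sketch under these assumptions (LoadTile and its parameter names are illustrative):

namespace kps = paddle::operators::kernel_primitives;

// reg receives an NX x NY tile; block_ptr points at the block's first element.
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void LoadTile(T* reg, const T* block_ptr, int cols_left,
                         int rows_left, int cols) {
  // size_nx: columns still available to this block (lowest dimension);
  // size_ny: rows_left rows, expressed in elements of the first dimension.
  kps::ReadData<T, T, NX, NY, 1, IsBoundary>(reg, block_ptr, cols_left,
                                             rows_left * cols, 1, cols);
}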
/**
* @brief Read 2D data from global memory to registers. When IsBoundary = true
* @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or NX % 2 == 0), vectorized loads will be used to
* improve memory access efficiency.
*
* @template paraments
* T: Data type of src and dst.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* T: The type of data.
* NX: Each thread loads NX data continuously from global memory.
* NY: Each thread needs to load NY rows; only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block.
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
......@@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
}
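A typical use of this 1D form is a vectorized elementwise copy in which only the last block takes the IsBoundary = true path; a hedged sketch (the CopyKernel name and launch math are illustrative, and WriteData is the 1D store defined further below):

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int VecSize>
__global__ void CopyKernel(T* dst, const T* src, int numel) {
  int block_offset = blockIdx.x * blockDim.x * VecSize;
  int num = numel - block_offset;  // elements left for this block
  T reg[VecSize];
  if (num >= static_cast<int>(blockDim.x) * VecSize) {
    kps::ReadData<T, VecSize, 1, 1, false>(reg, src + block_offset, num);
    kps::WriteData<T, VecSize, 1, 1, false>(dst + block_offset, reg, num);
  } else {
    kps::ReadData<T, VecSize, 1, 1, true>(reg, src + block_offset, num);
    kps::WriteData<T, VecSize, 1, 1, true>(dst + block_offset, reg, num);
  }
}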
/**
* @brief Read 2D data from global memory to registers for broadcast.
* @brief Read 2D data from global memory to registers in broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* Rank: The shape size of out, e.g. in[1, 35], out[32, 35] gives shape size 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX;
* src: The original input data pointer of this kernel.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please
* refer to the sample code for specific usage.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of elements in the original output.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* stride_nx: The stride between two loaded elements in the lowest dimension.
* stride_ny: The stride between two loaded elements in the first dimension.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false>
......@@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t thread_offset = block_offset + threadIdx.x;
uint32_t index_src = 0;
#pragma unroll
......@@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc(
}
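The strided form above and the 1D overload added at the end of this file differ only in how each thread's offsets and strides are derived; the sketch below places the two call forms side by side (the wrapper name, template values and stride choices are illustrative):

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int Rank, bool IsBoundary>
__device__ void LoadBroadcastForms(T* reg, const T* src, uint32_t block_offset,
                                   kps::details::BroadcastConfig<Rank> config,
                                   int total_num_output) {
  // 2D form: NY = 4 rows per thread, explicit strides (1 column, blockDim.x
  // elements between rows).
  kps::ReadDataBc<T, 1, 4, 1, Rank, IsBoundary>(
      reg, src, block_offset, config, total_num_output, 1, blockDim.x);
  // 1D form (added below): each thread reads 4 consecutive outputs, so no
  // strides are passed.
  kps::ReadDataBc<T, 4, 1, 1, Rank, IsBoundary>(reg, src, block_offset, config,
                                                total_num_output);
}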
/**
* @brief Read 2D data from global memory to registers for reduce.
* @brief Read 2D data from global memory to registers in reduce form.
*
* @template paraments
* T: The type of data stored in the global memory.
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* Rank: The shape size of out, e.g. in[1, 35], out[32, 35] gives shape size 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX;
* src: The input data pointer of this block.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* index_cal: Calculation configuration of Reduce. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please
* refer to the sample code for specific usage.
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
* index_cal: get the global index in src, attention config was declared in
* host;
* coordinate mapping relationship between output data and input data.
* size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter
* parameter will participate in the calculation when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data, this parameter
* will participate in the calculation when IsBoundary = true.
* will be used when IsBoundary = true.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* stride_nx: The stride (in columns) between two loaded elements.
* stride_ny: The stride (in rows) between two loaded elements.
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
*/
......@@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce(
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
int stride_ny, bool reduce_last_dim) {
int thread_offset = 0;
int left_idx = 0;
if (reduce_last_dim) {
thread_offset = block_offset + threadIdx.x;
thread_offset = threadIdx.x;
left_idx = threadIdx.y;
} else {
thread_offset = block_offset + threadIdx.y;
thread_offset = threadIdx.y;
left_idx = threadIdx.x;
}
if (NX == 1) {
......@@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce(
break;
}
}
uint32_t index_src = index_cal(thread_offset);
uint32_t index_src = index_cal(thread_offset + block_offset);
dst[ny] = src[index_src];
thread_offset += stride_ny;
}
} else {
#pragma unroll
for (int nx = 0; nx < NX; ++nx) {
if (IsBoundary) {
if (nx * stride_nx >= size_nx) {
break;
}
}
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
if (IsBoundary) {
if (nx * stride_nx >= size_nx) {
if ((thread_offset >= size_ny) ||
(left_idx + nx * stride_nx >= size_nx)) {
break;
}
}
uint32_t index_src = index_cal(thread_offset);
uint32_t index_src = index_cal(thread_offset + block_offset);
dst[nx + ny * NX] = src[index_src];
thread_offset += stride_ny;
}
thread_offset += stride_nx;
}
}
}
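The reduce kernels in this diff drive ReadDataReduce with NX = 1, NY = REDUCE_VEC_SIZE, and an index calculator that maps logical offsets to source indices. A sketch that mirrors that call (LoadReduceChunk is an illustrative name; Calculator is whatever index calculator the caller constructs on the host):

namespace kps = paddle::operators::kernel_primitives;

template <typename Tx, int VecSize, typename Calculator, bool IsBoundary>
__device__ void LoadReduceChunk(Tx* reg, const Tx* input, int input_idx,
                                const Calculator& index_cal, int reduce_num,
                                int stride, bool reduce_last_dim) {
  // Each thread gathers VecSize elements of its reduce segment, stride apart.
  kps::ReadDataReduce<Tx, 1, VecSize, 1, 1, Calculator, IsBoundary>(
      reg, input, input_idx, index_cal, 1, reduce_num, 1, stride,
      reduce_last_dim);
}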
......@@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce(
*
* @template paraments
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: Data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size: The current block needs to load size data continuously.
* dst: The data pointer of the current block.
* src: The register pointer, the size is NX * NY.
* size: The current block needs to store size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
......@@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
}
}
/**
* @brief Write 2D data from register to global memory according to Tx type, and
* store it as Ty type.
*
* @template paraments
* Tx: The type of data that needs to be stored in registers.
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns stored by each thread.
* NY: The number of data rows stored by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. This parameter is only used when IsBoundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. This parameter is only used when IsBoundary = true.
* stride_nx: The stride between two stored elements in the lowest dimension.
* stride_ny: The stride between two stored elements in the first dimension.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1
if (IsBoundary) {
if (left_size_nx > 0) {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else if (NX == 1) { // for NX == 1 and NY != 1
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idy * stride_ny] = static_cast<Ty>(src[idy]);
}
} else if (NY == 1) { // for NY == 1 and NX != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
dst[thread_offset + idx * stride_nx] = static_cast<Ty>(src[idx]);
}
} else { // for NX != 1 and NY != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idx * stride_nx + idy * stride_ny] =
static_cast<Ty>(src[idy * NX + idx]);
}
}
}
}
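The mirror of the strided 2D read sketched earlier: storing an NX x NY register tile back to a row-major [rows, cols] block, under the same stride assumptions (StoreTile and its parameters are illustrative):

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int NX, int NY, bool IsBoundary>
__device__ void StoreTile(T* block_ptr, const T* reg, int cols_left,
                          int rows_left, int cols) {
  kps::WriteData<T, T, NX, NY, 1, IsBoundary>(block_ptr, reg, cols_left,
                                              rows_left * cols, 1, cols);
}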
/**
* @brief Initialize register with init_data.
*
* @template paraments
* T: Data type of register.
* NX: Number of data to initialize.
* IsBoundary: Whether to make an out-of-bounds judgment on num.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
* num: The number of data to be initialized, used when IsBoundary = true.
*/
template <typename T, int NX, bool IsBoundary = false>
__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
#pragma unroll
for (int i = 0; i < NX; i++) {
if (IsBoundary) {
if (i >= num) {
break;
}
}
dst[i] = init_data[i];
}
}
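This is the overload that ReduceAnyKernelImpl below uses to overwrite the first num initialized values with freshly transformed ones; a minimal sketch of the same pattern (names illustrative):

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int NX>
__device__ void SeedRegisters(T* dst, T* partial, int valid_num) {
  // Only the first valid_num entries of partial are copied; the remaining
  // entries of dst keep their previous values.
  kps::Init<T, NX, true>(dst, partial, valid_num);
}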
/**
* @brief Read 1D data from global memory to registers in broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU is supported.
* Rank: The shape size of out, e.g. in[1, 35], out[32, 35] gives shape size 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The original input data pointer of kernel.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of elements in the original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0;
#pragma unroll
for (uint32_t nx = 0; nx < NX; ++nx) {
uint32_t index_output = thread_offset + nx;
index_src = 0;
if (IsBoundary) {
if (index_output >= total_num_output) {
break;
}
}
#pragma unroll
for (int i = 0; i < Rank; ++i) {
auto fast_divmoder = config.divmoders[i].Divmod(index_output);
index_output = fast_divmoder.val[0];
index_src += fast_divmoder.val[1] * config.strides[i];
}
dst[nx] = src[index_src];
}
}
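Putting the new overload to work: a broadcast copy kernel in which each thread reads NX consecutive output positions and maps each back to a source element through the BroadcastConfig. The kernel name, launch math and template values are illustrative, and config is assumed to be built on the host from the input and output shapes:

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int VecSize, int Rank, bool IsBoundary>
__global__ void BroadcastCopyKernel(T* out, const T* in,
                                    kps::details::BroadcastConfig<Rank> config,
                                    int numel) {
  int block_offset = blockIdx.x * blockDim.x * VecSize;
  T reg[VecSize];
  kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(reg, in, block_offset,
                                                      config, numel);
  kps::WriteData<T, VecSize, 1, 1, IsBoundary>(out + block_offset, reg,
                                               numel - block_offset);
}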
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
......@@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
}
template <typename Tx, typename MPType, typename ReduceOp, typename TransformOp,
typename Calculator, bool IsBoundary>
__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
ReduceOp reducer, TransformOp transformer,
MPType init, int reduce_num, int input_idx,
bool reduce_last_dim,
const Calculator& reduce_index_calculator,
int stride, int num) {
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
MPType input_transform[REDUCE_VEC_SIZE];
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, IsBoundary>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_transform[0], &input_reg[0], transformer);
kps::Init<MPType, REDUCE_VEC_SIZE, IsBoundary>(input_compute, input_transform,
num);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used
......@@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
// 1. reduce for each thread
if (left_idx < left_num) {
// load REDUCE_VEC_SIZE data once, and then compute
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1,
reduce_num, 1, stride, reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_compute[0], &input_reg[0], transformer);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, true>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
input_idx += tid;
#pragma unroll
for (int i = 0; i < REDUCE_VEC_SIZE; ++i) {
if (input_idx >= reduce_num) {
break;
}
input_compute[i] = static_cast<MPType>(transformer(input_reg[i]));
input_idx += stride;
}
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, false>(
input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
reduce_last_dim, reduce_index_calculator, stride, reduce_num);
}
int num = (reduce_num - input_idx - tid + stride - 1) / stride;
ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, true>(
input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
}
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
......