Unverified commit bc0df48b authored by niuliling123, committed by GitHub

[icafe-31094] Add function comments and instructions to the Primitive API (#35743)

Parent 372a1a75
...@@ -54,8 +54,8 @@ class MPTypeTrait<platform::float16> {
};
/**
 * @brief Used in BlockYReduce to get the index of reduce_num in shared
 * memory.
 */
__device__ __forceinline__ int SharedMemoryIndex(int index) {
  return (threadIdx.y + index) * blockDim.x + threadIdx.x;
...@@ -83,7 +83,7 @@ __device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) {
 */
/**
 * @brief BlockXReduce reduces along blockDim.x.
 */
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
...@@ -115,7 +115,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
}
/**
 * @brief BlockYReduce reduces along blockDim.y.
 */
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
...@@ -135,24 +135,33 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
}  // namespace details
/**
 * @brief Perform unary calculation according to OpFunc. Sizes of input and
 * output are the same.
 *
 * @template parameters
 * InT: Data type of in.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in: The register pointer of in, the size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                 OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; idx++) {
...@@ -161,25 +170,35 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
}
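A minimal usage sketch, not part of this diff: the functor follows the OpFunc form documented above, while the `kps` namespace alias, the helper name and the template values are assumptions made only for illustration.

template <typename InT, typename OutT>
struct ScaleFunctor {
  HOSTDEVICE OutT operator()(const InT& a) const {
    return static_cast<OutT>(a) * static_cast<OutT>(2);
  }
};

// Each thread already holds NX * NY input elements in registers (e.g. loaded
// by ReadData); the primitive applies the functor to every register element.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void ScaleTile(OutT* out, const InT* in) {
  kps::ElementwiseUnary<InT, OutT, NX, NY, BlockSize, ScaleFunctor<InT, OutT>>(
      out, in, ScaleFunctor<InT, OutT>());
}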
/**
 * @brief Binary calculation according to OpFunc. Sizes of the input and output
 * are the same.
 *
 * @template parameters
 * InT: Data type of in1 and in2.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * NY.
 * in2: The register pointer of the second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
                                                  const InT* in2,
                                                  OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
...@@ -188,25 +207,38 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
}
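A hedged sketch of the binary primitive; AddFunctor, the helper name and the `kps` alias are illustrative, not part of this change.

template <typename InT, typename OutT>
struct AddFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
    return static_cast<OutT>(a + b);
  }
};

// in1 and in2 are register tiles of identical size NX * NY.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void AddTile(OutT* out, const InT* in1, const InT* in2) {
  kps::ElementwiseBinary<InT, OutT, NX, NY, BlockSize, AddFunctor<InT, OutT>>(
      out, in1, in2, AddFunctor<InT, OutT>());
}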
/**
 * @brief Ternary calculation according to OpFunc. Sizes of input and output
 * are the same.
 *
 * @template parameters
 * InT: Data type of in1, in2 and in3.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c)
 *           const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * NY.
 * in2: The register pointer of the second input, size is NX * NY.
 * in3: The register pointer of the third input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
                                                   const InT* in2,
                                                   const InT* in3,
                                                   OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
...@@ -215,27 +247,36 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
}
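A usage sketch under the same assumptions as above (the `kps` alias and the functor name are illustrative): a fused multiply-add over three register tiles of equal size.

template <typename InT, typename OutT>
struct FmaFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) const {
    return static_cast<OutT>(a * b + c);
  }
};

template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void FmaTile(OutT* out, const InT* in1, const InT* in2,
                        const InT* in3) {
  kps::ElementwiseTernary<InT, OutT, NX, NY, BlockSize, FmaFunctor<InT, OutT>>(
      out, in1, in2, in3, FmaFunctor<InT, OutT>());
}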
/**
 * @brief Multivariate calculation according to OpFunc. Sizes of input and
 * output are the same.
 *
 * @template parameters
 * InT: Data type of the inputs in ins.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Arity: The number of inputs in ins.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT* args) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * ins: An array of Arity inputs, each a register array of size NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
          class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
                                               OpFunc compute) {
  InT args[Arity];
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
#pragma unroll
...@@ -247,20 +288,36 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY],
}
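A sketch of the variadic form, with the functor receiving one element from every input per call; names and the `kps` alias are assumptions for illustration.

template <typename InT, typename OutT, int Arity>
struct SumArgsFunctor {
  HOSTDEVICE OutT operator()(const InT* args) const {
    OutT total = static_cast<OutT>(0);
#pragma unroll
    for (int i = 0; i < Arity; ++i) {
      total += static_cast<OutT>(args[i]);
    }
    return total;
  }
};

// ins holds Arity register tiles of NX * NY elements each.
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity>
__device__ void SumTiles(OutT* out, InT (*ins)[NX * NY]) {
  kps::ElementwiseAny<InT, OutT, NX, NY, BlockSize, Arity,
                      SumArgsFunctor<InT, OutT, Arity>>(
      out, ins, SumArgsFunctor<InT, OutT, Arity>());
}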
/**
 * @brief Binary calculation according to OpFunc, where the shapes of in1 and
 * in2 differ. The shape of in1 is [1, NX], the shape of in2 is [NY, NX], and
 * the output shape is [NY, NX].
 *
 * @template parameters
 * InT: Data type of in1 and in2.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * 1.
 * in2: The register pointer of the second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
                                            const InT* in2, OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX; idx++) {
#pragma unroll
...@@ -272,26 +329,37 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
}
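A hedged sketch of the cyclic form, for example adding a bias row to a register tile. A commutative functor is used so the sketch does not depend on which operand of the functor corresponds to in1; the names and the `kps` alias are illustrative.

template <typename InT, typename OutT>
struct BiasAddFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
    return static_cast<OutT>(a + b);
  }
};

// bias_row holds NX elements and is reused for each of the NY rows in tile.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void AddRowBias(OutT* out, const InT* bias_row, const InT* tile) {
  kps::CycleBinary<InT, OutT, NX, NY, BlockSize, BiasAddFunctor<InT, OutT>>(
      out, bias_row, tile, BiasAddFunctor<InT, OutT>());
}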
/**
 * @brief The Reduce provides collective methods for computing a parallel
 * reduction of items partitioned across a CUDA block and within each thread.
 * When ReduceMode == kLocalMode, each thread reduces along NX. When
 * ReduceMode == kGlobalMode, shared memory is used to reduce between threads.
 *
 * @template parameters
 * T: The type of data.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * ReduceFunctor: Compute functor which has an operator() as following:
 *     template <typename InT>
 *     struct ReduceFunctor {
 *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 * ReduceMode: Reduce mode, can be kLocalMode or kGlobalMode.
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in: The register pointer of in, the size is NX * NY.
 * reducer: Compute function which was declared like ReduceFunctor<InT>().
 * reduce_last_dim: Whether the last dimension is involved in the reduction.
 */
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
          details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in,
                                       ReduceFunctor reducer,
                                       bool reduce_last_dim) {
  int block_index = blockDim.y;
...@@ -302,7 +370,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
    if (block_reduce_y) {
#pragma unroll
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.y
        out[i] = details::BlockYReduce<T, ReduceFunctor>(out[i], reducer);
      }
    }
...@@ -310,7 +378,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
    if (reduce_last_dim) {
#pragma unroll
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.x
        out[i] = details::BlockXReduce<T, ReduceFunctor>(out[i], reducer);
      }
    }
  } else {  // else kLocalMode
...
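A usage sketch of the two reduce modes: each thread first folds its NX registers into one partial result, then the block combines the partials through shared memory. The `kps` alias, the helper name and the exact spelling of the ReduceMode enum path are assumptions outside this diff.

template <typename T>
struct MaxFunctor {
  HOSTDEVICE T operator()(const T& a, const T& b) const {
    return a > b ? a : b;
  }
};

template <typename T, int NX, int BlockSize>
__device__ void BlockMax(T* result, const T* in, T identity,
                         bool reduce_last_dim) {
  kps::Init<T, 1>(result, identity);  // identity of Max, e.g. the lowest T
  kps::Reduce<T, NX, 1, BlockSize, MaxFunctor<T>,
              kps::details::ReduceMode::kLocalMode>(
      result, in, MaxFunctor<T>(), reduce_last_dim);
  kps::Reduce<T, 1, 1, BlockSize, MaxFunctor<T>,
              kps::details::ReduceMode::kGlobalMode>(
      result, result, MaxFunctor<T>(), reduce_last_dim);
}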
...@@ -32,7 +32,13 @@ template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VectorType {
  T val[VecSize];
};
/**
 * Fast division: replace division in CUDA with multiplication to improve
 * kernel performance.
 * 1. Complete the division calculation on the CPU, and record the calculation
 * results using the divider and shift_val.
 * 2. Set the divisor on the GPU through Div() to complete the calculation.
 */
struct FastDivMod {
  // 1st value represents the result of the input number divided by the recorded divisor
  // 2nd value represents the result of the input number modulo the recorded divisor
...@@ -71,6 +77,11 @@ struct FastDivMod {
  uint32_t multiplier;
};
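A usage sketch under assumptions: the divisor is fixed on the host when the struct is constructed, and on the device Divmod() returns the quotient in val[0] and the remainder in val[1], as described by the comments above and used by the ReadDataBc loop later in this diff. The helper name and signature are illustrative only.

__device__ uint32_t RowMajorOffset(FastDivMod divmod, uint32_t linear_index,
                                   uint32_t stride) {
  auto result = divmod.Divmod(linear_index);
  uint32_t quotient = result.val[0];   // linear_index / divisor
  uint32_t remainder = result.val[1];  // linear_index % divisor
  return quotient * stride + remainder;
}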
/**
 * Configuration of broadcast. Calculate the input data index according to the
 * index of the output data. If the input or output shape is [dim0, dim1], then
 * dims must be [dim1, dim0].
 */
template <int kDims>
struct BroadcastConfig {
  FastDivMod divmoders[kDims];
...@@ -107,65 +118,31 @@ struct BroadcastConfig {
}  // namespace details
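The plain host-side sketch below is not library code; it spells out the index mapping that BroadcastConfig encodes with its divmoders and strides. The function name and parameter layout are assumptions made for illustration.

#include <cstdint>
#include <vector>

// dst_dims lists the output dims innermost-first (hence the reversed
// [dim1, dim0] order noted above); src_strides is 0 for broadcast dims of
// size 1 in the input.
inline uint32_t BroadcastSrcIndex(uint32_t dst_index,
                                  const std::vector<uint32_t>& dst_dims,
                                  const std::vector<uint32_t>& src_strides) {
  uint32_t src_index = 0;
  for (size_t i = 0; i < dst_dims.size(); ++i) {
    uint32_t coord = dst_index % dst_dims[i];  // what Divmod returns in val[1]
    dst_index /= dst_dims[i];                  // ...and in val[0]
    src_index += coord * src_strides[i];
  }
  return src_index;
}

// Example: src shape [1, 35] broadcast to dst [32, 35] gives dst_dims = {35, 32}
// and src_strides = {1, 0}; every output row maps back to the single input row.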
/**
 * @brief Read 2D data from global memory to registers according to Tx type,
 * and store it as Ty type.
 *
 * @template parameters
 * Tx: The type of data stored in the global memory.
 * Ty: The type of data that needs to be stored in registers.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Data pointer of the current block.
 * size_nx: The current block needs to load size_nx columns of data; this
 * parameter will be used when IsBoundary = true.
 * size_ny: The current block needs to load size_ny rows of data; this
 * parameter will be used when IsBoundary = true.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
          bool IsBoundary = false>
...@@ -226,6 +203,17 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
  }
}
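A hedged sketch of the strided 2D load; the helper name, the `kps` alias and the exact argument order (taken from the @param list above) are assumptions, not confirmed by the truncated hunk.

template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
__device__ void LoadTile(Ty* dst, const Tx* src, int size_nx, int size_ny,
                         int stride_nx, int stride_ny, bool is_tail_block) {
  // size_nx / size_ny bound the tail tile and only matter when IsBoundary=true.
  if (is_tail_block) {
    kps::ReadData<Tx, Ty, NX, NY, BlockSize, true>(dst, src, size_nx, size_ny,
                                                   stride_nx, stride_ny);
  } else {
    kps::ReadData<Tx, Ty, NX, NY, BlockSize, false>(dst, src, size_nx, size_ny,
                                                    stride_nx, stride_ny);
  }
}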
/**
 * @brief Initialize register with init_data.
 *
 * @template parameters
 * T: Data type of register.
 * NX: Number of data to initialize.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX.
 * init_data: Initial value.
 */
template <typename T, int NX>
__device__ __forceinline__ void Init(T* dst, T init_data) {
#pragma unroll
...@@ -234,18 +222,27 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
  }
}
/**
 * @brief Read 1D data from global memory to registers. When IsBoundary = true
 * and (NX % 4 == 0 or NX % 2 == 0), vectorized loads are used to improve
 * memory access efficiency.
 *
 * @template parameters
 * T: Data type of src and dst.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Whether to make an out-of-bounds judgment on memory accesses.
 * When the number of data processed by this block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Data pointer of the current block.
 * size: The number of data elements the current block needs to load
 * continuously.
 */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
...@@ -279,28 +276,38 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
}
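A usage sketch of the contiguous load path, assuming the signature documented above; the helper name, the `kps` alias and the tail-block test are illustrative choices.

template <typename T, int NX, int BlockSize>
__device__ void LoadContiguous(T* dst, const T* src, int block_offset,
                               int total_num) {
  // Only the last, possibly partial, block takes the boundary-checked path,
  // so full blocks keep the vectorized fast path.
  int remaining = total_num - block_offset;
  if (remaining < NX * static_cast<int>(blockDim.x)) {
    kps::ReadData<T, NX, 1, BlockSize, true>(dst, src + block_offset,
                                             remaining);
  } else {
    kps::ReadData<T, NX, 1, BlockSize, false>(dst, src + block_offset,
                                              remaining);
  }
}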
/**
 * @brief Read 2D data from global memory to registers for broadcast.
 *
 * @template parameters
 * T: The type of data stored in the global memory.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Rank: The shape size of out, e.g. for in[1, 35] and out[32, 35] the shape
 * size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Raw input data pointer of the kernel.
 * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX.
 * config: Calculation configuration of broadcast. It is used to calculate the
 * coordinate mapping relationship between output data and input data. Please
 * refer to the sample code for specific usage.
 * total_num_output: Total number of original output elements.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 */
template <typename T, int NX, int NY, int BlockSize, int Rank,
          bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst, const T* __restrict__ src, uint32_t block_offset,
    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
    int stride_ny) {
  uint32_t thread_offset = block_offset + threadIdx.x * NX;
  uint32_t index_src = 0;
...@@ -316,7 +323,7 @@ __device__ __forceinline__ void ReadDataBc(
      }
    }
#pragma unroll
    for (int i = 0; i < Rank; ++i) {
      auto fast_divmoder = config.divmoders[i].Divmod(index_output);
      index_output = fast_divmoder.val[0];
      index_src += fast_divmoder.val[1] * config.strides[i];
...@@ -327,27 +334,41 @@ __device__ __forceinline__ void ReadDataBc(
}
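A hedged usage sketch of the broadcast load, mirroring the in[1, 35] / out[32, 35] example from the comment above with Rank = 2; the helper, the `kps` alias and the way `config` is produced on the host are assumptions.

template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
__device__ void LoadBroadcast(T* dst, const T* src,
                              kps::details::BroadcastConfig<2> config,
                              int total_num_output, int stride_nx,
                              int stride_ny) {
  // config maps each output index back to its source index via Divmod.
  uint32_t block_offset = blockIdx.x * blockDim.x * NX;
  kps::ReadDataBc<T, NX, NY, BlockSize, 2, IsBoundary>(
      dst, src, block_offset, config, total_num_output, stride_nx, stride_ny);
}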
/**
 * @brief Read 2D data from global memory to registers for reduce.
 *
 * @template parameters
 * T: The type of data stored in the global memory.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Rank: The shape size of out, e.g. for in[1, 35] and out[32, 35] the shape
 * size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Raw input data pointer of the kernel.
 * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX.
 * index_cal: Calculation configuration of Reduce. It is used to calculate the
 * coordinate mapping relationship between output data and input data. Please
 * refer to the sample code for specific usage.
 * size_nx: The current block needs to load size_nx columns of data; this
 * parameter will be used when IsBoundary = true.
 * size_ny: The current block needs to load size_ny rows of data; this
 * parameter will be used when IsBoundary = true.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 * reduce_last_dim: Indicates whether the reduce dimensions include the lowest
 * dimension.
 */
template <typename T, int NX, int NY, int BlockSize, int Rank,
          typename IndexCal, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(
    T* dst, const T* __restrict__ src, int block_offset,
...@@ -397,17 +418,26 @@ __device__ __forceinline__ void ReadDataReduce(
}
/**
 * @brief Write 1D data from registers to global memory. When IsBoundary = true
 * and (NX % 4 == 0 or NX % 2 == 0), vectorized stores are used to improve
 * memory access efficiency.
 *
 * @template parameters
 * T: The type of data.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: Data pointer of the current block.
 * src: The register pointer of the thread, the size is NX * NY.
 * size: The number of data elements the current block needs to store
 * continuously.
 */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
...
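To close the loop, an end-to-end sketch that chains ReadData, Init, ElementwiseUnary and WriteData into one elementwise kernel. The `kps` alias, the kernel and functor names, and the tail-block handling are assumptions for illustration, not part of this diff.

template <typename T>
struct SquareFunctor {
  HOSTDEVICE T operator()(const T& a) const { return a * a; }
};

template <typename T, int NX, int BlockSize>
__global__ void SquareKernel(const T* x, T* y, int total_num) {
  int block_offset = blockIdx.x * blockDim.x * NX;
  int remaining = total_num - block_offset;
  T in_reg[NX];
  T out_reg[NX];
  if (remaining < NX * static_cast<int>(blockDim.x)) {  // tail block
    kps::Init<T, NX>(in_reg, static_cast<T>(0));
    kps::ReadData<T, NX, 1, BlockSize, true>(in_reg, x + block_offset,
                                             remaining);
    kps::ElementwiseUnary<T, T, NX, 1, BlockSize, SquareFunctor<T>>(
        out_reg, in_reg, SquareFunctor<T>());
    kps::WriteData<T, NX, 1, BlockSize, true>(y + block_offset, out_reg,
                                              remaining);
  } else {
    kps::ReadData<T, NX, 1, BlockSize, false>(in_reg, x + block_offset,
                                              remaining);
    kps::ElementwiseUnary<T, T, NX, 1, BlockSize, SquareFunctor<T>>(
        out_reg, in_reg, SquareFunctor<T>());
    kps::WriteData<T, NX, 1, BlockSize, false>(y + block_offset, out_reg,
                                               remaining);
  }
}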