Unverified commit 89a8989f
Authored on Oct 29, 2021 by niuliling123, committed by GitHub on Oct 29, 2021
Add io api and compute api for XPU (#36423)
Parent commit: 92d6a048
Showing 2 changed files with 891 additions and 0 deletions (+891 -0)
paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h    +324 -0
paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h  +567 -0
paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h  0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace details {

// kGlobalMode: block reduce, each block gets an output;
// kLocalMode: thread reduce, each thread gets an output;
enum ReduceMode { kGlobalMode, kLocalMode };
template <typename T>
class MPTypeTrait {
 public:
  using Type = T;
};

template <>
class MPTypeTrait<platform::float16> {
 public:
  using Type = float;
};
static inline __device__ void sync_all() {
  __asm__ __volatile__(
      "sync_local\t\n"
      "csr_set csr3, %0\t\n"
      "sync_group csr3" ::"r"(-1));
}
#define ncores 64
template <typename T, typename OpFunc, int VecSize>
__device__ void BlockXReduce(T* data, OpFunc reducer) {
  __shared__ T sum_array[ncores * VecSize];
  int core_idx = core_id() * VecSize;
  mfence();
  sync_all();

#pragma unroll
  for (int i = 0; i < VecSize; i++) {
    mfence();
    sum_array[core_idx + i] = data[i];
    mfence();
    data[i] = 0;
  }
  sync_all();
#pragma unroll
  for (int i = 0; i < VecSize; i++) {
#pragma unroll
    for (int j = 0; j < ncores; j++) {
      mfence();
      T tmp = sum_array[j * VecSize + i];
      mfence();
      data[i] = reducer(data[i], tmp);
      mfence();
    }
  }
  sync_all();
}
#undef ncores

}  // namespace details
/**
* @brief Perform unary calculation according to OpFunc. Shape of input and
* output are the same.
*
* @template parameters
* InT: The data type of in.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a) const {
* return ...;
* }
* };
*
* @param:
* out: The register pointer of out, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                 OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; idx++) {
    out[idx] = static_cast<OutT>(compute(in[idx]));
  }
}
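// A minimal usage sketch, assuming a hypothetical ScaleFunctor and a
// per-thread register buffer of NX * NY = 4 floats; the functor name and
// buffer sizes are illustrative assumptions, not part of this API.
//
//   struct ScaleFunctor {
//     HOSTDEVICE float operator()(const float& x) const { return 2.0f * x; }
//   };
//   __local__ float in[4];
//   __local__ float out[4];
//   ElementwiseUnary<float, float, 4, 1, 1, ScaleFunctor>(out, in,
//                                                         ScaleFunctor());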
/**
* @brief Binary calculation according to OpFunc. Shape of The input and output
* are the same.
*
* @template parameters
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
* return ...;
* }
* };
*
* @param:
* out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of first input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
                                                  const InT* in2,
                                                  OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
    out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx]));
  }
}
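// A minimal usage sketch, assuming a hypothetical AddFunctor; both inputs and
// the output are same-shaped NX * NY register buffers. Names and sizes are
// illustrative assumptions.
//
//   struct AddFunctor {
//     HOSTDEVICE float operator()(const float& a, const float& b) const {
//       return a + b;
//     }
//   };
//   __local__ float in1[4], in2[4], out[4];
//   ElementwiseBinary<float, float, 4, 1, 1, AddFunctor>(out, in1, in2,
//                                                        AddFunctor());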
/**
* @brief Ternary calculation according to OpFunc. Shape of input and output
* are the same.
*
* @template parameters
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)
* const {
* return ...;
* }
* };
*
* @param
* out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of first input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
                                                   const InT* in2,
                                                   const InT* in3,
                                                   OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
    out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
  }
}
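// A minimal usage sketch, assuming a hypothetical FmaFunctor (a * b + c);
// functor name and sizes are illustrative assumptions.
//
//   struct FmaFunctor {
//     HOSTDEVICE float operator()(const float& a, const float& b,
//                                 const float& c) const {
//       return a * b + c;
//     }
//   };
//   __local__ float in1[4], in2[4], in3[4], out[4];
//   ElementwiseTernary<float, float, 4, 1, 1, FmaFunctor>(out, in1, in2, in3,
//                                                         FmaFunctor());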
/**
* @brief Multivariate calculation according to OpFunc. Shape of inputs and
* output are the same.
*
* @template parameters
* InT: The data type of in1, in2 and in3.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Arity: The size of ins
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE InT operator()(const InT* args) const {
* return ...;
* }
* };
*
* @param
* out: The register pointer of out, the size is NX * NY.
* ins: An array of register buffers, one per input, each of size NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          int Arity, class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
                                               OpFunc compute) {
  __local__ InT args[Arity];
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
#pragma unroll
    for (int j = 0; j < Arity; ++j) {
      args[j] = ins[j][idx];
    }
    out[idx] = static_cast<OutT>(compute(args));
  }
}
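// A minimal usage sketch, assuming a hypothetical Sum3Functor over Arity = 3
// inputs; ins is an array of Arity register buffers, each holding
// NX * NY elements. Names and sizes are illustrative assumptions.
//
//   struct Sum3Functor {
//     HOSTDEVICE float operator()(const float* args) const {
//       return args[0] + args[1] + args[2];
//     }
//   };
//   __local__ float ins[3][4];
//   __local__ float out[4];
//   ElementwiseAny<float, float, 4, 1, 1, 3, Sum3Functor>(out, ins,
//                                                         Sum3Functor());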
/**
* @brief Binary calculation according to OpFunc. The shape of in1 and in2 are
* different. When in1's shape is [1, NX], in2's shape is [NY, NX], then
* output's shape is [NY, NX].
*
* @template parameters
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
* return ...;
* }
* };
*
* @param
* out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of first input, size is NX * 1.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
                                            const InT* in2, OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX; idx++) {
#pragma unroll
    for (int idy = 0; idy < NY; idy++) {
      out[idx + idy * NX] =
          static_cast<OutT>(compute(in1[idx], in2[idx + idy * NX]));
    }
  }
}
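// A minimal usage sketch: broadcasting a [1, NX] row (in1) over a [NY, NX]
// tile (in2) with the AddFunctor sketched above, NX = 4 and NY = 2. Buffer
// names and sizes are illustrative assumptions.
//
//   __local__ float row[4];   // shape [1, 4]
//   __local__ float tile[8];  // shape [2, 4]
//   __local__ float out[8];   // shape [2, 4]
//   CycleBinary<float, float, 4, 2, 1, AddFunctor>(out, row, tile,
//                                                  AddFunctor());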
/**
* @brief The Reduce provides collective methods for computing a parallel
* reduction of items partitioned across a CUDA block and intra thread. When
* ReduceMode == kLocalMode, thread reduce along nx. When ReduceMode ==
* kGlobalMode, use shared memory to reduce between threads.
*
* @template parameters
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread; only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
* struct ReduceFunctor {
* HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
* return ...;
* }
* };
* ReduceMode: Reduce mode, can be kLocalMode, kGlobalMode.
*
* @param
* out: The register pointer of out, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
*/
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
          details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in,
                                       ReduceFunctor reducer,
                                       bool reduce_last_dim) {
  if (Mode == details::kGlobalMode) {
#pragma unroll
    for (int i = 0; i < NY; ++i) {
#pragma unroll
      for (int j = 0; j < NX; ++j) {
        out[i] = reducer(out[i], in[i * NX + j]);
      }
    }
    details::BlockXReduce<T, ReduceFunctor, NY>(out, reducer);
  } else {  // kLocalMode
#pragma unroll
    for (int i = 0; i < NY; ++i) {
#pragma unroll
      for (int j = 0; j < NX; ++j) {
        out[i] = reducer(out[i], in[i * NX + j]);
      }
    }
  }
}
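// A minimal usage sketch: summing NX = 4 values per thread and then combining
// the per-core partials across the cluster with kGlobalMode, using the
// AddFunctor sketched above. Buffer names and sizes are illustrative
// assumptions.
//
//   __local__ float in[4];
//   __local__ float out[1];
//   out[0] = 0.0f;
//   Reduce<float, 4, 1, 1, AddFunctor, details::kGlobalMode>(
//       out, in, AddFunctor(), true);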
}  // namespace kernel_primitives
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h  0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace details {

template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VectorType {
  T val[VecSize];
};
/**
* Configuration of broadcast. Calculate the input data index according to the
* index of the output data. if input or output shape is [dim0, dim1] then dims
* must be [dim1, dim0].
*/
template <int kDims>
struct BroadcastConfig {
  uint32_t stride_in[framework::DDim::kMaxRank];
  uint32_t stride_out[framework::DDim::kMaxRank];
  uint32_t shape_in[framework::DDim::kMaxRank];

  HOSTDEVICE BroadcastConfig() {}

  HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
                             const std::vector<int64_t>& in_dims,
                             int dim_size) {
    std::vector<uint32_t> strides_in;
    std::vector<uint32_t> strides_out;
    std::vector<uint32_t> shapes_in;

    strides_out.resize(dim_size, 1);
    strides_in.resize(dim_size, 1);
    shapes_in.resize(dim_size, 1);

    for (int i = 0; i < dim_size; ++i) {
      shape_in[i] = in_dims[dim_size - i - 1];
    }

    for (int i = 1; i < dim_size - 1; ++i) {
      strides_out[dim_size - i - 1] =
          std::accumulate(out_dims.begin(), out_dims.begin() + i, 1,
                          std::multiplies<int64_t>());
      strides_in[dim_size - i - 1] =
          std::accumulate(in_dims.begin(), in_dims.begin() + i, 1,
                          std::multiplies<int64_t>());
    }

    memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t));
    memcpy(stride_out, strides_out.data(), kDims * sizeof(uint32_t));
    memcpy(shape_in, shapes_in.data(), kDims * sizeof(uint32_t));
  }
};
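// A minimal host-side construction sketch, assuming an input of logical shape
// [1, 35] broadcast to an output of shape [32, 35] and the reversed-dims
// convention described in the struct comment; the shapes are illustrative
// assumptions.
//
//   std::vector<int64_t> out_dims = {35, 32};  // output [32, 35], reversed
//   std::vector<int64_t> in_dims = {35, 1};    // input  [1, 35], reversed
//   BroadcastConfig<2> config(out_dims, in_dims, /*dim_size=*/2);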
}  // namespace details
/**
* @brief Read 2D data from global memory to register according to Tx type, and
* store it as Ty type into register.
*
* @template parameters
* Tx: The type of data stored in the global memory.
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The data pointer of the current block.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. The parameters are only calculated when isboundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. The parameters are only calculated when isboundary = true.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
          bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
                                         int size_nx, int size_ny,
                                         int stride_nx, int stride_ny) {
  int thread_offset = core_id();
  int left_size_nx = size_nx - thread_offset;
  __local__ Tx in_temp[1];
  // Each branch is added for better performance
  if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1
    if (IsBoundary) {
      if (left_size_nx > 0) {
        GM2LM(src + thread_offset, in_temp, sizeof(Tx));
        dst[0] = static_cast<Ty>(in_temp[0]);
      }
    } else {
      GM2LM(src + thread_offset, in_temp, sizeof(Tx));
      dst[0] = static_cast<Ty>(in_temp[0]);
    }
  } else if (NX == 1) {  // for NX == 1 and NY != 1
#pragma unroll
    for (int idy = 0; idy < NY; ++idy) {
      if (IsBoundary) {
        if (idy * stride_ny >= size_ny) {
          break;
        }
      }
      GM2LM(src + thread_offset + idy * stride_ny, in_temp, sizeof(Tx));
      dst[idy] = static_cast<Ty>(in_temp[0]);
    }
  } else if (NY == 1) {  // for NY == 1 and NX != 1
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
      if (IsBoundary) {
        if (idx * stride_nx >= left_size_nx) {
          break;
        }
      }
      GM2LM(src + thread_offset + idx * stride_nx, in_temp, sizeof(Tx));
      dst[idx] = static_cast<Ty>(in_temp[0]);
    }
  } else {  // for NX != 1 and NY != 1
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
#pragma unroll
      for (int idy = 0; idy < NY; ++idy) {
        if (IsBoundary) {
          if (idy * stride_ny >= size_ny || idx * stride_nx >= left_size_nx) {
            break;
          }
        }
        int fix = thread_offset + idx * stride_nx + idy * stride_ny;
        GM2LM(src + fix, in_temp, sizeof(Tx));
        dst[idy * NX + idx] = static_cast<Ty>(in_temp[0]);
      }
    }
  }
}
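// A minimal usage sketch: each core loads an NX x NY = 2 x 2 tile of floats
// from global memory into registers. The pointer names (src), sizes and
// stride values are illustrative assumptions.
//
//   __local__ float dst[4];
//   ReadData<float, float, 2, 2, 1, false>(dst, src, size_nx, size_ny,
//                                          stride_nx, stride_ny);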
/**
* @brief Initialize register with init_data.
*
* @template parameters
* T: Data type of register.
* NX: Number of data to initialize.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: Initial value.
*/
template <typename T, int NX>
__device__ __forceinline__ void Init(T* dst, T init_data) {
#pragma unroll
  for (int i = 0; i < NX; i++) {
    dst[i] = init_data;
  }
}
/**
* @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
* improve memory access efficiency.
*
* @template parameters
* T: The type of data.
* NX: Each thread loads NX data from global memory continuously.
* NY: Each thread needs to load NY rows; only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
                                         int num) {
  int thread_offset = core_id() * NX;
  __local__ T in_temp[1];
  if (IsBoundary) {  // core_num() * NX > num
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
      if (idx + thread_offset < num) {
        GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
        dst[idx] = in_temp[0];
      }
    }
  } else {  // core_num() * NX < num
    GM2LM(src + thread_offset, dst, NX * sizeof(T));
  }
}
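// A minimal usage sketch: each core copies its NX = 4 contiguous floats from
// the block's global pointer into registers, picking the boundary variant
// when fewer than core_num() * NX elements remain. The names src,
// block_offset and num are illustrative assumptions.
//
//   __local__ float reg[4];
//   if (num >= core_num() * 4) {
//     ReadData<float, 4, 1, 1, false>(reg, src + block_offset, num);
//   } else {
//     ReadData<float, 4, 1, 1, true>(reg, src + block_offset, num);
//   }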
/**
* @brief Read 2D data from global memory to registers with broadcast form.
*
* @template parameters
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel.
* block_offset: Data offset of this block, core_num() * cluster_id() * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
          bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst, const T _global_ptr_* src, uint32_t block_offset,
    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
    int stride_ny) {
  uint32_t thread_offset = block_offset + core_id();
  uint32_t index_src = 0;
  __local__ T in_temp[1];

#pragma unroll
  for (int ny = 0; ny < NY; ++ny) {
#pragma unroll
    for (uint32_t nx = 0; nx < NX; ++nx) {
      uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
      index_src = 0;
      if (IsBoundary) {
        if (index_output >= total_num_output) {
          break;
        }
      }
#pragma unroll
      for (int i = 0; i < Rank; ++i) {
        uint32_t tmp = index_output / config.stride_out[i];
        index_output = index_output - tmp * config.stride_out[i];
        index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
      }
      GM2LM(src + index_src, in_temp, sizeof(T));
      dst[nx + ny * NX] = in_temp[0];
    }
  }
}
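// A minimal usage sketch: reading a broadcast input of logical shape [1, 35]
// as if it had the output shape [32, 35]; config maps each output index back
// to an input index. The shapes, Rank = 2, and the stride/offset names are
// illustrative assumptions.
//
//   __local__ float dst[4];
//   ReadDataBc<float, 4, 1, 1, 2, false>(dst, src, block_offset, config,
//                                        /*total_num_output=*/32 * 35,
//                                        stride_nx, stride_ny);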
/**
* @brief Read 2D data from global memory to register with reduce form.
*
* @template parameters
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The input data pointer of this block.
* block_offset: The data offset of this block, blockDim.x * cluster_id() * NX.
* index_cal: Calculation configuration of Reduce. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* size_nx: The current block needs to load size_nx columns of data; this
* parameter is only used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data; this
* parameter is only used when IsBoundary = true.
* stride_nx: Each read of one element strides stride_nx columns.
* stride_ny: Each read of one element strides stride_ny rows.
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
          typename IndexCal, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(
    T* dst, const T _global_ptr_* src, int block_offset,
    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
    int stride_ny, bool reduce_last_dim) {
  __local__ T in_temp[1];
  int thread_offset = 0;
  int left_size_nx = size_nx;
  int left_size_ny = size_ny;
  if (reduce_last_dim) {
    thread_offset = block_offset + core_id();
    left_size_nx -= thread_offset;
  } else {
    thread_offset = block_offset + core_id();
    left_size_ny -= thread_offset;
  }

  if (NX == 1) {
#pragma unroll
    for (int ny = 0; ny < NY; ++ny) {
      if (IsBoundary) {
        if (ny * stride_ny >= left_size_ny) {
          break;
        }
      }
      uint32_t index_src = index_cal(thread_offset);
      GM2LM(src + index_src, in_temp, sizeof(T));
      dst[ny] = in_temp[0];
      thread_offset += stride_ny;
    }
  } else {
#pragma unroll
    for (int nx = 0; nx < NX; ++nx) {
#pragma unroll
      for (int ny = 0; ny < NY; ++ny) {
        if (IsBoundary) {
          if ((ny * stride_ny >= left_size_ny) ||
              (nx * stride_nx >= left_size_nx)) {
            break;
          }
        }
        uint32_t index_src = index_cal(thread_offset);
        GM2LM(src + index_src, in_temp, sizeof(T));
        dst[nx + ny * NX] = in_temp[0];
        thread_offset += stride_ny;
      }
      thread_offset += stride_nx;
    }
  }
}
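// A minimal usage sketch, assuming a hypothetical IdentityIndexCal functor
// that maps a thread offset straight to a source index; the functor, sizes
// and strides are illustrative assumptions.
//
//   struct IdentityIndexCal {
//     HOSTDEVICE uint32_t operator()(int offset) const {
//       return static_cast<uint32_t>(offset);
//     }
//   };
//   __local__ float dst[4];
//   ReadDataReduce<float, 1, 4, 1, 2, IdentityIndexCal, true>(
//       dst, src, block_offset, IdentityIndexCal(), size_nx, size_ny,
//       stride_nx, stride_ny, /*reduce_last_dim=*/true);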
/**
* @brief Write 1D data from registers to global memory. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), the data will be vectorized to improve the
* data loading efficiency
*
* @template parameters
* T: The type of data.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread; only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The data pointer of the current block.
* src: The register pointer, the size is NX * NY.
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  int thread_offset = core_id() * NX;
  __local__ T in_temp[1];
  if (IsBoundary) {  // core_num() * NX > num
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
      if (idx + thread_offset < num) {
        in_temp[0] = src[idx];
        LM2GM(in_temp, dst + idx + thread_offset, sizeof(T));
      }
    }
  } else {  // core_num() * NX < num
    LM2GM(src, dst + thread_offset, NX * sizeof(T));
  }
}
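// A minimal usage sketch, the mirror image of the 1-D ReadData above: each
// core stores its NX = 4 register values back to the block's global pointer,
// using the boundary variant when fewer than core_num() * NX elements remain.
// The names dst, block_offset and num are illustrative assumptions.
//
//   __local__ float reg[4];
//   WriteData<float, 4, 1, 1, true>(dst + block_offset, reg, num);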
/**
* @brief Write 2D data from register to global memory according to Tx type, and
* store it as Ty type.
*
* @template parameters
* Tx: The type of data that needs to be stored in registers.
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: Data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter
* will be used when IsBoundary = true.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
          bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
                                          int size_nx, int size_ny,
                                          int stride_nx, int stride_ny) {
  int thread_offset = core_id();
  int left_size_nx = size_nx - thread_offset;
  __local__ Ty in_temp[1];

  // Each branch is added for better performance
  if (NX == 1 && NY == 1) {
    if (IsBoundary) {
      if (left_size_nx > 0) {
        in_temp[0] = static_cast<Ty>(src[0]);
        LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
      }
    } else {
      in_temp[0] = static_cast<Ty>(src[0]);
      LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
    }
  } else if (NX == 1) {
#pragma unroll
    for (int idy = 0; idy < NY; ++idy) {
      if (IsBoundary) {
        if (idy * stride_ny >= size_ny) {
          break;
        }
      }
      in_temp[0] = static_cast<Ty>(src[idy]);
      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(Ty));
    }
  } else if (NY == 1) {  // for NY == 1 and NX != 1
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
      if (IsBoundary) {
        if (idx * stride_nx >= left_size_nx) {
          break;
        }
      }
      in_temp[0] = static_cast<Ty>(src[idx]);
      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(Ty));
    }
  } else {  // for NX != 1 and NY != 1
#pragma unroll
    for (int idx = 0; idx < NX; ++idx) {
      if (IsBoundary) {
        if (idx * stride_nx >= left_size_nx) {
          break;
        }
      }
#pragma unroll
      for (int idy = 0; idy < NY; ++idy) {
        if (IsBoundary) {
          if (idy * stride_ny >= size_ny) {
            break;
          }
        }
        in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
        LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
              sizeof(Ty));
      }
    }
  }
}
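// A minimal usage sketch: storing an NX x NY = 2 x 2 register tile of floats
// back to global memory, casting to the destination type on the way out. The
// names dst, size_nx/size_ny and the strides are illustrative assumptions.
//
//   __local__ float reg[4];
//   WriteData<float, float, 2, 2, 1, true>(dst, reg, size_nx, size_ny,
//                                          stride_nx, stride_ny);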
/**
* @brief Initialize register with init_data.
*
* @template parameters
* T: Data type of register.
* NX: Number of data to initialize.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, bool IsBoundary = false>
__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
#pragma unroll
  for (int i = 0; i < NX; i++) {
    if (IsBoundary) {
      if (i >= num) {
        break;
      }
    }
    dst[i] = init_data[i];
  }
}
/**
* @brief Read 1D data from global memory to register with broadcast form.
*
* @template parameters
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread; only NY = 1 is supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The original input data pointer of kernel.
* block_offset: The data offset of this block, core_num() * blockIdx.x * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
          bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst, const T _global_ptr_* src, uint32_t block_offset,
    details::BroadcastConfig<Rank> config, int total_num_output) {
  uint32_t thread_offset = block_offset + core_id() * NX;
  uint32_t index_src = 0;
  __local__ T in_temp[1];

#pragma unroll
  for (uint32_t nx = 0; nx < NX; ++nx) {
    uint32_t index_output = thread_offset + nx;
    index_src = 0;
    if (IsBoundary) {
      if (index_output >= total_num_output) {
        break;
      }
    }
#pragma unroll
    for (int i = 0; i < Rank; ++i) {
      uint32_t tmp = index_output / config.stride_out[i];
      index_output = index_output - tmp * config.stride_out[i];
      index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
    }
    GM2LM(src + index_src, in_temp, sizeof(T));
    dst[nx] = in_temp[0];
  }
}
}  // namespace kernel_primitives
}  // namespace operators
}  // namespace paddle