fix add_n kernel of large shape (#53751)

b720873d · Leo Chen · GitHub · 6fee5a3e · b720873d
显示空白变更内容
内联并排

Showing 1 changed file with 6 additions and 22 deletions

paddle/phi/kernels/gpu/add_n_kernel.cu paddle/phi/kernels/gpu/add_n_kernel.cu +6 -22

未找到文件。
--- a/paddle/phi/kernels/gpu/add_n_kernel.cu
+++ b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -23,34 +23,20 @@ namespace phi {
 #define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
-template <class T>
-__global__ void Sum2CUDAKernel(const T *in_0,
-                               const T *in_1,
-                               T *out,
-                               int64_t N) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  while (id < N) {
-    out[id] = in_0[id] + in_1[id];
-    id += blockDim.x * gridDim.x;
-  }
-}
 template <class T>
 __global__ void SumArrayCUDAKernel(
    T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
-  while (id < N) {
+    MPType total(read_dst ? static_cast<MPType>(out[idx])
-    MPType total(read_dst ? static_cast<MPType>(out[id])
                          : static_cast<MPType>(0));
    for (int i = 0; i < in_size; ++i) {
      const T *tmp = in[i];
      if (tmp) {
-        total += static_cast<MPType>(tmp[id]);
+        total += static_cast<MPType>(tmp[idx]);
      }
    }
-    out[id] = static_cast<T>(total);
+    out[idx] = static_cast<T>(total);
-    id += blockDim.x * gridDim.x;
  }
 }
@@ -58,16 +44,14 @@ template <class T>
 __global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
                                          int64_t N,
                                          size_t rows) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
-  while (id < N) {
    for (int i = 0; i < 2 * rows; i += 2) {
      const T *tmp = sr_in_out[i];
      T *tmp_out = sr_in_out[i + 1];
      if (tmp && tmp_out) {
-        tmp_out[id] += tmp[id];
+        tmp_out[idx] += tmp[idx];
      }
    }
-    id += blockDim.x * gridDim.x;
  }
 }