From 1b69e528d3ded3cbacb1a45e9e9aa04c6b3b3250 Mon Sep 17 00:00:00 2001
From: wangchaochaohu
Date: Tue, 15 Dec 2020 19:16:38 +0800
Subject: [PATCH] optimize for long width for elementwise (#29602)

---
 .../elementwise/elementwise_add_op.h          | 96 ++++++++++++++++++-
 1 file changed, 93 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 0e8d202a9a..44c233be57 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -19,6 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
+#ifdef PADDLE_WITH_CUDA
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#endif
 
 namespace paddle {
 namespace operators {
@@ -121,6 +126,20 @@ elementwise_add_grad(const framework::ExecutionContext &ctx,
 
 #ifdef PADDLE_WITH_CUDA
 #ifdef __NVCC__
+template <typename T, int Size>
+struct alignas(sizeof(T) * Size) AlignedVector {
+  T val[Size];
+};
+
+template <typename T>
+inline int VectorizedSize(const T *pointer) {
+  uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;  // NOLINT
+  if (address % vec4 == 0) {
+    return 4;
+  }
+  return 1;
+}
 template <typename T, int BLOCK_W, int BLOCK_H>
 __global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out,
                                 size_t width, size_t height) {
@@ -200,6 +219,45 @@ __global__ void FP16MatrixColReduce(
     if ((threadIdx.y == 0) && ((w) < width)) out[w] = sdata[0][threadIdx.x];
   }
 }
+
+template <typename T>
+__global__ void MatrixReduceLongWidth(const T *__restrict__ in, T *out,
+                                      size_t width, size_t height) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  for (; idx < width; idx += blockDim.x * gridDim.x) {
+    T sum = static_cast<T>(0);
+    for (int row = 0; row < height; row++) {
+      sum += in[idx + row * width];
+    }
+
+    out[idx] = sum;
+  }
+}
+
+template <typename T, int VEC_SIZE>
+__global__ void VecMatrixReduceLongWidth(const T *__restrict__ in, T *out,
+                                         size_t width, size_t height) {
+  using LoadT = AlignedVector<T, VEC_SIZE>;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int w = idx * VEC_SIZE;
+  int width_stride = blockDim.x * gridDim.x * VEC_SIZE;
+  for (; w < width; w += width_stride) {
+    T zero = static_cast<T>(0);
+    T sum[VEC_SIZE] = {zero};
+    T tmp_vec[VEC_SIZE] = {zero};
+    LoadT *tmp_ptr = reinterpret_cast<LoadT *>(&tmp_vec);
+    for (int row = 0; row < height; row++) {
+      int offset = width * row + w;
+      *tmp_ptr = *reinterpret_cast<const LoadT *>(&in[offset]);
+      for (int v = 0; v < VEC_SIZE; v++) {
+        sum[v] += tmp_vec[v];
+      }
+    }
+
+    for (int v = 0; v < VEC_SIZE; v++) out[w + v] = sum[v];
+  }
+}
 #endif
 #endif
 bool static RunSpecialDims(const framework::DDim &dx_dims,
@@ -301,6 +359,21 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
             *dout, ctx.GetPlace(),
             ctx.template device_context<platform::DeviceContext>(), dy);
       }
+      // special optimization using cub
+      if (width == 1) {
+        int nums = height;
+        size_t temp_storage_bytes = 0;
+        auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes,
+                                          dout_data, out_data, nums, stream);
+        PADDLE_ENFORCE_CUDA_SUCCESS(err);
+        framework::Tensor tmp;
+        auto *temp_storage = tmp.mutable_data<uint8_t>(
+            framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
+            ctx.GetPlace());
+        err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes,
+                                     dout_data, out_data, nums, stream);
+        PADDLE_ENFORCE_CUDA_SUCCESS(err);
+      }
 
       constexpr int block_x = 32;
       constexpr int block_y = 32;
@@ -311,7 +384,8 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
       int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1);
       int theory_block = (width + blocks.x - 1) / blocks.x;
       dim3 grids(std::min(theory_block, max_blocks));
-      if (std::is_same<T, paddle::platform::float16>::value) {
+      if (std::is_same<T, paddle::platform::float16>::value &&
+          (width / height) < 32) {
         const paddle::platform::float16 *ptr1 =
             reinterpret_cast<const paddle::platform::float16 *>(dout_data);
         paddle::platform::float16 *ptr2 =
@@ -325,8 +399,24 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
         }
         return;
       }
 
-      MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(
-          dout_data, out_data, width, height);
+
+      if (width / height < 32) {
+        MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(
+            dout_data, out_data, width, height);
+      } else {
+        size_t thread_nums = 1024;
+        size_t block_nums = (width + thread_nums - 1) / thread_nums;
+        int vec_size = VectorizedSize<T>(dx_data);
+        if (vec_size == 4 && width % 4 == 0) {
+          block_nums = (width / vec_size + thread_nums - 1) / thread_nums;
+          VecMatrixReduceLongWidth<T, 4><<<block_nums, thread_nums, 0,
+                                           stream>>>(dout_data, out_data,
+                                                     width, height);
+        } else {
+          MatrixReduceLongWidth<T><<<block_nums, thread_nums, 0, stream>>>(
+              dout_data, out_data, width, height);
+        }
+      }
       return;
     }
--
GitLab
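
Editor's note (not part of the patch): the long-width branch added above relies on vectorized loads through the alignas-packed AlignedVector type, so that each thread reduces four adjacent columns per row with a single wide load. The standalone CUDA sketch below illustrates that same column-reduce technique outside of Paddle; the kernel name ColReduceVec and the sizes used in main are hypothetical and chosen only for illustration, assuming a float matrix whose width is a multiple of 4.

// column_reduce_sketch.cu -- illustrative only, compile with nvcc.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Pack Size elements so that one load moves sizeof(T) * Size aligned bytes.
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVector {
  T val[Size];
};

// Each thread owns VEC_SIZE consecutive columns and walks down the rows,
// reading VEC_SIZE values per row with a single vectorized load, then writes
// the VEC_SIZE column sums.
template <typename T, int VEC_SIZE>
__global__ void ColReduceVec(const T *__restrict__ in, T *out, size_t width,
                             size_t height) {
  using LoadT = AlignedVector<T, VEC_SIZE>;
  size_t w = (threadIdx.x + blockIdx.x * blockDim.x) * VEC_SIZE;
  size_t stride = blockDim.x * gridDim.x * VEC_SIZE;
  for (; w < width; w += stride) {
    T sum[VEC_SIZE] = {static_cast<T>(0)};
    for (size_t row = 0; row < height; ++row) {
      LoadT v = *reinterpret_cast<const LoadT *>(&in[row * width + w]);
      for (int i = 0; i < VEC_SIZE; ++i) sum[i] += v.val[i];
    }
    for (int i = 0; i < VEC_SIZE; ++i) out[w + i] = sum[i];
  }
}

int main() {
  const size_t width = 4096, height = 64;  // "long width": width >> height
  std::vector<float> h_in(width * height, 1.0f), h_out(width, 0.0f);
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, h_in.size() * sizeof(float));
  cudaMalloc(&d_out, h_out.size() * sizeof(float));
  cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(float),
             cudaMemcpyHostToDevice);

  // One thread per group of 4 columns, mirroring the patch's else-branch.
  const int threads = 256;
  const int blocks = static_cast<int>((width / 4 + threads - 1) / threads);
  ColReduceVec<float, 4><<<blocks, threads>>>(d_in, d_out, width, height);

  cudaMemcpy(h_out.data(), d_out, h_out.size() * sizeof(float),
             cudaMemcpyDeviceToHost);
  printf("out[0] = %f (expected %zu)\n", h_out[0], height);  // 64.0 here
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Packing four floats into one 16-byte load lets each thread issue a single 128-bit transaction per row instead of four 32-bit ones, which appears to be the rationale for the patch taking this path only when the pointer is suitably aligned (VectorizedSize returns 4) and the width is a multiple of 4.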