From 5296294dae9c1593906b7a3200cb688d9588ceb3 Mon Sep 17 00:00:00 2001
From: zhaoyuchen2018 <45989343+zhaoyuchen2018@users.noreply.github.com>
Date: Tue, 20 Aug 2019 20:53:18 +0800
Subject: [PATCH] Fix poor elementwise performance issue (#19278)

For small cases, using a 1D block is better than using a 2D block.

Refer to this issue: #19275
---
 .../elementwise/elementwise_op_function.h     | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 2b108efef4a..7d0256cc1cf 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -406,11 +406,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int h, int w, DX_OP dx_op, DY_OP dy_op,
                                        T *dx, T *dy) {
-  // suppose perfoemance improves with h increased.
-  dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
-  int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
-  FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
-      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  // For small cases (w or h below the threshold), use a 1D block.
+  constexpr int half_walf = 16;
+  if (w < half_walf || h < half_walf) {
+    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
+    int gird_size = w;
+    ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  } else {
+    // Assume performance improves as h increases.
+    dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
+    int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
+    FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
+        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  }
 }
 
 #endif
-- 
GitLab