From 5296294dae9c1593906b7a3200cb688d9588ceb3 Mon Sep 17 00:00:00 2001 From: zhaoyuchen2018 <45989343+zhaoyuchen2018@users.noreply.github.com> Date: Tue, 20 Aug 2019 20:53:18 +0800 Subject: [PATCH] Fix elementwise performance poor issue (#19278) For small case use 1D block is better than 2D block. Refer to this issue: #19275 --- .../elementwise/elementwise_op_function.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 2b108efef4a..7d0256cc1cf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -406,11 +406,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, const T *y, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - // suppose perfoemance improves with h increased. - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>( - x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + // For small case use 1D block + constexpr int half_walf = 16; + if (w < half_walf || h < half_walf) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>( + x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + } else { + // suppose perfoemance improves with h increased. + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>( + x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + } } #endif -- GitLab