diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 2b108efef4a34b5e03bd55cd59adfbfb0df67e22..7d0256cc1cf4ec7ba899212d9f618edeaa7facbc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -406,11 +406,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, const T *y, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - // suppose perfoemance improves with h increased. - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastElemwiseGradBroadcast1CUDAKernel<<>>( - x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + // For small case use 1D block + constexpr int half_walf = 16; + if (w < half_walf || h < half_walf) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + ElemwiseGradBroadcast1CUDAKernel<<>>( + x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + } else { + // suppose perfoemance improves with h increased. + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<>>( + x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + } } #endif