未验证 提交 5296294d 编写于 作者: Z zhaoyuchen2018 提交者: GitHub

Fix poor elementwise performance issue (#19278)

For small cases, using a 1D block is better than using a 2D block.

Refer to this issue: #19275
上级 6527a7df
...@@ -406,11 +406,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, ...@@ -406,11 +406,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
const T *y, const T *out, const T *dout, const T *y, const T *out, const T *dout,
int h, int w, DX_OP dx_op, DY_OP dy_op, int h, int w, DX_OP dx_op, DY_OP dy_op,
T *dx, T *dy) { T *dx, T *dy) {
// For small case use 1D block
constexpr int half_walf = 16;
if (w < half_walf || h < half_walf) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
int gird_size = w;
ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
} else {
// Assume performance improves as h increases. // Assume performance improves as h increases.
dim3 block_size = dim3(BLOCK_X, BLOCK_Y); dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
int grid_size = (w + BLOCK_X - 1) / BLOCK_X; int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>( FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, dx_op, dy_op, dx, dy); x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
}
} }
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册