From 4fe9ca6959c2ac68e00f5346d4916caaa85e6379 Mon Sep 17 00:00:00 2001
From: zhaoyuchen2018 <45989343+zhaoyuchen2018@users.noreply.github.com>
Date: Fri, 3 Apr 2020 17:19:31 +0800
Subject: [PATCH] improve elementwise performance. (#23405)

* improve elementwise performance.

* Add contiguous check, test=develop
---
 .../elementwise/elementwise_op_function.h     | 155 +++++++++++++++++-
 1 file changed, 149 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 paddle/fluid/operators/elementwise/elementwise_op_function.h

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
old mode 100644
new mode 100755
index b243a9a18e4..f6969611820
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -548,6 +548,64 @@ static __global__ void FastCommonGradBroadcastAllCUDAKernel(
   }
 }
 
+template <typename T, typename OP>
+static __global__ void FastCommonGradBroadcastOneCUDAKernel(
+    const T *x, const T *y, const T *out, const T *dout, int pre, int n,
+    int post, int y_pre, int y_n, int y_post, bool is_xsize, OP op, T *dd) {
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+
+  T val(0);
+  if (is_xsize) {
+    // do reduce for x
+    for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) {
+      int b_i = bid / post;
+      int b_j = bid % post;
+      int x_offset = b_i * n * post + b_j;
+      int out_offset = b_i * n * post + i * post + b_j;
+
+      // Get y pre rows id with x post and y_pre.
+      int b_yi = bid / (post * y_pre);
+      int b_yj = bid % y_post;
+      int y_offset = b_yi * y_n + i * y_post + b_yj;
+
+      if (dd) {
+        val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]);
+      }
+    }
+    if (dd) {
+      int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n;
+      val = paddle::platform::reduceSum(val, tid, h);
+      if (tid == 0) {
+        dd[bid] = val;
+      }
+    }
+  } else {
+    // do reduce for y
+    for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) {
+      int b_i = bid / post;
+      int b_j = bid % post;
+      int y_offset = b_i * n * post + b_j;
+      int out_offset = b_i * n * post + i * post + b_j;
+
+      int b_yi = bid / (post * y_pre);
+      int b_yj = bid % y_post;
+      int x_offset = b_yi * y_n + i * y_post + b_yj;
+
+      if (dd) {
+        val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]);
+      }
+    }
+    if (dd) {
+      int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n;
+      val = paddle::platform::reduceSum(val, tid, h);
+      if (tid == 0) {
+        dd[bid] = val;
+      }
+    }
+  }
+}
+
 // Check input can be split into 2 parts
 static inline bool SplitDims(const std::vector<int> &y_broadcast_pos,
                              int max_dim) {
@@ -568,6 +626,16 @@ static inline bool SplitDims(const std::vector<int> &y_broadcast_pos,
   return can_split_dim2;
 }
 
+// Suppose only has contiguous dims
+static inline bool CheckContiguousDims(const std::vector<int> &broadcast_pos) {
+  for (int i = 1; i < broadcast_pos.size(); ++i) {
+    if (broadcast_pos[i] != broadcast_pos[i - 1] + 1) {
+      return false;
+    }
+  }
+  return true;
+}
+
 template <typename T, typename DX_OP, typename DY_OP>
 void CommonGradBroadcastCUDA(
     const framework::Tensor &x, const framework::Tensor &y,
@@ -644,6 +712,7 @@ void CommonGradBroadcastCUDA(
       y_broadcast_pos.emplace_back(i);
     }
   }
+  auto stream = ctx.stream();
   bool can_split_x = false;
   bool can_split_y = false;
 
@@ -751,10 +820,22 @@ void CommonGradBroadcastCUDA(
     int axis = broadcast_pos[0];
     int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
                               std::multiplies<int>());
-    int mid = out_dims_array[axis];
-    int post =
-        std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, 1,
-                        std::multiplies<int>());
+    int mid = 1;
+    int post = 1;
+
+    if (broadcast_pos.size() == 1) {
+      mid = out_dims_array[axis];
+      post =
+          std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim,
+                          1, std::multiplies<int>());
+    } else {
+      mid = std::accumulate(out_dims_array + axis,
+                            out_dims_array + broadcast_pos.back() + 1, 1,
+                            std::multiplies<int>());
+      post =
+          std::accumulate(out_dims_array + broadcast_pos.back() + 1,
+                          out_dims_array + max_dim, 1, std::multiplies<int>());
+    }
     VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid
             << " post:" << post;
 
@@ -767,6 +848,55 @@ void CommonGradBroadcastCUDA(
         dy_op, dx_data, dy_data);
   };
 
+  auto FastBroadCastOneCUDAF = [&](const std::vector<int> &broadcast_pos,
+                                   int max_dim, bool is_x) {
+    int axis = broadcast_pos[0];
+    int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
+                              std::multiplies<int>());
+    int mid = out_dims_array[axis];
+    int post =
+        std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, 1,
+                        std::multiplies<int>());
+
+    int k_pre;
+    int k_mid;
+    int k_post;
+
+    if (is_x) {
+      k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1,
+                              std::multiplies<int>());
+      k_mid = y_dims_array[axis];
+      k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim,
+                               1, std::multiplies<int>());
+      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
+      int grid_size = pre * post;
+      // we need to calc y offset with blockid, so do x_pre/y_pre to get left
+      // size.
+      if (k_pre != pre) k_pre = pre / k_pre;
+
+      FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
+                                             stream>>>(
+          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
+          k_post, true, dx_op, dx_data);
+    } else {
+      k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1,
+                              std::multiplies<int>());
+      k_mid = x_dims_array[axis];
+      k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim,
+                               1, std::multiplies<int>());
+      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
+      int grid_size = pre * post;
+      if (k_pre != pre) k_pre = pre / k_pre;
+
+      FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
+                                             stream>>>(
+          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
+          k_post, false, dy_op, dy_data);
+    }
+    VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid
+            << " post:" << post;
+  };
+
   // do fast elementwise if: 1. only one input need to do broadcast, we can
   // fallback
   // to old fast path.
@@ -781,7 +911,9 @@ void CommonGradBroadcastCUDA(
         LOG(ERROR) << "Error, broadcast should not into w broadcast";
       }
       return;
-    } else if (y_broadcast_pos.size() == 1) {  // for only one dim broadcast.
+    } else if (y_broadcast_pos.size() == 1 ||
+               CheckContiguousDims(y_broadcast_pos)) {  // for only one dim and
+                                                        // contiguous broadcast.
       // If cannot split, which means input has 3 parts
       FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true);
       return;
@@ -797,7 +929,8 @@ void CommonGradBroadcastCUDA(
         LOG(ERROR) << "Error, broadcast should not into w broadcast";
       }
       return;
-    } else if (x_broadcast_pos.size() == 1) {
+    } else if (x_broadcast_pos.size() == 1 ||
+               CheckContiguousDims(x_broadcast_pos)) {
       FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false);
       return;
     }
@@ -812,6 +945,9 @@ void CommonGradBroadcastCUDA(
       // finish at end
       LOG(ERROR) << "Error, broadcast should not into w broadcast";
     }
+  } else if (y_broadcast_pos.size() == 1) {
+    FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false);
+    can_split_y = true;
   }
   can_split_x = SplitDims(x_broadcast_pos, max_dim);
   if (can_split_x) {
@@ -820,6 +956,9 @@ void CommonGradBroadcastCUDA(
     } else {
       LOG(ERROR) << "Error, broadcast should not into w broadcast";
     }
+  } else if (x_broadcast_pos.size() == 1) {
+    FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true);
+    can_split_x = true;
   }
   VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y
           << " can_split_x:" << can_split_x;
@@ -1492,6 +1631,10 @@ void CommonElementwiseBroadcastBackward(
     dx->mutable_data<T>(x_dims, ctx.GetPlace());
   }
 
+  VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
+          << framework::make_ddim(x_dims_array)
+          << " ydim:" << framework::make_ddim(y_dims_array);
+
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef __NVCC__
     CommonGradBroadcastCUDA<T, DX_OP, DY_OP>(
-- 
GitLab
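
Background note (not part of the patch above): one of the two fast paths added here, the contiguous-run case routed to FastBroadCastAllCUDAF, rests on two pieces. CheckContiguousDims() accepts a broadcast only when the broadcast axes form one unbroken run, and the run is then folded into a single (pre, mid, post) factorization of the output shape, so the existing single-axis reduction kernel can be reused unchanged. The host-only C++ sketch below restates that factorization outside of Paddle to show the arithmetic; CheckContiguousDims and the std::accumulate calls mirror the diff, while main(), the sample shapes, and the printed output are illustrative only.

    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>

    // True when the broadcast axes form one unbroken run, e.g. {1, 2}.
    static bool CheckContiguousDims(const std::vector<int> &broadcast_pos) {
      for (size_t i = 1; i < broadcast_pos.size(); ++i) {
        if (broadcast_pos[i] != broadcast_pos[i - 1] + 1) return false;
      }
      return true;
    }

    int main() {
      // Output shape [4, 8, 16, 32]; one input broadcasts over axes 1 and 2.
      std::vector<int> out_dims = {4, 8, 16, 32};
      std::vector<int> broadcast_pos = {1, 2};
      if (!CheckContiguousDims(broadcast_pos)) return 1;

      int axis = broadcast_pos.front();
      int last = broadcast_pos.back();
      // pre  = product of dims before the run -> 4
      // mid  = product of dims inside the run -> 8 * 16 = 128
      // post = product of dims after the run  -> 32
      int pre = std::accumulate(out_dims.begin(), out_dims.begin() + axis, 1,
                                std::multiplies<int>());
      int mid = std::accumulate(out_dims.begin() + axis,
                                out_dims.begin() + last + 1, 1,
                                std::multiplies<int>());
      int post = std::accumulate(out_dims.begin() + last + 1, out_dims.end(),
                                 1, std::multiplies<int>());
      std::printf("pre=%d mid=%d post=%d\n", pre, mid, post);
      return 0;
    }

With pre, mid, and post in hand, the gradient reduction for the broadcast input is a sum over mid elements at each of the pre * post output positions, which is why a contiguous multi-axis broadcast can be routed through the same FastBroadCastAllCUDAF launch that previously served only the single-axis case.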