diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 5c7830353093ebbf6a5216be497ea6f1f1c21909..600524936b079fb59e4774f477d272d92c06bdf9 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #ifdef __NVCC__
 #include <thrust/iterator/iterator_adaptor.h>
+#include "paddle/fluid/platform/cuda_helper.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif
 
@@ -361,13 +362,10 @@ template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int h, int w,
     DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  shm[tid] = 0;
+  T val = 0;
 
   do {
     int x_offset = i * w + j;
@@ -375,22 +373,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
       dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
     if (dy) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
     i += ELEMWISE_MAX_BLOCK_DIM;
   } while (i < h);
 
   if (dy) {
-    __syncthreads();
-
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
+    val = platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
-      for (int k = 1; k < h; ++k) {
-        shm[0] += shm[k];
-      }
-      dy[j] = shm[0];
+      dy[j] = val;
     }
   }
 }
@@ -402,10 +394,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
                                        T* dx, T* dy) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
   int gird_size = w;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, h, w, dx_op,
-                                               dy_op, dx, dy);
+  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
 }
 
 #endif
@@ -436,7 +426,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
 }
 
 #ifdef __NVCC__
-
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int pre, int n,
@@ -444,9 +433,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-  shm[tid] = 0;
+  T val = 0;
   int ttid = tid;
 
   while (true) {
@@ -461,23 +448,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     }
 
     if (dy != nullptr) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
 
     ttid += ELEMWISE_MAX_BLOCK_DIM;
   }
 
   if (dy) {
-    __syncthreads();
     int h = pre * post;
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
-    if (tid == 0) {
-      for (int i = 1; i < h; ++i) {
-        shm[0] += shm[i];
-      }
-      dy[j] = shm[0];
+    val = platform::reduceSum(val, tid, h);
+    if (threadIdx.x == 0) {
+      dy[j] = val;
     }
   }
 }
@@ -489,10 +471,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
                                        DY_OP dy_op, T* dx, T* dy) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
   int gird_size = n;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, pre, n, post,
-                                               dx_op, dy_op, dx, dy);
+  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
 }
 
 #endif
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 881d611d4ac26f992036f639097815aff625227b..a4ea4f21e3c16c9292cf67863616924e9d9f8aba 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif
 
+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the
+  // parameters of the GPU but not specified as 32 simply.
+  // To make the reduceSum more efficiently,
+  // I use Warp-Level Parallelism and assume the Warp size
+  // is 32 which may be different for different GPU,
+  // but most card's warp size is 32.
+  __shared__ T shm[32];
+  const int warpSize = 32;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += __shfl_down_sync(mask, val, offset);
+  }
+
+  return val;
+}
+
 }  // namespace platform
 }  // namespace paddle
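
Below is a standalone sketch, not part of the diff above, showing the warp-shuffle block reduction that reduceSum implements, wrapped in a tiny test harness. It assumes CUDA 9.0+ (__shfl_down_sync / __ballot_sync), a warp size of 32, and blockDim.x <= 1024; BlockReduceSum and SumKernel are illustrative names, not Paddle APIs. Unlike the patch, the sketch adds a second __syncthreads() so warp 0 only reads the per-warp partials after every warp has published them.

// Sketch of a block-wide sum built from warp shuffles (mirrors reduceSum).
// BlockReduceSum/SumKernel are hypothetical names for illustration only.
#include <cstdio>

#include <cuda_runtime.h>

template <typename T>
__device__ T BlockReduceSum(T val, int tid, int len) {
  const int kWarpSize = 32;
  __shared__ T shm[32];  // one partial sum per warp (at most 1024 / 32 warps)
  unsigned mask = __ballot_sync(0xFFFFFFFFu, tid < len);

  // Step 1: reduce within each warp using register shuffles.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(mask, val, offset);

  // Step 2: lane 0 of every warp publishes its partial sum to shared memory.
  if (tid < kWarpSize) shm[tid] = 0;
  __syncthreads();
  if (tid % kWarpSize == 0) shm[tid / kWarpSize] = val;
  __syncthreads();  // extra sync vs. the patch: partials visible to warp 0

  // Step 3: the first warp reduces the per-warp partials.
  mask = __ballot_sync(0xFFFFFFFFu, tid < kWarpSize);
  if (tid < kWarpSize) {
    val = shm[tid];
    for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
      val += __shfl_down_sync(mask, val, offset);
  }
  return val;  // the full block sum, valid in thread 0
}

// One block strides over n elements and thread 0 writes the total.
__global__ void SumKernel(const float* x, int n, float* out) {
  float val = 0;
  for (int i = threadIdx.x; i < n; i += blockDim.x) val += x[i];
  int len = n < static_cast<int>(blockDim.x) ? n : static_cast<int>(blockDim.x);
  val = BlockReduceSum(val, threadIdx.x, len);
  if (threadIdx.x == 0) *out = val;
}

int main() {
  const int n = 1 << 16;
  float *x = nullptr, *out = nullptr;
  cudaMallocManaged(&x, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) x[i] = 1.0f;
  SumKernel<<<1, 1024>>>(x, n, out);
  cudaDeviceSynchronize();
  printf("sum = %.0f (expected %d)\n", *out, n);
  cudaFree(x);
  cudaFree(out);
  return 0;
}

This pattern is why both launch sites in the patch drop shared_mem_size: each thread keeps its running sum in a register (val), the shuffle reduction needs only a fixed 32-entry shared buffer inside reduceSum, and the serial loop in thread 0 over block_size shared-memory slots disappears, so the dynamic shared-memory argument of the kernel launch becomes 0.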