From 4fbde42cdf2a10c9dc69f36ce911ca3bdadf22dd Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 3 May 2018 09:28:35 +0800 Subject: [PATCH] Fix __shfl_down_sync_ of cross_entropy (#10345) * fix __shfl_down_sync_ of cross_entropy * use reduceSum * "fix ci" --- .../fluid/operators/elementwise_op_function.h | 42 +---------- paddle/fluid/operators/math/cross_entropy.cu | 65 +++------------- paddle/fluid/operators/row_conv_op.cu | 2 +- paddle/fluid/platform/cuda_device_function.h | 74 +++++++++++++++++++ paddle/fluid/platform/cuda_primitives.h | 13 ---- 5 files changed, 88 insertions(+), 108 deletions(-) create mode 100644 paddle/fluid/platform/cuda_device_function.h diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 953aedc8506..8b052611f80 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef __NVCC__ #include #include +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif @@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } #ifdef __NVCC__ - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - } - - return val; -} - template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int h, int w, @@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } @@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( if (dy) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 6d2ba2bd0d6..0de58d5fddd 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, } } -template -__device__ __forceinline__ T sum_single_warp(T val) { - val += platform::__shfl_down_sync(0, val, 16); - val += platform::__shfl_down_sync(0, val, 8); - val += platform::__shfl_down_sync(0, val, 4); - val += platform::__shfl_down_sync(0, val, 2); - val += platform::__shfl_down_sync(0, val, 1); - return val; -} - -// CUDA do not support dynamic arrary in template -// https://stackoverflow.com/questions/20497209 -template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ T* GetPointer() { return NULL; } -}; - -template <> -struct SharedMemory { - __device__ float* GetPointer() { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMemory { - __device__ double* GetPointer() { - extern __shared__ double s_double[]; - return s_double; - } -}; - template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int class_num) { int tid = threadIdx.x; - SharedMemory d_sum_shared; - T* d_sum = d_sum_shared.GetPointer(); - d_sum[tid] = 0; + T val = 0; - int cur_idx = tid; - int next_idx = blockIdx.x * class_num + tid; - while (cur_idx < class_num) { - d_sum[tid] += - math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; - next_idx += blockDim.x; - cur_idx += blockDim.x; + int idx = blockIdx.x * class_num + tid; + int end = blockIdx.x * class_num + class_num; + for (; idx < end; idx += blockDim.x) { + val += math::TolerableValue()(std::log(X[idx])) * label[idx]; } - __syncthreads(); - for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); + val = paddle::platform::reduceSum(val, tid, blockDim.x); + if (threadIdx.x == 0) { + Y[blockIdx.x] = -val; } - - T val = d_sum[tid]; - val = sum_single_warp(val); - if (tid == 0) Y[blockIdx.x] = -val; } } // namespace @@ -113,9 +70,7 @@ class CrossEntropyFunctor { ? 512 : pow(2, static_cast(std::log2(class_num))); - SoftCrossEntropyKernel<<< - batch_size, block, block * sizeof(T), - reinterpret_cast(ctx).stream()>>>( + SoftCrossEntropyKernel<<>>( loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index dd8e62aca47..79d08cf3d1e 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h new file mode 100644 index 00000000000..7cfeaab35b8 --- /dev/null +++ b/paddle/fluid/platform/cuda_device_function.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace platform { + +// __shfl_down and __shfl have been deprecated as of CUDA 9.0. +#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} + +template +__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, + int width) { + return __shfl(val, src_line, width); +} +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 866ff30a8be..8758af0804a 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -66,18 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { } #endif -// __shfl_down has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - } // namespace platform } // namespace paddle -- GitLab