diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu
index 3fd8995745acb4ad81d4473e7cd91da5b892522b..e7bf6d212dcf1730c32ed794576d569dff96f215 100644
--- a/paddle/fluid/operators/bilateral_slice_op.cu
+++ b/paddle/fluid/operators/bilateral_slice_op.cu
@@ -472,8 +472,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
     grid_sizes.gw = gw;
     grid_sizes.input_chans = input_chans;
 
-    platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
-        ctx.cuda_device_context(), grid_count, 512);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
 
     BilateralSliceCudaGridGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
             ctx.cuda_device_context().stream()>>>(
         grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
         has_offset, grid_count, output_chans);
 
-    config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
-                                            guide_count, 512);
+    config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
 
     BilateralSliceCudaGuideGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
             ctx.cuda_device_context().stream()>>>(
         guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
         grid_sizes, has_offset, guide_count, output_chans);
 
-    config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
-                                            input_count, 512);
+    config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
 
     BilateralSliceCudaInputGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
             ctx.cuda_device_context().stream()>>>(
diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
     int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
-    int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
-    int grid_size =
-        ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
-
+    auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
     auto offset =
-        ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size;
+        ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
     GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
                             &seed_data, &increment);
@@ -206,23 +203,25 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
 #ifdef __HIPCC__
     if (vec_size == 4 && size % 4 == 0) {
       hipLaunchKernelGGL(
-          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), grid_size,
-          block_size, 0, stream, size, seed_data, dropout_prob, x_data,
-          mask_data, y_data, upscale_in_train, increment);
+          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
+          gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
+          seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
+          increment);
     } else {
       hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
-                         grid_size, block_size, 0, stream, size, seed_data,
-                         dropout_prob, x_data, mask_data, y_data,
-                         upscale_in_train, increment);
+                         gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
+                         stream, size, seed_data, dropout_prob, x_data,
+                         mask_data, y_data, upscale_in_train, increment);
     }
 #else
     if (vec_size == 4 && size % 4 == 0) {
-      VectorizedRandomGenerator<T, uint8_t, 4><<<grid_size, block_size, 0,
-                                                 stream>>>(
+      VectorizedRandomGenerator<T, uint8_t, 4><<<
+          gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train, increment);
     } else {
-      RandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
+      RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
+                                    gpu_config.thread_per_block, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train, increment);
     }
@@ -265,7 +264,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
       auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
       auto stream = dev_ctx.stream();
       platform::GpuLaunchConfig config =
-          platform::GetGpuLaunchConfig1D(dev_ctx, size);
+          platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
       DropoutGradCUDAKernel<
           T, uint8_t, 4><<<config.block_per_grid, config.thread_per_block, 0,
                            stream>>>(
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index b5c19a3edb81869becc516b3c223402e4fe775ea..779779b44da8d1df275b057bbb9d37828c6904ed 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -128,10 +128,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
   } else if (dx_data != dout_data && dy_data != dout_data) {
     auto size = x->numel();
     int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
-    dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+    dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
     dim3 grid_size =
-        dim3(((size + vec_size - 1) / vec_size + ELEMENTWISE_BLOCK_SIZE - 1) /
-                 ELEMENTWISE_BLOCK_SIZE,
+        dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
+                 PREDEFINED_BLOCK_SIZE,
              1);
     SimpleElemwiseAddGradCUDAKernel<
         T><<<grid_size, block_size, 0,
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
     auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
     if (dy->dims() == dout->dims()) {
       if (dy_data != dout_data) {
-        dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+        dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
         auto size = dy->numel();
-        dim3 grid_size = dim3(
-            (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
+        dim3 grid_size =
+            dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
         SimpleElemwiseSubGradCUDAKernel<T><<<
             grid_size, block_size, 0,
             ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
@@ -100,10 +100,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx,
                      const framework::Tensor* out,
                      const framework::Tensor* dout, framework::Tensor* dx,
                      framework::Tensor* dy) {
-  dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+  dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
   auto size = x->numel();
   dim3 grid_size =
-      dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
+      dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
   SimpleElemwiseSubGradCUDAKernel<
       T><<<grid_size, block_size, 0,
            ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index c5b1fd939295043f669c1cbe83f77d5f425b9ca0..ebda9bbaa8b81b8b147ccc21ddbb047d1e6df1f7 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -23,6 +23,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu
index 40a968b8a397d5097e32aae3443a381aa50528d9..4260d0516e3cccefaf0cff5ea4b4441af96ef146 100644
--- a/paddle/fluid/operators/index_sample_op.cu
+++ b/paddle/fluid/operators/index_sample_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/index_sample_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index 0cc552d34c5872165728c9186ff8fcb7d496627c..cec688262604a10cdce04d9cca324f324196c652 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/beam_search.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index 076d3aa3361f0f2cab8126d800dffae3804b44ab..9d96345eb1f6dca6fc5eb6cf5847baaf1a9019da 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -16,17 +16,10 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 
-#ifdef __HIPCC__
-#define POOLING_BLOCK_SIZE 256
-#else
-#define POOLING_BLOCK_SIZE 512
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -97,22 +90,6 @@ __device__ void OffsetPreparationFor4Dimension(
   }
 }
 
-int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx,
-                       int threads_per_block, int64_t numel) {
-  int sm_count = ctx.GetSMCount();
-  if (numel / (sm_count << 1) < threads_per_block) {
-    // Round up threads number into an exponential multiple of 2, while number
-    // of acitve blocks is about twice of SM, to acquire better performance.
-    threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1));
-  } else if (numel / (sm_count << 2) < threads_per_block) {
-    // Round up threads number into an exponential multiple of 2, while number
-    // of acitve blocks is about 4 times of SM, to acquire better performance.
-    threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2));
-  }
-  // Number of threads per block shall be larger than 64.
-  return std::max(64, threads_per_block);
-}
-
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2D(
     const int nthreads, const T* input_data, const int channels,
@@ -491,14 +468,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
-    int grids = (nthreads + blocks - 1) / blocks;
-
     auto pool_divmods = FastDivModForPoolingWithMoreStaff(
         input_channels, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height);
 
-    KernelPool2DGrad<<<grids, blocks, 0, context.stream()>>>(
+    auto config = GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<<<
+        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, output_width,
         output_height, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height, padding_width, padding_height,
@@ -541,14 +517,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
-    int grids = (nthreads + blocks - 1) / blocks;
-
    auto pool_divmods = FastDivModForPoolingWithMoreStaff(
         input_channels, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height);
 
-    KernelPool2DGrad<<<grids, blocks, 0, context.stream()>>>(
+    auto config = GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<<<
+        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, output_width,
         output_height, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height, padding_width, padding_height,
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
index 7fe2367b5510eb063214a422c679f414c59089f7..cd78a89088cc612c3fb43e489cfb7ef2e07cfcf3 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
@@ -26,22 +26,6 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 
-inline static int RoundToPowerOfTwo(int dim) {
-  if (dim > 512) {
-    return 1024;
-  } else if (dim > 256) {
-    return 512;
-  } else if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
 #define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
   case (dim): {                            \
     constexpr auto kPowerOfTwoDim = (dim); \
diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
index 55f4c8eb4cd55ee42f2095f697724493cc35909b..883767348f06a99c32664ca2575880737b7418b5 100644
--- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h
+++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Used for compute gpu launch parameter
+// Used for compute gpu launch parameter config
 
 #pragma once
 
@@ -30,11 +30,36 @@
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
 
+#ifdef __HIPCC__
+// HIP results in error or nan if > 256
+#define PREDEFINED_BLOCK_SIZE 256
+#else
+/* CUDA performs better when thread_per_block
+   num is between [64, 512] */
+#define PREDEFINED_BLOCK_SIZE 512
+#endif
+
 namespace paddle {
 namespace platform {
 
 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
 
+/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+   for rounding an integer value up to the next highest power of 2. */
+static inline int RoundToPowerOfTwo(int n) {
+  n--;
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+#ifdef __HIPCC__
+  return std::min(256, std::max(32, (n + 1)));
+#else
+  return std::min(1024, std::max(32, (n + 1)));
+#endif
+}
+
 #ifdef WITH_NV_JETSON
 // The number of threads cannot be assigned 1024 in some cases when the device
 // is nano or tx2 .
@@ -48,54 +73,64 @@ inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
 #endif
 
 struct GpuLaunchConfig {
-  dim3 theory_thread_count = dim3(1, 1, 1);
+ public:
+  GpuLaunchConfig() {}
+
+  size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
+
+  size_t GetGridSize() const {
+    return block_per_grid.x * block_per_grid.y * block_per_grid.z;
+  }
+
+  size_t GetBlockSize() const {
+    return thread_per_block.x * thread_per_block.y * thread_per_block.z;
+  }
+
+  int compute_capability = 0;
   dim3 thread_per_block = dim3(1, 1, 1);
   dim3 block_per_grid = dim3(1, 1, 1);
-  int compute_capability = 0;
 };
 
+/* According to NVIDIA, if number of threads per block is 64/128/256/512,
+ * cuda performs better. And number of blocks should be greater (at least
+ * 2x~4x) than number of SMs. Hence, SM count is taken into account within
+ * this function to determine the right number of threads per block.
+ */
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
-    const platform::CUDADeviceContext& context, int64_t element_count,
-#ifdef PADDLE_WITH_HIP
-    // HIP will throw GPU memory access fault if threads > 256
-    int max_threads = 256) {
-#else
-    int max_threads = 1024) {
-#endif
-  PADDLE_ENFORCE_GT(element_count, 0,
-                    platform::errors::InvalidArgument(
-                        "element count should be greater than 0,"
-                        " but received value is: %d.",
-                        element_count));
-
-  const int theory_thread_count = element_count;
-  // Get Max threads in all SM
-  int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  int sm = context.GetSMCount();
-
-  // Compute physical threads we need, should small than max sm threads
-  const int physical_thread_count =
-      (std::min)(max_physical_threads, theory_thread_count);
-
+    const platform::CUDADeviceContext& context, int64_t numel,
+    int vec_size = 1) {
+  PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument(
+                                  "element quantity should be greater than 0,"
+                                  " but received value is: %d.",
+                                  numel));
   // Get compute_capability
   const int capability = context.GetComputeCapability();
-
+  /* If thread number per block is 64/128/256/512, cuda performs better.*/
+  int limit_threads =
+      std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
 #ifdef WITH_NV_JETSON
   if (capability == 53 || capability == 62) {
-    max_threads = 512;
+    limit_threads = 512;
   }
 #endif
-
-  // Need get from device
-  const int thread_per_block =
-      (std::min)(max_threads, context.GetMaxThreadsPerBlock());
-  const int block_count =
-      (std::min)(DivUp(physical_thread_count, thread_per_block), sm);
+  int threads = limit_threads;
+  int sm_count = context.GetSMCount();
+  int active_threads_num = numel / vec_size;
+  if (active_threads_num / (sm_count << 1) < limit_threads) {
+    // Round up threads number into an exponential multiple of 2, while number
+    // of active blocks is about twice of SM, to acquire better performance.
+    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
+  } else if (active_threads_num / (sm_count << 2) < limit_threads) {
+    // Round up threads number into an exponential multiple of 2, while number
+    // of active blocks is about 4 times of SM, to acquire better performance.
+    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
+  }
+  // Number of threads per block shall be larger than 64.
+  threads = std::max(64, threads);
+  int blocks = DivUp(DivUp(numel, vec_size), threads);
 
   GpuLaunchConfig config;
-  config.theory_thread_count.x = theory_thread_count;
-  config.thread_per_block.x = thread_per_block;
-  config.block_per_grid.x = block_count;
+  config.thread_per_block.x = threads;
+  config.block_per_grid.x = blocks;
   config.compute_capability = capability;
   return config;
 }
@@ -120,7 +155,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
 
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
-  config.theory_thread_count = dim3(x_dim, y_dim, 1);
   config.thread_per_block = dim3(block_cols, block_rows, 1);
 
   int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index 2263383f8fabb0c6b4d8cd30af4cf491f08520eb..13ffc2396946c5819c9276cf474d96a8057c4094 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -24,19 +24,6 @@ namespace platform {
 
 #define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
 
-inline static int RoundToPowerOfTwo(int dim) {
-  // HIP results in error or nan if > 256
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
 #define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
   case (dim): {                            \
     constexpr auto kPowerOfTwoDim = (dim); \
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h
index e4cc894e48354b8d7e91aeee74384a7df0891ff3..049e430154a8ba1f209f885e6dd06383c2b65499 100644
--- a/paddle/pten/kernels/gpu/elementwise.h
+++ b/paddle/pten/kernels/gpu/elementwise.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
 
 namespace pten {
 
@@ -239,18 +239,15 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
                 VecSize><<<grid_size, block_size, 0, stream>>>(
       ins_data, outs_data, numel, main_offset, func);
 #else
-  int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
-  int grid_size =
-      ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
-  int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
+  auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
+  int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
+                    gpu_config.GetBlockSize();
   auto stream = ctx.stream();
-  VectorizedElementwiseKernel<<<grid_size, block_size, 0, stream>>>(
-      ins_data, outs_data, numel, main_offset, func);
+  VectorizedElementwiseKernel<<<
+      gpu_config.block_per_grid,
+      gpu_config.thread_per_block,
+      0,
+      stream>>>(ins_data, outs_data, numel, main_offset, func);
 #endif
 }
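
As a quick way to see what the unified heuristic in gpu_launch_config.h does, here is a self-contained, host-only sketch in plain C++ (no CUDA toolkit needed). It is illustrative only: FakeDeviceInfo, Model1DLaunch, kPredefinedBlockSize and the 80-SM example device are hypothetical stand-ins introduced for this sketch, not part of the Paddle API; the real GetGpuLaunchConfig1D reads the SM count and thread limit from platform::CUDADeviceContext, and the rounding and block math below simply mirror the RoundToPowerOfTwo and DivUp calls added in the hunks above.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the values the real code queries through
// platform::CUDADeviceContext (GetSMCount() / GetMaxThreadsPerBlock()).
struct FakeDeviceInfo {
  int sm_count;
  int max_threads_per_block;
};

// Same bit trick as the RoundToPowerOfTwo added to gpu_launch_config.h,
// shown here with the CUDA cap of 1024 (the HIP build caps at 256 instead).
static int RoundToPowerOfTwo(int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));
}

static int DivUp(int64_t a, int64_t b) {
  return static_cast<int>((a + b - 1) / b);
}

// Host-only model of the unified 1D heuristic: start from a 512-thread cap
// (PREDEFINED_BLOCK_SIZE on CUDA), shrink the block once roughly 2x or 4x the
// SM count of blocks would already cover the work, then derive the grid size.
static void Model1DLaunch(const FakeDeviceInfo& dev, int64_t numel,
                          int vec_size, int* threads_out, int* blocks_out) {
  const int kPredefinedBlockSize = 512;  // 256 under __HIPCC__
  int limit_threads =
      std::min(kPredefinedBlockSize, dev.max_threads_per_block);
  int threads = limit_threads;
  int active_threads = static_cast<int>(numel / vec_size);
  if (active_threads / (dev.sm_count << 1) < limit_threads) {
    threads = RoundToPowerOfTwo(active_threads / (dev.sm_count << 1));
  } else if (active_threads / (dev.sm_count << 2) < limit_threads) {
    threads = RoundToPowerOfTwo(active_threads / (dev.sm_count << 2));
  }
  threads = std::max(64, threads);  // never go below 64 threads per block
  *threads_out = threads;
  *blocks_out = DivUp(DivUp(numel, vec_size), threads);
}

int main() {
  FakeDeviceInfo dev{80, 1024};  // an 80-SM device, e.g. a V100-class GPU
  int threads = 0;
  int blocks = 0;
  Model1DLaunch(dev, /*numel=*/100000, /*vec_size=*/4, &threads, &blocks);
  std::printf("threads=%d blocks=%d\n", threads, blocks);  // threads=256 blocks=98
  return 0;
}

For 100000 elements read with a vectorization factor of 4 on this hypothetical 80-SM device, the sketch picks 256-thread blocks and a 98-block grid, whereas a fixed 512-thread block would have produced only 49 blocks; that is the same thread-versus-block trade-off the removed per-operator helpers (GetThreadsPerBlock in pooling.cu, funcs::GetThreadsConfig in the pten elementwise path) each made on their own before this patch centralized it.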