From acef85b2c6d75d588712d21a37d6a9d4a2af8adc Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Sun, 26 Dec 2021 14:30:55 +0800 Subject: [PATCH] improve forward performace (#38279) --- paddle/fluid/operators/dropout_impl.cu.h | 39 ++++++++++++------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index c97a523caa..7491d6189e 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" namespace paddle { namespace operators { @@ -180,9 +181,6 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, return; } - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size); - // increment is used to set the args(offset) of curand_init, which defines // offset in subsequence. // The detail: @@ -192,11 +190,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // same as the previous calls. uint64_t seed_data; uint64_t increment; - int vec_size = platform::GetVectorizedSize(x_data); - auto offset = ((x_numel - 1) / (config.block_per_grid.x * - config.thread_per_block.x * vec_size) + - 1) * - vec_size; + // VectorizedRandomGenerator use curand_uniform4, so we only support + // vec_size is 4; + int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; + int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size); + int grid_size = + ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size; + + auto offset = + ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size; GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); @@ -204,26 +206,23 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(VectorizedRandomGenerator), - config.block_per_grid, config.thread_per_block, 0, stream, size, - seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, - increment); + HIP_KERNEL_NAME(VectorizedRandomGenerator), grid_size, + block_size, 0, stream, size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment); } else { hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator), - config.block_per_grid, config.thread_per_block, 0, - stream, size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment); + grid_size, block_size, 0, stream, size, seed_data, + dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment); } #else if (vec_size == 4 && size % 4 == 0) { - VectorizedRandomGenerator< - T, uint8_t, - 4><<>>( + VectorizedRandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } else { - RandomGenerator<<>>( + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } -- GitLab