diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index c97a523caa7673c4acc3fe7cf9022b22071e26f4..7491d6189ebde3b4c70f16c0b9b6e66eea535605 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" namespace paddle { namespace operators { @@ -180,9 +181,6 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, return; } - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size); - // increment is used to set the args(offset) of curand_init, which defines // offset in subsequence. // The detail: @@ -192,11 +190,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // same as the previous calls. uint64_t seed_data; uint64_t increment; - int vec_size = platform::GetVectorizedSize(x_data); - auto offset = ((x_numel - 1) / (config.block_per_grid.x * - config.thread_per_block.x * vec_size) + - 1) * - vec_size; + // VectorizedRandomGenerator use curand_uniform4, so we only support + // vec_size is 4; + int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; + int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size); + int grid_size = + ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size; + + auto offset = + ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size; GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); @@ -204,26 +206,23 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(VectorizedRandomGenerator), - config.block_per_grid, config.thread_per_block, 0, stream, size, - seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, - increment); + HIP_KERNEL_NAME(VectorizedRandomGenerator), grid_size, + block_size, 0, stream, size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment); } else { hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator), - config.block_per_grid, config.thread_per_block, 0, - stream, size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment); + grid_size, block_size, 0, stream, size, seed_data, + dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment); } #else if (vec_size == 4 && size % 4 == 0) { - VectorizedRandomGenerator< - T, uint8_t, - 4><<>>( + VectorizedRandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } else { - RandomGenerator<<>>( + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); }