From 1e9127f688caf6e052bfec224982fefc4583a97c Mon Sep 17 00:00:00 2001
From: Zhang Ting
Date: Wed, 16 Dec 2020 16:50:35 +0800
Subject: [PATCH] improve dropout grad (#29605)

* improve grad perf
---
 paddle/fluid/operators/dropout_op.cu | 38 +++++-----------
 paddle/fluid/operators/dropout_op.h  | 68 ++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 2e4b9a1316..cf90b9eb52 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -27,22 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// aligned vector generates vectorized load/store on CUDA
-template <typename T, int Size>
-struct alignas(sizeof(T) * Size) AlignedVector {
-  T val[Size];
-};
-
-template <typename T>
-inline int VectorizedSize(const T* pointer) {
-  uint64_t address = reinterpret_cast<uint64_t>(pointer);
-  constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;  // NOLINT
-  if (address % vec4 == 0) {
-    return 4;
-  }
-  return 1;
-}
-
 template <typename T, typename MaskType>
 __global__ void RandomGenerator(const size_t n, uint64_t seed,
                                 const float dropout_prob, const T* src,
@@ -154,12 +138,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       return;
     }
 
-    int threads = 512;
-    int grid = (x_numel + threads - 1) / threads;
     const auto& dev_ctx = context.cuda_device_context();
-    int blocks_per_sm =
-        dev_ctx.GetMaxPhysicalThreadCount() / dev_ctx.GetSMCount() / threads;
-    grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(dev_ctx, size);
 
     // increment is used to set the args(offset) of curand_init, which defines
     // offset in subsequence.
@@ -171,8 +152,10 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     uint64_t seed_data;
     uint64_t increment;
     int vec_size = VectorizedSize(x_data);
-    auto offset =
-        ((x_numel - 1) / (threads * grid * vec_size) + 1) * vec_size;
+    auto offset = ((x_numel - 1) / (config.block_per_grid.x *
+                                    config.thread_per_block.x * vec_size) +
+                   1) *
+                  vec_size;
     int device_id = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace())
                         .GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
@@ -197,12 +180,15 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       increment = offset;
     }
 
-    if (vec_size == 4) {
-      VectorizedRandomGenerator<T, uint8_t, 4><<<grid, threads, 0, stream>>>(
+    if (vec_size == 4 && size % 4 == 0) {
+      VectorizedRandomGenerator<
+          T, uint8_t,
+          4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
           size, seed_data, dropout_prob, x_data, mask_data, y_data,
           upscale_in_train, increment);
     } else {
-      RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
+      RandomGenerator<T, uint8_t><<<config.block_per_grid,
+                                    config.thread_per_block, 0, stream>>>(
           size, seed_data, dropout_prob, x_data, mask_data, y_data,
           upscale_in_train, increment);
     }
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 161c4282ec..1f7f7ac224 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -17,13 +17,59 @@ limitations under the License. */
 #include <cstring>
 #include <random>
+#include <string>
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
 
+// aligned vector generates vectorized load/store on CUDA
+template <typename T, int Size>
+struct alignas(sizeof(T) * Size) AlignedVector {
+  T val[Size];
+};
+
+template <typename T>
+inline int VectorizedSize(const T* pointer) {
+  uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;  // NOLINT
+  if (address % vec4 == 0) {
+    return 4;
+  }
+  return 1;
+}
+
+#ifdef __NVCC__
+template <typename T, typename MaskType, int VecSize>
+__global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
+                                       const T factor, const int64_t size,
+                                       T* dx) {
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+  using LoadT = AlignedVector<T, VecSize>;
+  using MaskLoadT = AlignedVector<MaskType, VecSize>;
+
+  for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
+    T dout_vec[VecSize];
+    LoadT* value = reinterpret_cast<LoadT*>(&dout_vec);
+    *value = *reinterpret_cast<const LoadT*>(&dout[i]);
+
+    T dx_vec[VecSize];
+    MaskType mask_vec[VecSize];
+    MaskLoadT* mask_value = reinterpret_cast<MaskLoadT*>(&mask_vec);
+    *mask_value = *reinterpret_cast<const MaskLoadT*>(&mask[i]);
+
+#pragma unroll
+    for (int ii = 0; ii < VecSize; ii++) {
+      dx_vec[ii] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
+    }
+
+    *(reinterpret_cast<LoadT*>(&dx[i])) = *reinterpret_cast<LoadT*>(&dx_vec[0]);
+  }
+}
+#endif
+
 using Tensor = framework::Tensor;
 
 template <typename DeviceContext, typename T>
@@ -119,6 +165,7 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* mask = context.Input<Tensor>("Mask");
     grad_x->mutable_data<T>(context.GetPlace());
+    auto size = grad_x->numel();
 
     auto M = EigenVector<uint8_t>::Flatten(*mask);
     auto dX = EigenVector<T>::Flatten(*grad_x);
@@ -126,7 +173,6 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     auto dY = EigenVector<T>::Flatten(*grad_y);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
-
     auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
     if (dropout_implementation == "upscale_in_train") {
@@ -134,8 +180,24 @@ class DropoutGradKernel : public framework::OpKernel<T> {
       if (dropout_prob == 1.0f) {
        dX.device(place) = static_cast<T>(0) * dY;
       } else {
-        dX.device(place) =
-            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+        int vec_size = VectorizedSize(grad_y->data<T>());
+        if (platform::is_gpu_place(context.GetPlace()) && vec_size == 4 &&
+            size % 4 == 0) {
+#ifdef __NVCC__
+          auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+          auto stream = context.cuda_device_context().stream();
+          platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
+              context.cuda_device_context(), size);
+          DropoutGradCUDAKernel<
+              T, uint8_t,
+              4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
+              grad_y->data<T>(), mask->data<uint8_t>(), factor, size,
+              grad_x->data<T>());
+#endif
+        } else {
+          dX.device(place) =
+              dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+        }
       }
     } else {
       dX.device(place) = dY * M.cast<T>();
-- 
GitLab
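
Note on the grad fast path introduced above: the vectorized kernel only fires when the gradient pointer is 4-element aligned and the element count is divisible by 4, so each thread moves a whole AlignedVector<T, 4> (one 128-bit transaction for float) per load/store instead of four scalar accesses. Below is a minimal standalone sketch of the same pattern; the kernel name dropout_grad_vec, the 256-thread block size, and the launcher are illustrative only (not part of the patch), and it assumes size % 4 == 0 plus 16-byte-aligned pointers, exactly the preconditions the patch checks before taking this branch.

#include <cstdint>
#include <cuda_runtime.h>

// Pack Size elements so the compiler emits one wide load/store
// (e.g. 128-bit for four floats) instead of Size scalar transactions.
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVector {
  T val[Size];
};

// dx = dout * mask * factor, VecSize elements per thread per iteration
// (grid-stride loop, so any grid size covers the whole tensor).
template <typename T, typename MaskType, int VecSize>
__global__ void dropout_grad_vec(const T* dout, const MaskType* mask,
                                 const T factor, const int64_t size, T* dx) {
  using LoadT = AlignedVector<T, VecSize>;
  using MaskLoadT = AlignedVector<MaskType, VecSize>;
  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;

  for (int64_t i = idx * VecSize; i < size; i += stride) {
    LoadT dout_vec = *reinterpret_cast<const LoadT*>(&dout[i]);
    MaskLoadT mask_vec = *reinterpret_cast<const MaskLoadT*>(&mask[i]);
    LoadT dx_vec;
#pragma unroll
    for (int j = 0; j < VecSize; ++j) {
      dx_vec.val[j] = dout_vec.val[j] * static_cast<T>(mask_vec.val[j]) * factor;
    }
    *reinterpret_cast<LoadT*>(&dx[i]) = dx_vec;
  }
}

// Hypothetical host-side launcher for the upscale_in_train gradient:
// factor = 1 / (1 - p), mirroring the patch; the scalar fallback used for
// unaligned or non-multiple-of-4 sizes is omitted here.
template <typename T>
void dropout_grad_launch(const T* dout, const uint8_t* mask, float p,
                         int64_t size, T* dx, cudaStream_t stream) {
  const T factor = static_cast<T>(1.0f / (1.0f - p));
  const int threads = 256;
  const int blocks = static_cast<int>((size / 4 + threads - 1) / threads);
  dropout_grad_vec<T, uint8_t, 4><<<blocks, threads, 0, stream>>>(
      dout, mask, factor, size, dx);
}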
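
On the .cu side, the hand-rolled threads/grid/blocks_per_sm math is replaced by platform::GetGpuLaunchConfig1D. The idea is the usual occupancy-capped 1D configuration: request enough blocks to cover the element count, but no more than the device can keep resident at once, and let the grid-stride loop absorb the remainder. A generic stand-in under those assumptions (not Paddle's helper; the struct and function names here are made up for illustration) could look like:

#include <algorithm>
#include <cuda_runtime.h>

// Illustrative 1D launch-config helper in the spirit of
// platform::GetGpuLaunchConfig1D: cover `numel` elements with a grid no
// larger than what fits on the device simultaneously.
struct LaunchConfig1D {
  dim3 thread_per_block;
  dim3 block_per_grid;
};

inline LaunchConfig1D MakeLaunchConfig1D(int64_t numel, int device_id = 0) {
  constexpr int threads = 256;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device_id);

  // Blocks needed to give every element its own thread.
  int64_t wanted = (numel + threads - 1) / threads;
  // Blocks that can be resident at the same time across all SMs.
  int64_t resident = static_cast<int64_t>(prop.multiProcessorCount) *
                     (prop.maxThreadsPerMultiProcessor / threads);

  LaunchConfig1D config;
  config.thread_per_block = dim3(threads);
  config.block_per_grid = dim3(static_cast<unsigned int>(
      std::max<int64_t>(1, std::min(wanted, resident))));
  return config;
}

Capping the grid this way also interacts with the curand offset logic in the patch: the per-launch offset is derived from config.block_per_grid.x * config.thread_per_block.x * vec_size, so a bounded grid keeps that increment bounded as well.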