From 64f1485a79cb0b9b5a19fddebade0f6e0bf0bb3b Mon Sep 17 00:00:00 2001
From: Zhang Ting
Date: Fri, 25 Feb 2022 13:03:07 +0800
Subject: [PATCH] replace implementation with cuda kernel (#39795)

---
 paddle/fluid/operators/dropout_impl.cu.h | 37 ++++++++++++++----
 paddle/phi/kernels/funcs/functors.h      |  9 ++++--
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 21fdf69ac5..2fa956a2e6 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
 namespace operators {
@@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                 const Tensor& mask, int64_t size,
                                 Tensor* grad_x, bool is_test = false) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  auto dX = EigenVector<T>::Flatten(*grad_x);
-  auto dY = EigenVector<T>::Flatten(grad_y);
-
-  auto& place = *dev_ctx.eigen_device();
+  auto stream = dev_ctx.stream();
+  MT factor;
   if (is_test) {
     if (dropout_implementation == "upscale_in_train") {
-      dX.device(place) = static_cast<T>(1) * dY;
+      factor = static_cast<MT>(1.0f);
     } else {
-      dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
+      factor = static_cast<MT>(1.0f - dropout_prob);
     }
+    std::vector<const framework::Tensor*> ins = {&grad_y};
+    std::vector<framework::Tensor*> outs = {grad_x};
+    auto functor = phi::funcs::ScaleFunctor<T>(factor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   } else {
-    auto M = EigenVector<uint8_t>::Flatten(mask);
+    std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+    std::vector<framework::Tensor*> outs = {grad_x};
     if (dropout_implementation == "upscale_in_train") {
       if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
+#ifdef PADDLE_WITH_HIP
+        hipMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#else
+        cudaMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#endif
       } else {
-        auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
-        std::vector<framework::Tensor*> outs = {grad_x};
-        auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+        factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
         paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
-            dev_ctx, ins, &outs, functor);
+            dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
       }
     } else {
-      dX.device(place) = dY * M.cast<T>();
+      factor = static_cast<MT>(1.0f);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
    }
   }
 }
diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h
index 5657bb047d..d518a877b2 100644
--- a/paddle/phi/kernels/funcs/functors.h
+++ b/paddle/phi/kernels/funcs/functors.h
@@ -38,12 +38,15 @@ struct AddGradFunctor {
 
 template <typename T>
 struct ScaleFunctor {
-  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {}
 
-  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+  inline HOSTDEVICE T operator()(T ele) {
+    return static_cast<T>(static_cast<MT>(ele) * coeff_);
+  }
 
  private:
-  T coeff_;
+  MT coeff_;
 };
 
 template <typename T>
--
GitLab
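
Note (illustrative, not part of the patch): the driver above replaces the Eigen
expressions with a single fused elementwise kernel launch and keeps the scale
factor in the higher-precision type MT (e.g. float when T is FP16), casting back
to T only on store. Below is a minimal standalone CUDA sketch of that
compute-in-MT / store-in-T pattern; MPType and ScaleKernel are made-up names for
this example, not Paddle symbols.

// Sketch: scale an array by `factor`, accumulating in the wider type MT.
#include <cuda_fp16.h>
#include <cstdint>

template <typename T>
struct MPType { using type = T; };              // default: compute type == storage type
template <>
struct MPType<__half> { using type = float; };  // FP16 computes in float

template <typename T>
__global__ void ScaleKernel(const T* x, T* y, int64_t n,
                            typename MPType<T>::type factor) {
  using MT = typename MPType<T>::type;
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    // Load as T, multiply in MT, cast back to T on store.
    y[i] = static_cast<T>(static_cast<MT>(x[i]) * factor);
  }
}

// Example launch for the "upscale_in_train" backward scale of 1 / (1 - dropout_prob):
//   int threads = 256;
//   int blocks  = static_cast<int>((n + threads - 1) / threads);
//   ScaleKernel<<<blocks, threads>>>(dout, dx, n, 1.0f / (1.0f - dropout_prob));

Doing the multiply in MT rather than T is what the ScaleFunctor change in
functors.h enables: for low-precision T the factor no longer has to be rounded
to T before the multiplication.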