diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 21fdf69ac570ac6972173d77194275d629ce436f..2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
 namespace operators {
@@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                 const Tensor& mask, int64_t size,
                                 Tensor* grad_x, bool is_test = false) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  auto dX = EigenVector<T>::Flatten(*grad_x);
-  auto dY = EigenVector<T>::Flatten(grad_y);
-
-  auto& place = *dev_ctx.eigen_device();
+  auto stream = dev_ctx.stream();
+  MT factor;
   if (is_test) {
     if (dropout_implementation == "upscale_in_train") {
-      dX.device(place) = static_cast<T>(1) * dY;
+      factor = static_cast<MT>(1.0f);
     } else {
-      dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
+      factor = static_cast<MT>(1.0f - dropout_prob);
     }
+    std::vector<const framework::Tensor*> ins = {&grad_y};
+    std::vector<framework::Tensor*> outs = {grad_x};
+    auto functor = phi::funcs::ScaleFunctor<T>(factor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   } else {
-    auto M = EigenVector<uint8_t>::Flatten(mask);
+    std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+    std::vector<framework::Tensor*> outs = {grad_x};
     if (dropout_implementation == "upscale_in_train") {
       if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
+#ifdef PADDLE_WITH_HIP
+        hipMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#else
+        cudaMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#endif
       } else {
-        auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
-        std::vector<framework::Tensor*> outs = {grad_x};
-        auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+        factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
         paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
-            dev_ctx, ins, &outs, functor);
+            dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
       }
     } else {
-      dX.device(place) = dY * M.cast<T>();
+      factor = static_cast<MT>(1.0f);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
     }
   }
 }
diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h
index 5657bb047d7aa3a9b0f65d845d03e04c5b3636ae..d518a877b26f2c3d295eb0ceda8d4b862006e633 100644
--- a/paddle/phi/kernels/funcs/functors.h
+++ b/paddle/phi/kernels/funcs/functors.h
@@ -38,12 +38,15 @@ struct AddGradFunctor {
 
 template <typename T>
 struct ScaleFunctor {
-  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {}
 
-  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+  inline HOSTDEVICE T operator()(T ele) {
+    return static_cast<T>(static_cast<MT>(ele) * coeff_);
+  }
 
  private:
-  T coeff_;
+  MT coeff_;
 };
 
 template <typename T>