Unverified · Commit 64f1485a · Authored by: Zhang Ting · Committed by: GitHub

replace implementation with cuda kernel (#39795)

Parent: bbe5228c
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
 namespace operators {
@@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                 const Tensor& mask, int64_t size,
                                 Tensor* grad_x, bool is_test = false) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  auto dX = EigenVector<T>::Flatten(*grad_x);
-  auto dY = EigenVector<T>::Flatten(grad_y);
-
-  auto& place = *dev_ctx.eigen_device();
+  auto stream = dev_ctx.stream();
+  MT factor;
   if (is_test) {
     if (dropout_implementation == "upscale_in_train") {
-      dX.device(place) = static_cast<T>(1) * dY;
+      factor = static_cast<MT>(1.0f);
     } else {
-      dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
+      factor = static_cast<MT>(1.0f - dropout_prob);
     }
+    std::vector<const framework::Tensor*> ins = {&grad_y};
+    std::vector<framework::Tensor*> outs = {grad_x};
+    auto functor = phi::funcs::ScaleFunctor<T>(factor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   } else {
-    auto M = EigenVector<uint8_t>::Flatten(mask);
+    std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+    std::vector<framework::Tensor*> outs = {grad_x};
     if (dropout_implementation == "upscale_in_train") {
       if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
+#ifdef PADDLE_WITH_HIP
+        hipMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#else
+        cudaMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#endif
       } else {
-        auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
-        std::vector<framework::Tensor*> outs = {grad_x};
-        auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+        factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
         paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
-            dev_ctx, ins, &outs, functor);
+            dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
       }
     } else {
-      dX.device(place) = dY * M.cast<T>();
+      factor = static_cast<MT>(1.0f);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
     }
   }
 }
......
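Note: CudaDropoutGradFunctor is defined elsewhere in this file and is not shown in the diff. As a hedged sketch only (the name below and everything inside it are assumptions inferred from the call sites above, not code from this commit), a functor of the shape consumed by LaunchSameDimsElementwiseCudaKernel would multiply the upstream gradient by the dropout mask and the scaling factor, doing the arithmetic in the wider type MT:

// Sketch only -- not part of this commit. Models what the dropout-grad functor
// passed to LaunchSameDimsElementwiseCudaKernel is assumed to compute:
// dx = dy * mask * factor, accumulated in the higher-precision type MT.
template <typename T, typename MaskType>
struct DropoutGradFunctorSketch {
  using MT = typename details::MPTypeTrait<T>::Type;
  explicit DropoutGradFunctorSketch(const MT factor) : factor_(factor) {}

  HOSTDEVICE inline T operator()(const T dout, const MaskType mask) const {
    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
                          factor_);
  }

 private:
  MT factor_;
};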
@@ -38,12 +38,15 @@ struct AddGradFunctor {
 
 template <typename T>
 struct ScaleFunctor {
-  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {}
 
-  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+  inline HOSTDEVICE T operator()(T ele) {
+    return static_cast<T>(static_cast<MT>(ele) * coeff_);
+  }
 
  private:
-  T coeff_;
+  MT coeff_;
 };
 
 template <typename T>
......
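For context on the ScaleFunctor change above: MPTypeTrait promotes low-precision element types to a wider compute type, so the coefficient is now stored and multiplied in that wider type and only the result is cast back to T. A minimal usage sketch, assuming T = paddle::platform::float16 so that MT resolves to float; keep_prob, in, and out are placeholder names, not identifiers from this commit:

// Illustrative only (assumes T = platform::float16, so MT = float).
// The coefficient is held as a float and the multiply happens in float.
float keep_prob = 0.9f;
phi::funcs::ScaleFunctor<paddle::platform::float16> scale(keep_prob);
paddle::platform::float16 in(0.5f);
paddle::platform::float16 out = scale(in);  // float16(float(in) * keep_prob)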