Unverified commit b81358d1, authored by sneaxiy, committed by GitHub

add dropout fp32 (#39501)

Parent 8cedcd3e
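This change threads a promoted compute type through the dropout CUDA kernels: details::MPTypeTrait<T>::Type (from fp16_type_traits.h) maps platform::float16 to float, so the 1.0f / (1.0f - dropout_prob) factor and the scaling multiply are evaluated in fp32 and only the final value is cast back to the storage type T. Below is a minimal standalone sketch of that pattern, assuming a CUDA translation unit; PromotedType and ScaleKept are illustrative names, not Paddle APIs.

// Standalone sketch of the promoted-type pattern applied by this commit
// (illustrative only; PromotedType / ScaleKept are not Paddle names).
#include <cuda_fp16.h>

// Map the 16-bit storage type to a wider compute type; wider types map to
// themselves.
template <typename T>
struct PromotedType {
  using Type = T;
};
template <>
struct PromotedType<__half> {
  using Type = float;
};

template <typename T>
__device__ __forceinline__ T ScaleKept(T src_val, float dropout_prob,
                                       bool is_upscale_in_train) {
  using MT = typename PromotedType<T>::Type;
  // The 1/(1-p) factor and the multiply are computed in MT (float for half
  // inputs); only the result is narrowed back to T.
  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
  return is_upscale_in_train
             ? static_cast<T>(static_cast<MT>(src_val) * factor)
             : src_val;
}

Computing the factor directly in the storage type, as the previous static_cast<T>(1.0f / (1.0f - dropout_prob)) did, rounds 1/(1-p) to half precision before every multiply when T is float16; promoting to float avoids that extra rounding in both the forward kernels and the gradient path below.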
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
@@ -45,6 +46,7 @@ __global__ void RandomGenerator(const size_t n, uint64_t seed,
                                 const float dropout_prob, const T* src,
                                 MaskType* mask, T* dst,
                                 bool is_upscale_in_train, uint64_t increment) {
+  using MT = typename details::MPTypeTrait<T>::Type;
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
 #ifdef PADDLE_WITH_HIP
   hiprandStatePhilox4_32_10_t state;
@@ -56,7 +58,7 @@ __global__ void RandomGenerator(const size_t n, uint64_t seed,
   MaskType mask_val;
   T dst_val;
-  T factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
   for (; idx < n; idx += blockDim.x * gridDim.x) {
     T src_val = src[idx];
 #ifdef PADDLE_WITH_HIP
@@ -68,7 +70,9 @@ __global__ void RandomGenerator(const size_t n, uint64_t seed,
       dst_val = 0;
     } else {
      mask_val = 1;
-      dst_val = is_upscale_in_train ? src_val * factor : src_val;
+      dst_val = is_upscale_in_train
+                    ? static_cast<T>(static_cast<MT>(src_val) * factor)
+                    : src_val;
     }
     mask[idx] = mask_val;
     dst[idx] = dst_val;
@@ -81,6 +85,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
                                           const T* src, MaskType* mask, T* dst,
                                           bool is_upscale_in_train,
                                           uint64_t increment) {
+  using MT = typename details::MPTypeTrait<T>::Type;
   using LoadT = platform::AlignedVector<T, VecSize>;
   using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
@@ -94,7 +99,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
   curand_init(seed, idx, increment, &state);
 #endif
-  T factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
   for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
     LoadT src_val;
     platform::Load<T, VecSize>(&src[i], &src_val);
@@ -114,7 +119,9 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
         dst_val[j] = 0;
         mask_val[j] = 0;
       } else {
-        dst_val[j] = is_upscale_in_train ? src_val[j] * factor : src_val[j];
+        dst_val[j] = is_upscale_in_train
+                         ? static_cast<T>(static_cast<MT>(src_val[j]) * factor)
+                         : src_val[j];
         mask_val[j] = 1;
       }
     }
@@ -126,21 +133,26 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
 template <typename T, typename MaskType>
 struct CudaDropoutGradFunctor {
-  explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {}
+  using MT = typename details::MPTypeTrait<T>::Type;
+  explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
   __device__ __forceinline__ T operator()(const T dout,
                                           const MaskType mask) const {
-    return dout * static_cast<T>(mask) * factor_;
+    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
+                          factor_);
   }
  private:
-  T factor_;
+  MT factor_;
 };
 template <typename T, typename MaskType, int VecSize>
-__global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
-                                      const T factor, const int64_t size,
-                                      T* dx) {
+__global__ void DropoutGradCUDAKernel(
+    const T* dout, const MaskType* mask,
+    const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
+    T* dx) {
+  using MT = typename details::MPTypeTrait<T>::Type;
   using LoadT = platform::AlignedVector<T, VecSize>;
   using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
@@ -156,7 +168,8 @@ __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
 #pragma unroll
     for (int j = 0; j < VecSize; j++) {
-      dx_val[j] = dout_val[j] * static_cast<T>(mask_val[j]) * factor;
+      dx_val[j] = static_cast<T>(static_cast<MT>(dout_val[j]) *
+                                 static_cast<MT>(mask_val[j]) * factor);
     }
     platform::Store<T, VecSize>(dx_val, &dx[i]);
@@ -257,6 +270,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                 float dropout_prob, const Tensor& grad_y,
                                 const Tensor& mask, int64_t size,
                                 Tensor* grad_x, bool is_test = false) {
+  using MT = typename details::MPTypeTrait<T>::Type;
   auto dX = EigenVector<T>::Flatten(*grad_x);
   auto dY = EigenVector<T>::Flatten(grad_y);
@@ -273,7 +287,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     if (dropout_prob == 1.0f) {
       dX.device(place) = static_cast<T>(0) * dY;
     } else {
-      auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+      auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
       auto stream = dev_ctx.stream();
       std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
       std::vector<framework::Tensor*> outs = {grad_x};
......
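For reference, a small host-side illustration (not Paddle code) of what the 1.0f / (1.0f - dropout_prob) factor in these kernels does when is_upscale_in_train is set: surviving elements are scaled by 1/(1-p) so the expected value of the output matches the input, which is why inference can skip rescaling.

// Host-only sketch of the upscale-in-train convention used by these kernels.
#include <cstdio>

int main() {
  const float x = 2.0f;
  const float dropout_prob = 0.5f;
  // Same factor the kernels compute (in the promoted type on the GPU side).
  const float factor = 1.0f / (1.0f - dropout_prob);
  const float kept = x * factor;  // element survives the mask: 4.0
  const float dropped = 0.0f;     // element is masked out
  // Averaging over the Bernoulli mask recovers the original value:
  // (1 - p) * kept + p * dropped == x.
  std::printf("kept=%.1f dropped=%.1f expectation=%.1f\n", kept, dropped,
              (1.0f - dropout_prob) * kept + dropout_prob * dropped);
  return 0;
}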