From 07923ba006220bf39ebd9fcf19c6b930012e5139 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Mon, 12 Feb 2018 17:56:15 +0800
Subject: [PATCH] Memory/dropout4 (#8407)

* "merge random generator kernel and mul"

* "fix dropout"
---
 paddle/fluid/operators/dropout_op.cu | 43 +++++++++++++++--------------
 1 file changed, 22 insertions(+), 21 deletions(-)
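
The old path generated the mask with a thrust::transform over the
MaskGenerator functor and then applied it in a separate Eigen expression
(Y = X * M), i.e. two full passes over global memory; the fused
RandomGenerator kernel computes the mask and the product in one pass. A
minimal self-contained sketch of the fused pattern (FusedDropout and its
signature are illustrative only, not part of Paddle):

    #include <thrust/random.h>

    template <typename T>
    __global__ void FusedDropout(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
                                 T* mask, T* dst) {
      thrust::minstd_rand rng;
      rng.seed(seed);
      thrust::uniform_real_distribution<float> dist(0, 1);
      for (size_t idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n;
           idx += blockDim.x * gridDim.x) {
        // Jump to this element's slot in the stream; seeding alone would
        // make every thread draw the identical sequence.
        rng.discard(idx);
        const T keep = static_cast<T>(dist(rng) >= dropout_prob);
        mask[idx] = keep;
        dst[idx] = keep * src[idx];
      }
    }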

diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 4ae9f4ce54d..a4a96d48f99 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -23,24 +23,24 @@ namespace paddle {
 namespace operators {
 
 template <typename T, typename AttrType>
-struct MaskGenerator {
-  AttrType dropout_prob;
-  int seed;
+__global__ void RandomGenerator(const size_t n, const int seed,
+                                const AttrType dropout_prob, const T* src,
+                                T* mask_data, T* dst) {
+  thrust::minstd_rand rng;
+  rng.seed(seed);
+  thrust::uniform_real_distribution<AttrType> dist(0, 1);
 
-  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
-      : dropout_prob(dropout_prob), seed(seed) {}
-
-  inline __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed);
-    thrust::uniform_real_distribution<AttrType> dist(0, 1);
-    rng.discard(n);
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < n; idx += blockDim.x * gridDim.x) {
+    rng.discard(idx);  // without this every thread draws the same sequence
     if (dist(rng) < dropout_prob) {
-      return static_cast<T>(0);
+      mask_data[idx] = static_cast<T>(0);
+    } else {
+      mask_data[idx] = static_cast<T>(1);
     }
-    return static_cast<T>(1);
+    dst[idx] = mask_data[idx] * src[idx];
   }
-};
+}
 
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
@@ -61,18 +61,19 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
-      int size = framework::product(mask->dims());
+      size_t size = framework::product(mask->dims());
+      auto* x_data = x->data<T>();
+      auto* y_data = y->mutable_data<T>(context.GetPlace());
 
       std::random_device rnd;
       int seed =
           context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
 
-      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<T>(mask_data),
-                        MaskGenerator<T, AttrType>(dropout_prob, seed));
-      auto M = EigenMatrix<T>::Reshape(*mask, 1);
-      Y.device(place) = X * M;
+      int threads = 512;
+      int grid = (x->numel() + threads - 1) / threads;
+      RandomGenerator<T, AttrType><<<grid, threads, 0,
+                                     context.cuda_device_context().stream()>>>(
+          size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
       Y.device(place) = X * (1.0f - dropout_prob);
     }
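
For reference, a hypothetical standalone driver for the sketch above; the
operator itself launches on context.cuda_device_context().stream() and takes
the seed from Attr<int>("seed") when fix_seed is set, so tests can be made
deterministic:

    #include <random>

    void LaunchFusedDropout(const float* d_src, float* d_mask, float* d_dst,
                            size_t n, float dropout_prob,
                            cudaStream_t stream) {
      std::random_device rnd;
      const int seed = static_cast<int>(rnd());
      const int threads = 512;
      // One thread per element; the grid-stride loop in the kernel also
      // tolerates a smaller grid.
      const int grid = static_cast<int>((n + threads - 1) / threads);
      FusedDropout<float><<<grid, threads, 0, stream>>>(
          n, seed, dropout_prob, d_src, d_mask, d_dst);
    }

In the is_test branch no mask is produced; the output is scaled by
(1.0f - dropout_prob) so the expected activation matches training.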
-- 
GitLab