Memory/dropout4 (#8407)

* "merge random generator kernel and mul" * "fix dropout"

Memory/dropout4 (#8407)
* "merge random generator kernel and mul" * "fix dropout"
07923ba0 · dzhwinter · GitHub · 91aac572 · 07923ba0
隐藏空白更改
内联并排

Showing with 21 addition and 21 deletion

paddle/fluid/operators/dropout_op.cu paddle/fluid/operators/dropout_op.cu +21 -21

未找到文件。
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -23,24 +23,23 @@ namespace paddle {
 namespace operators {

 template <typename T, typename AttrType>
-struct MaskGenerator {
-  AttrType dropout_prob;
-  int seed;
+__global__ void RandomGenerator(const size_t n, const int seed,
+                                const AttrType dropout_prob, const T* src,
+                                T* mask_data, T* dst) {
+  thrust::minstd_rand rng;
+  rng.seed(seed);
+  thrust::uniform_real_distribution<AttrType> dist(0, 1);

-  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
-      : dropout_prob(dropout_prob), seed(seed) {}
-
-  inline __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed);
-    thrust::uniform_real_distribution<AttrType> dist(0, 1);
-    rng.discard(n);
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < n; idx += blockDim.x * gridDim.x) {
    if (dist(rng) < dropout_prob) {
-      return static_cast<T>(0);
+      mask_data[idx] = static_cast<T>(0);
+    } else {
+      mask_data[idx] = static_cast<T>(1);
    }
-    return static_cast<T>(1);
+    dst[idx] = mask_data[idx] * src[idx];
  }
-};
+}

 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
@@ -61,18 +60,19 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
    if (!context.Attr<bool>("is_test")) {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
-      int size = framework::product(mask->dims());
+      size_t size = framework::product(mask->dims());
+      auto* x_data = x->data<T>();
+      auto* y_data = y->mutable_data<T>(context.GetPlace());

      std::random_device rnd;
      int seed =
          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();

-      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<T>(mask_data),
-                        MaskGenerator<T, AttrType>(dropout_prob, seed));
-      auto M = EigenMatrix<T>::Reshape(*mask, 1);
-      Y.device(place) = X * M;
+      int threads = 512;
+      int grid = (x->numel() + threads - 1) / threads;
+      RandomGenerator<T, AttrType><<<grid, threads, 0,
+                                     context.cuda_device_context().stream()>>>(
+          size, seed, dropout_prob, x_data, mask_data, y_data);
    } else {
      Y.device(place) = X * (1.0f - dropout_prob);
    }