Commit b9675acc (unverified signature)
Repository: 机器未来 / Paddle (fork of PaddlePaddle/Paddle)
Authored Feb 23, 2022 by zhouweiwei2014; committed via GitHub on Feb 23, 2022
Parent commit: 69a04209

change CUDA implementation of bernoulli OP (#39732)

* change CUDA implementation of bernoulli OP
* fix CI
Showing 8 changed files with 135 additions and 32 deletions (+135 −32)
paddle/fluid/operators/distribution_helper.h                      +5   −4
paddle/phi/backends/gpu/gpu_launch_config.h                       +1   −0
paddle/phi/kernels/gpu/bernoulli_kernel.cu                        +68  −14
python/paddle/fluid/tests/unittests/test_bernoulli_op.py          +39  −0
python/paddle/fluid/tests/unittests/test_exponential_op.py        +6   −5
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py    +5   −4
python/paddle/fluid/tests/unittests/test_poisson_op.py            +6   −1
python/paddle/fluid/tests/unittests/test_uniform_random_op.py     +5   −4
paddle/fluid/operators/distribution_helper.h

@@ -180,8 +180,8 @@ struct normal_distribution<double> {

 /******** Launch GPU function of distribution and transformation *********/
 template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
-                                   DistOp dist, TransformOp trans,
-                                   T *out_data) {
+                                   DistOp dist, TransformOp trans, T *out_data,
+                                   size_t stride) {
   size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
   static constexpr int kCount = DistOp::kReturnsCount;
 #if defined(__NVCC__)

@@ -201,7 +201,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
     kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(&result[0], &args[0],
                                                            trans);
     kps::WriteData<T, T, kCount, 1, 1, true>(out_data + i, &result[0], size - i,
-                                             1, total_thread, 1);
+                                             1, stride, 1);
     __syncthreads();
   }
 }

@@ -234,7 +235,7 @@ void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx,
   DistributionKernel<
       T, DistOp, TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-      size, seed, offset, dist, trans, out_data);
+      size, seed, offset, dist, trans, out_data, total_thread);
 }

 #endif
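The change above is mechanical: DistributionKernel gains an explicit stride parameter that the host now passes as total_thread, instead of the kernel deriving the write stride itself. A minimal sketch of that calling convention with a plain grid-stride loop, independent of Paddle's kps primitives (the kernel name scale_kernel and the scaling operation are illustrative assumptions, not part of the commit):

#include <cstdio>
#include <cuda_runtime.h>

// Each thread processes elements idx, idx + stride, idx + 2*stride, ...
// The stride (total number of launched threads) is computed on the host and
// passed in, mirroring the new DistributionKernel signature.
__global__ void scale_kernel(const float* in, float* out, size_t n,
                             float factor, size_t stride) {
  size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  for (size_t i = idx; i < n; i += stride) {
    out[i] = in[i] * factor;
  }
}

int main() {
  const size_t n = 1 << 20;
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (size_t i = 0; i < n; ++i) in[i] = 1.0f;

  const int block_size = 256;
  const int grid_size = 64;
  const size_t total_thread = static_cast<size_t>(grid_size) * block_size;
  scale_kernel<<<grid_size, block_size>>>(in, out, n, 2.0f, total_thread);
  cudaDeviceSynchronize();

  printf("out[0] = %f, out[n-1] = %f\n", out[0], out[n - 1]);
  cudaFree(in);
  cudaFree(out);
  return 0;
}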
paddle/phi/backends/gpu/gpu_launch_config.h

@@ -29,6 +29,7 @@
 #include <string>
 #include <vector>

 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/enforce.h"

 #ifdef __HIPCC__
 // HIP results in error or nan if > 256
paddle/phi/kernels/gpu/bernoulli_kernel.cu

@@ -12,19 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include <thrust/execution_policy.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#ifdef __NVCC__
+#include <curand_kernel.h>
+#endif
+#ifdef __HIPCC__
+#include <hiprand_kernel.h>
+#endif
 #include <algorithm>
 #include <vector>

 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bernoulli_kernel.h"

 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/platform/transform.h"

+DECLARE_bool(use_curand);
+
 namespace phi {

 template <typename T>

@@ -49,15 +60,57 @@ struct BernoulliCudaFunctor {
   }
 };

+// 'curand_uniform4/hiprand_uniform4' generate 4 random numbers each time
+template <typename T>
+__global__ void bernoulli_cuda_kernel(size_t size,
+                                      uint64_t seed,
+                                      uint64_t offset,
+                                      const T* x_data,
+                                      T* out_data) {
+  size_t thread_idx =
+      static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+#if defined(__NVCC__)
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, thread_idx, offset, &state);
+#else
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, thread_idx, offset, &state);
+#endif
+
+  size_t total_thread = gridDim.x * blockDim.x;
+  for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) {
+    paddle::distribution::uniform_distribution<float> dist;
+    float4 rand = dist(&state);
+#pragma unroll
+    for (size_t j = 0; j < 4; j++) {
+      size_t idx = i + j;
+      if (idx < size) {
+        out_data[idx] = static_cast<T>((&rand.x)[j] <= x_data[idx]);
+      }
+    }
+  }
+}
+
 template <typename T, typename Context>
 void BernoulliKernel(const Context& ctx,
                      const DenseTensor& x,
                      DenseTensor* out) {
-  auto numel = x.numel();
-  auto* x_data = x.data<T>();
+  const T* x_data = x.data<T>();
   T* out_data = ctx.template Alloc<T>(out);
+  auto numel = x.numel();
+
   auto gen_cuda = ctx.GetGenerator();
+
+  if (FLAGS_use_curand) {
+    auto seed_offset = gen_cuda->IncrementOffset(12);
+    uint64_t seed = seed_offset.first;
+    uint64_t offset = seed_offset.second;
+
+    auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4);
+    size_t grid_size = gpu_config.GetGridSize();
+    size_t block_size = gpu_config.GetBlockSize();
+
+    bernoulli_cuda_kernel<<<grid_size, block_size, 0, ctx.stream()>>>(
+        numel, seed, offset, x_data, out_data);
+  } else {
+    auto seed_offset = gen_cuda->IncrementOffset(1);
+    int64_t gen_offset = numel * seed_offset.second;
+    paddle::platform::Transform<phi::GPUContext> trans;

@@ -69,6 +122,7 @@ void BernoulliKernel(const Context& ctx,
           out_data,
           BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
                                   static_cast<int64_t>(gen_offset)));
+  }
 }

 }  // namespace phi
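When FLAGS_use_curand is on, the new bernoulli_cuda_kernel above samples with a counter-based Philox4_32_10 generator: each thread seeds its own state from (seed, thread index, offset) and draws four uniforms per loop iteration via curand_uniform4/hiprand_uniform4, then compares them against the per-element probabilities in x_data. A self-contained sketch of the same sampling pattern outside Paddle (the launch sizes and the constant probability 0.3 used to fill the input are illustrative assumptions):

#include <cstdio>
#include <cuda_runtime.h>
#include <curand_kernel.h>

// Each thread owns one Philox4_32_10 state; curand_uniform4 returns four
// uniforms in (0, 1] per call, which are compared against the per-element
// success probabilities, the same pattern bernoulli_cuda_kernel uses.
__global__ void bernoulli_sketch(size_t size, uint64_t seed, uint64_t offset,
                                 const float* probs, float* out) {
  size_t thread_idx =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  size_t total_thread = static_cast<size_t>(gridDim.x) * blockDim.x;

  curandStatePhilox4_32_10_t state;
  curand_init(seed, thread_idx, offset, &state);

  for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) {
    float4 rand = curand_uniform4(&state);
#pragma unroll
    for (int j = 0; j < 4; ++j) {
      size_t idx = i + j;
      if (idx < size) {
        out[idx] = ((&rand.x)[j] <= probs[idx]) ? 1.0f : 0.0f;
      }
    }
  }
}

int main() {
  const size_t n = 1 << 16;
  float *probs = nullptr, *out = nullptr;
  cudaMallocManaged(&probs, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (size_t i = 0; i < n; ++i) probs[i] = 0.3f;  // illustrative probability

  bernoulli_sketch<<<64, 256>>>(n, /*seed=*/100, /*offset=*/0, probs, out);
  cudaDeviceSynchronize();

  size_t ones = 0;
  for (size_t i = 0; i < n; ++i) ones += (out[i] > 0.5f) ? 1 : 0;
  printf("empirical p = %.4f (expected ~0.3)\n", (double)ones / (double)n);
  cudaFree(probs);
  cudaFree(out);
  return 0;
}

Because Philox is counter-based, re-running with the same seed, subsequence, and offset reproduces the same bits regardless of scheduling; that property is what lets the new tests below assert exact sums and slices of the generated output.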
python/paddle/fluid/tests/unittests/test_bernoulli_op.py

@@ -18,6 +18,7 @@ import unittest
 import paddle
 from op_test import OpTest
 import numpy as np
+import os


 def output_hist(out):

@@ -68,5 +69,43 @@ class TestBernoulliApi(unittest.TestCase):
                 hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))


+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
+        print("Test Fixed Random number on GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(100)
+        np.random.seed(100)
+
+        x_np = np.random.rand(32, 1024, 1024)
+
+        x = paddle.to_tensor(x_np, dtype='float64')
+        y = paddle.bernoulli(x).numpy()
+        index0, index1, index2 = np.nonzero(y)
+        self.assertEqual(np.sum(index0), 260028995)
+        self.assertEqual(np.sum(index1), 8582429431)
+        self.assertEqual(np.sum(index2), 8581445798)
+        expect = [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]
+        self.assertTrue(np.array_equal(y[16, 500, 500:510], expect))
+
+        x = paddle.to_tensor(x_np, dtype='float32')
+        y = paddle.bernoulli(x).numpy()
+        index0, index1, index2 = np.nonzero(y)
+        self.assertEqual(np.sum(index0), 260092343)
+        self.assertEqual(np.sum(index1), 8583509076)
+        self.assertEqual(np.sum(index2), 8582778540)
+        expect = [0., 0., 1., 1., 1., 1., 0., 1., 1., 1.]
+        self.assertTrue(np.array_equal(y[16, 500, 500:510], expect))
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_exponential_op.py

@@ -16,6 +16,7 @@ import unittest
 import paddle
 import numpy as np
 from op_test import OpTest
+import os

 paddle.enable_static()
 paddle.seed(100)

@@ -90,18 +91,18 @@ class TestExponentialAPI(unittest.TestCase):
         self.assertTrue(np.min(x.numpy()) >= 0)
         paddle.enable_static()

     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
             return

-        # Note(zhouwei): The Number of threads is determined by
-        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
-        # GPU have different number of threads, which result in different
-        # random value. Only test on V100 GPU here.
+        # Different GPUs generate different random values. Only test V100 here.
         if not "V100" in paddle.device.cuda.get_device_name():
             return

+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
         print("Test Fixed Random number on V100 GPU------>")
         paddle.disable_static()
         paddle.set_device('gpu')
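The Note(zhouwei) comment removed in this hunk explains why the exact-value checks run only on V100: the number of launched threads comes from multiProcessorCount * maxThreadsPerMultiProcessor, so the mapping from threads to Philox subsequences, and therefore the fixed random values, differs between GPU models. A small standalone sketch of how those two device properties can be queried (not Paddle code):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);

  // These two fields bound how many threads are resident at full occupancy,
  // and hence how many Philox subsequences a saturating launch will touch.
  int resident_threads =
      prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;
  printf("%s: %d SMs x %d threads/SM = %d resident threads\n", prop.name,
         prop.multiProcessorCount, prop.maxThreadsPerMultiProcessor,
         resident_threads);
  return 0;
}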
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py

@@ -14,6 +14,7 @@
 from __future__ import print_function

+import os
 import unittest
 import numpy as np
 import paddle

@@ -293,13 +294,13 @@ class TestRandomValue(unittest.TestCase):
         if not paddle.is_compiled_with_cuda():
             return

-        # Note(zhouwei): The Number of threads is determined by
-        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
-        # GPU have different number of threads, which result in different
-        # random value. Only test on V100 GPU here.
+        # Different GPUs generate different random values. Only test V100 here.
         if not "V100" in paddle.device.cuda.get_device_name():
             return

+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
         def _check_random_value(dtype, expect, expect_mean, expect_std):
             x = paddle.randn([32, 3, 1024, 1024], dtype=dtype)
             actual = x.numpy()
python/paddle/fluid/tests/unittests/test_poisson_op.py

@@ -17,6 +17,7 @@ import paddle
 import numpy as np
 from op_test import OpTest
 import math
+import os

 paddle.enable_static()
 paddle.seed(100)

@@ -101,11 +102,15 @@ class TestPoissonAPI(unittest.TestCase):
         self.assertTrue(np.min(y.numpy()) >= 0)
         paddle.enable_static()

     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
             return

+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
         print("Test Fixed Random number on GPU------>")
         paddle.disable_static()
         paddle.set_device('gpu')
         paddle.seed(2021)
python/paddle/fluid/tests/unittests/test_uniform_random_op.py

@@ -15,6 +15,7 @@
 from __future__ import print_function

 import sys
+import os
 import subprocess
 import unittest
 import numpy as np

@@ -568,13 +569,13 @@ class TestRandomValue(unittest.TestCase):
         if not paddle.is_compiled_with_cuda():
             return

-        # Note(zhouwei): The Number of threads is determined by
-        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
-        # GPU have different number of threads, which result in different
-        # random value. Only test on V100 GPU here.
+        # Different GPUs generate different random values. Only test V100 here.
         if not "V100" in paddle.device.cuda.get_device_name():
             return

+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
         def _check_random_value(dtype, expect, expect_mean, expect_std):
             x = paddle.rand([32, 3, 1024, 1024], dtype=dtype)
             actual = x.numpy()