BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit bbbd75e4 (unverified)
Authored by zhouweiwei2014 on Jan 18, 2022; committed via GitHub on Jan 18, 2022
change CUDA implementation of uniform/gaussian OP (#38611)

* change CUDA implementation of uniform/gaussian OP
* fix unittest
Parent: a998c077
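The commit gates the new cuRAND/hipRAND Philox path behind the FLAGS_use_curand flag it adds. A minimal sketch of exercising the new path from Python, assuming a CUDA build of Paddle (paddle.set_flags is the usual interface for exported flags, and seed 2021 matches the new unit tests):

import paddle

# A sketch, not part of the commit: opt in to the new cuRAND-based
# kernels (the flag defaults to False) and draw seeded GPU samples.
paddle.set_flags({'FLAGS_use_curand': True})
paddle.set_device('gpu')
paddle.seed(2021)

u = paddle.rand([4])   # handled by GPUUniformRandomKernel below
g = paddle.randn([4])  # handled by GPUGaussianRandomKernel below
print(u.numpy(), g.numpy())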
Showing 10 changed files with 182 additions and 20 deletions (+182, -20)
paddle/fluid/operators/distribution_helper.h                       +42   -2
paddle/fluid/operators/gaussian_random_op.cu                       +21   -7
paddle/fluid/operators/uniform_random_op.cu                        +23   -8
paddle/fluid/platform/flags.cc                                      +2   -0
paddle/scripts/paddle_build.bat                                     +1   -0
paddle/scripts/paddle_build.sh                                      +2   -0
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py    +1   -1
python/paddle/fluid/tests/unittests/test_adamw_op.py                +2   -2
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py     +44   -0
python/paddle/fluid/tests/unittests/test_uniform_random_op.py      +44   -0
paddle/fluid/operators/distribution_helper.h

@@ -21,17 +21,26 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
+#if !defined(_WIN32)
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#else
+// there is no equivalent intrinsics in msvc.
+#define UNLIKELY(condition) (condition)
+#endif
+
 namespace paddle {
 namespace distribution {
 
 using Tensor = framework::Tensor;
 
 /********************* Transformation Function **********************/
 template <typename T>
 struct exponential_transform {
   explicit exponential_transform(T lambda) : lambda_(lambda) {}
@@ -52,7 +61,37 @@ struct exponential_transform {
   T lambda_;
 };
 
+template <typename T>
+struct uniform_transform {
+  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
+
+  HOSTDEVICE inline T operator()(T val) const {
+    if (UNLIKELY(val == static_cast<T>(1.0))) {
+      return min_;
+    } else {
+      return val * range_ + min_;
+    }
+  }
+
+ private:
+  T range_;
+  T min_;
+};
+
+template <typename T>
+struct normal_transform {
+  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
+
+  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
+
+ private:
+  T mean_;
+  T std_;
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__)
 
 /*********************** Distribution Function *************************/
 template <typename T>
 struct uniform_distribution;
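The two new functors are simple affine maps over a raw sample. A Python sketch of the same logic (an illustration, not part of the commit), assuming the raw uniform sample lies in (0, 1] as cuRAND's uniform generators produce:

def uniform_transform(val, lo, hi):
    # cuRAND uniform output lies in (0, 1]; sending val == 1.0 to lo
    # keeps the transformed sample inside the half-open range [lo, hi).
    if val == 1.0:
        return lo
    return val * (hi - lo) + lo

def normal_transform(val, mean, std):
    # scale and shift a standard-normal sample
    return val * std + mean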
@@ -132,6 +171,7 @@ struct normal_distribution<double> {
 };
 #endif
 
 /******** Launch GPU function of distribution and transformation *********/
 template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                    DistOp dist, TransformOp trans,
@@ -151,8 +191,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
   for (size_t j = 0; j < returns_count; j++) {
     size_t index = i + j * total_thread;
     if (index < size) {
-      auto random = static_cast<T>((&random_tuple.x)[j]);
-      out_data[index] = trans(random);
+      auto random = (&random_tuple.x)[j];
+      out_data[index] = static_cast<T>(trans(random));
     }
   }
 }
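This kernel change reorders the narrowing cast: the transform now runs in the generator's native (wider) type and only the final value is cast to T. A NumPy sketch of one consequence for low-precision outputs (illustrative values, not from the commit):

import numpy as np

# Raw Philox sample in float32, strictly less than 1.0.
val = np.float32(0.99998)

# Old order: cast to float16 first -- the sample collapses to exactly 1.0,
# so a transform onto [lo, hi) could return hi, outside the promised range.
print(np.float16(val) == np.float16(1.0))  # True

# New order: transform in float32 (where val != 1.0) and cast the result
# last, so the [lo, hi) guarantee of uniform_transform is preserved.
print(val == np.float32(1.0))  # False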
paddle/fluid/operators/gaussian_random_op.cu

@@ -19,8 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 
+DECLARE_bool(use_curand);
 
 namespace paddle {
 namespace operators {
@@ -65,7 +68,10 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     thrust::counting_iterator<int64_t> index_sequence_begin(0);
     auto shape = GetShape(context);
     tensor->Resize(shape);
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     int64_t size = tensor->numel();
@@ -73,12 +79,20 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::normal_distribution<MT> dist;
+        distribution::normal_transform<MT> trans(mean, std);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(data),
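In the new branch, MPTypeTrait picks a wider compute type for low-precision tensors (float for float16), so sampling and the normal transform happen in full precision before the result narrows to T. A rough Python analogue (the function name and dtype mapping are illustrative assumptions):

import numpy as np

def gaussian_fill(shape, mean, std, out_dtype=np.float16, seed=2021):
    # Analogue of MPTypeTrait<T>::Type: compute in float32 for float16 outputs.
    mt = np.float32 if out_dtype == np.float16 else out_dtype
    rng = np.random.default_rng(seed)
    samples = rng.standard_normal(shape).astype(mt)         # normal_distribution<MT>
    return (samples * mt(std) + mt(mean)).astype(out_dtype) # normal_transform<MT>, cast to T

x = gaussian_fill((2, 3), mean=0.0, std=1.0)
print(x.dtype, x)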
paddle/fluid/operators/uniform_random_op.cu

@@ -18,7 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
 
+DECLARE_bool(use_curand);
 
 namespace paddle {
 namespace operators {
@@ -123,7 +128,9 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
           "unsupport type: %s.", framework::ToTypeName(out_var->Type())));
     }
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     bool seed_flag = false;
     if (seed == 0) {
@@ -144,13 +151,21 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     int device_id = context.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
-                                    diag_step, diag_val, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::uniform_distribution<MT> dist;
+        distribution::uniform_transform<MT> trans(min, max);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                      diag_step, diag_val, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
paddle/fluid/platform/flags.cc

@@ -545,6 +545,8 @@ PADDLE_DEFINE_EXPORTED_double(
  */
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 
+PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND");
+
 /**
  * Debug related FLAG
  * Name: FLAGS_call_stack_level
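Because the flag is defined with PADDLE_DEFINE_EXPORTED_bool, it can be toggled from outside the binary. A sketch, assuming the usual exported-flag interfaces (the CI scripts changed below use the equivalent shell form):

import os

# Setting the environment variable before paddle is imported lets the
# exported flag be picked up at startup.
os.environ['FLAGS_use_curand'] = 'True'

import paddle

# paddle.get_flags / paddle.set_flags also work for exported flags
# (that this particular flag is exposed through them is an assumption).
print(paddle.get_flags(['FLAGS_use_curand']))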
paddle/scripts/paddle_build.bat

@@ -662,6 +662,7 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
 
 set FLAGS_call_stack_level=2
+set FLAGS_use_curand=True
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
 dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
paddle/scripts/paddle_build.sh

@@ -61,6 +61,8 @@ function init() {
     # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default
     export FLAGS_call_stack_level=2
+    export FLAGS_use_curand=True
+
     # set CI_SKIP_CPP_TEST if only *.py changed
     # In order to avoid using in some CI(such as daily performance), the current
     # branch must not be `${BRANCH}` which is usually develop.
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py

@@ -274,7 +274,7 @@ class TestDistTraning(unittest.TestCase):
         seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
-        seed = 1025
+        seed = 100
 
         set_random_seed(seed)
         rank_id = dist.get_rank()
python/paddle/fluid/tests/unittests/test_adamw_op.py

@@ -333,7 +333,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             lr_ratio=simple_lr_fun)
 
-        loss_ref = np.array(
-            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
+        loss_ref = np.array(
+            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)

@@ -379,7 +379,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
         exe.run(startup)
-        loss_ref = np.array(
-            [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626])
+        loss_ref = np.array(
+            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py

@@ -287,5 +287,49 @@ class TestStandardNormalDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The Number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPU have different number of threads, which result in different
+        # random value. Only test on V100 GPU here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.randn([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertTrue(np.mean(actual), expect_mean)
+            self.assertTrue(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+        expect = [
+            -0.79037829, -0.54411126, -0.32266671, 0.35791815, 1.44169267,
+            -0.87785644, -1.23909874, -2.18194139, 0.49489656, 0.40703062
+        ]
+        expect_mean = -0.0000053026194133403266873214888799115129813799285329878330230713
+        expect_std = 0.99999191058126390974081232343451119959354400634765625
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            -0.7988942, 1.8644791, 0.02782744, 1.3692524, 0.6419724,
+            0.12436751, 0.12058455, -1.9984808, 1.5635862, 0.18506318
+        ]
+        expect_mean = -0.00004762359094456769526004791259765625
+        expect_std = 0.999975681304931640625
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
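Note that the mean/std checks above call assertTrue(np.mean(actual), expect_mean), which treats the second argument as the failure message and never compares the two values; the uniform test below uses assertEqual instead. A stricter form of those two checks (a suggestion, not part of the commit) would be:

# Suggested tightening of the mean/std assertions above (not in the commit):
self.assertTrue(np.allclose(np.mean(actual), expect_mean))
self.assertTrue(np.allclose(np.std(actual), expect_std))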
python/paddle/fluid/tests/unittests/test_uniform_random_op.py

@@ -562,5 +562,49 @@ class TestUniformDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The Number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPU have different number of threads, which result in different
+        # random value. Only test on V100 GPU here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.rand([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertEqual(np.mean(actual), expect_mean)
+            self.assertEqual(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+        expect = [
+            0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608,
+            0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187
+        ]
+        expect_mean = 0.50000454338820143895816272561205551028251647949218750
+        expect_std = 0.28867379167297479991560749112977646291255950927734375
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257,
+            0.46352342, 0.97228295, 0.12771158, 0.286525, 0.9810645
+        ]
+        expect_mean = 0.50002604722976684570312500
+        expect_std = 0.2886914908885955810546875
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()