diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h
index 695cb6e0ef2de07f4c8bb6682c1a74a72f7e00fc..8bb963979e5a71f7f3a46fbdcc0614582fb43746 100644
--- a/paddle/fluid/operators/distribution_helper.h
+++ b/paddle/fluid/operators/distribution_helper.h
@@ -21,17 +21,26 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
+#if !defined(_WIN32)
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#else
+// there is no equivalent intrinsic in msvc.
+#define UNLIKELY(condition) (condition)
+#endif
+
 namespace paddle {
 namespace distribution {
 
 using Tensor = framework::Tensor;
 
+/********************* Transformation Function **********************/
 template <typename T>
 struct exponential_transform {
   explicit exponential_transform(T lambda) : lambda_(lambda) {}
@@ -52,7 +61,37 @@ struct exponential_transform {
   T lambda_;
 };
 
+template <typename T>
+struct uniform_transform {
+  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
+
+  HOSTDEVICE inline T operator()(T val) const {
+    if (UNLIKELY(val == static_cast<T>(1.0))) {
+      return min_;
+    } else {
+      return val * range_ + min_;
+    }
+  }
+
+ private:
+  T range_;
+  T min_;
+};
+
+template <typename T>
+struct normal_transform {
+  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
+
+  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
+
+ private:
+  T mean_;
+  T std_;
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__)
+
+/*********************** Distribution Function *************************/
 template <typename T>
 struct uniform_distribution;
@@ -132,6 +171,7 @@ struct normal_distribution {
 };
 #endif
 
+/******** Launch GPU function of distribution and transformation *********/
 template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                    DistOp dist, TransformOp trans,
@@ -151,8 +191,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
     for (size_t j = 0; j < returns_count; j++) {
       size_t index = i + j * total_thread;
       if (index < size) {
-        auto random = static_cast<T>((&random_tuple.x)[j]);
-        out_data[index] = trans(random);
+        auto random = (&random_tuple.x)[j];
+        out_data[index] = static_cast<T>(trans(random));
       }
     }
   }
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index e43ffdae903f595a227f2b689e180a692e6f8d0f..fa9fe9d8602012f71ca6829e58561d03b7bfb2f1 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -19,8 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 
+DECLARE_bool(use_curand);
+
 namespace paddle {
 namespace operators {
 
@@ -65,7 +68,10 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     auto shape = GetShape(context);
     tensor->Resize(shape);
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
 
     int64_t size = tensor->numel();
 
@@ -73,12 +79,20 @@
 
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::normal_distribution<MT> dist;
+        distribution::normal_transform<MT> trans(mean, std);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(data),
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 440c9b786b69c98bda9457977967f9b4cc5ff130..63eecd15c2d69bab3a4e8230f6fa947e3662f22d 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -18,7 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
+
+DECLARE_bool(use_curand);
+
 namespace paddle {
 namespace operators {
 
@@ -123,7 +128,9 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
                             "unsupport type: %s.",
                             framework::ToTypeName(out_var->Type())));
     }
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     bool seed_flag = false;
     if (seed == 0) {
@@ -144,13 +151,21 @@
     int device_id = context.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
-                                    diag_step, diag_val, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::uniform_distribution<MT> dist;
+        distribution::uniform_transform<MT> trans(min, max);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                      diag_step, diag_val, gen_offset));
+      }
     } else {
       thrust::transform(
           index_sequence_begin, index_sequence_begin + size,
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 44bd4eaa29b807214912f45401b962d089d02348..4a6bfe67ba59729a199a4d756f798e28c7388892 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -545,6 +545,8 @@ PADDLE_DEFINE_EXPORTED_double(
  */
 PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 
+PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND");
+
 /**
  * Debug related FLAG
  * Name: FLAGS_call_stack_level
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 343ab8ff9f5b728831557e1345d4955d6a1ade2f..711b8811b973c7172af5733c70efd46cd6f25e77 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -662,6 +662,7 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
 
 set FLAGS_call_stack_level=2
+set FLAGS_use_curand=True
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
 dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index afa0011858987b17f6b01563d315069d9cf7c6d3..5cd416e6d933a7424f67b3f94b49f488b6171e45 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -61,6 +61,8 @@ function init() {
     # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default
     export FLAGS_call_stack_level=2
 
+    export FLAGS_use_curand=True
+
     # set CI_SKIP_CPP_TEST if only *.py changed
     # In order to avoid using in some CI(such as daily performance), the current
     # branch must not be `${BRANCH}` which is usually develop.
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
index 317eb14ad069e2ad76bce3a3de1f8f41f129697f..9ae9c14db3fcfd10c41ccf13d2dca4e7835ea244 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
@@ -274,7 +274,7 @@ class TestDistTraning(unittest.TestCase):
         seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
-        seed = 1025
+        seed = 100
         set_random_seed(seed)
 
         rank_id = dist.get_rank()
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index dbeb5a430377f7199ca30220114676651cf530a2..d5fa944802a4763c9686811fdb85cb0be1586f59 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -333,7 +333,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             lr_ratio=simple_lr_fun)
 
         loss_ref = np.array(
-            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
+            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)
@@ -379,7 +379,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
         exe.run(startup)
 
         loss_ref = np.array(
-            [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626])
+            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 121dcbb3cdc12a3554939896997614b31cfc8a92..70ab1cc523507edcdfb361beaaa7b44742ba10cd 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -287,5 +287,49 @@ class TestStandardNormalDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPUs have a different number of threads, which results in different
+        # random values. Only test on V100 GPU here.
+ if not "V100" in paddle.device.cuda.get_device_name(): + return + + def _check_random_value(dtype, expect, expect_mean, expect_std): + x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) + actual = x.numpy() + self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) + self.assertTrue(np.mean(actual), expect_mean) + self.assertTrue(np.std(actual), expect_std) + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + expect = [ + -0.79037829, -0.54411126, -0.32266671, 0.35791815, 1.44169267, + -0.87785644, -1.23909874, -2.18194139, 0.49489656, 0.40703062 + ] + expect_mean = -0.0000053026194133403266873214888799115129813799285329878330230713 + expect_std = 0.99999191058126390974081232343451119959354400634765625 + _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, + expect_std) + + expect = [ + -0.7988942, 1.8644791, 0.02782744, 1.3692524, 0.6419724, 0.12436751, + 0.12058455, -1.9984808, 1.5635862, 0.18506318 + ] + expect_mean = -0.00004762359094456769526004791259765625 + expect_std = 0.999975681304931640625 + _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, + expect_std) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 6de36c02bee05b35413b465987459187f641bb84..a84c3b20da26c9f7ab8792caae038e1c432c6659 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -562,5 +562,49 @@ class TestUniformDtype(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Note(zhouwei): The Number of threads is determined by + # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different + # GPU have different number of threads, which result in different + # random value. Only test on V100 GPU here. + if not "V100" in paddle.device.cuda.get_device_name(): + return + + def _check_random_value(dtype, expect, expect_mean, expect_std): + x = paddle.rand([32, 3, 1024, 1024], dtype=dtype) + actual = x.numpy() + self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) + self.assertEqual(np.mean(actual), expect_mean) + self.assertEqual(np.std(actual), expect_std) + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + expect = [ + 0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608, + 0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187 + ] + expect_mean = 0.50000454338820143895816272561205551028251647949218750 + expect_std = 0.28867379167297479991560749112977646291255950927734375 + _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, + expect_std) + + expect = [ + 0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257, 0.46352342, + 0.97228295, 0.12771158, 0.286525, 0.9810645 + ] + expect_mean = 0.50002604722976684570312500 + expect_std = 0.2886914908885955810546875 + _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, + expect_std) + paddle.enable_static() + + if __name__ == "__main__": unittest.main()