diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h
index 695cb6e0ef2de07f4c8bb6682c1a74a72f7e00fc..8bb963979e5a71f7f3a46fbdcc0614582fb43746 100644
--- a/paddle/fluid/operators/distribution_helper.h
+++ b/paddle/fluid/operators/distribution_helper.h
@@ -21,17 +21,26 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
+#if !defined(_WIN32)
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#else
+// there is no equivalent intrinsic in msvc.
+#define UNLIKELY(condition) (condition)
+#endif
+
 namespace paddle {
 namespace distribution {
 
 using Tensor = framework::Tensor;
 
+/********************* Transformation Function **********************/
 template <typename T>
 struct exponential_transform {
   explicit exponential_transform(T lambda) : lambda_(lambda) {}
@@ -52,7 +61,37 @@ struct exponential_transform {
   T lambda_;
 };
 
+template <typename T>
+struct uniform_transform {
+  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
+
+  HOSTDEVICE inline T operator()(T val) const {
+    if (UNLIKELY(val == static_cast<T>(1.0))) {
+      return min_;
+    } else {
+      return val * range_ + min_;
+    }
+  }
+
+ private:
+  T range_;
+  T min_;
+};
+
+template <typename T>
+struct normal_transform {
+  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
+
+  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
+
+ private:
+  T mean_;
+  T std_;
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__)
+
+/*********************** Distribution Function *************************/
 template <typename T>
 struct uniform_distribution;
@@ -132,6 +171,7 @@ struct normal_distribution {
 };
 #endif
 
+/******** Launch GPU function of distribution and transformation *********/
 template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                    DistOp dist, TransformOp trans,
@@ -151,8 +191,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
     for (size_t j = 0; j < returns_count; j++) {
       size_t index = i + j * total_thread;
       if (index < size) {
-        auto random = static_cast<T>((&random_tuple.x)[j]);
-        out_data[index] = trans(random);
+        auto random = (&random_tuple.x)[j];
+        out_data[index] = static_cast<T>(trans(random));
       }
     }
   }
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index e43ffdae903f595a227f2b689e180a692e6f8d0f..fa9fe9d8602012f71ca6829e58561d03b7bfb2f1 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -19,8 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 
+DECLARE_bool(use_curand);
+
 namespace paddle {
 namespace operators {
 
@@ -65,7 +68,10 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     auto shape = GetShape(context);
     tensor->Resize(shape);
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
 
     int64_t size = tensor->numel();
 
@@ -73,12 +79,20 @@
 
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::normal_distribution<MT> dist;
+        distribution::normal_transform<MT> trans(mean, std);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(data),
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 440c9b786b69c98bda9457977967f9b4cc5ff130..63eecd15c2d69bab3a4e8230f6fa947e3662f22d 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -18,7 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
+
+DECLARE_bool(use_curand);
+
 namespace paddle {
 namespace operators {
 
@@ -123,7 +128,9 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
                             "unsupport type: %s.",
                             framework::ToTypeName(out_var->Type())));
     }
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     bool seed_flag = false;
     if (seed == 0) {
@@ -144,13 +151,21 @@
     int device_id = context.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
-                                    diag_step, diag_val, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::uniform_distribution<MT> dist;
+        distribution::uniform_transform<MT> trans(min, max);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                      diag_step, diag_val, gen_offset));
+      }
     } else {
       thrust::transform(
           index_sequence_begin, index_sequence_begin + size,
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 44bd4eaa29b807214912f45401b962d089d02348..4a6bfe67ba59729a199a4d756f798e28c7388892 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -545,6 +545,8 @@ PADDLE_DEFINE_EXPORTED_double(
  */
 PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 
+PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND");
+
 /**
  * Debug related FLAG
  * Name: FLAGS_call_stack_level
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 343ab8ff9f5b728831557e1345d4955d6a1ade2f..711b8811b973c7172af5733c70efd46cd6f25e77 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -662,6 +662,7 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
 
 set FLAGS_call_stack_level=2
+set FLAGS_use_curand=True
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
 dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index afa0011858987b17f6b01563d315069d9cf7c6d3..5cd416e6d933a7424f67b3f94b49f488b6171e45 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -61,6 +61,8 @@ function init() {
     # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default
     export FLAGS_call_stack_level=2
 
+    export FLAGS_use_curand=True
+
     # set CI_SKIP_CPP_TEST if only *.py changed
     # In order to avoid using in some CI(such as daily performance), the current
     # branch must not be `${BRANCH}` which is usually develop.
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
index 317eb14ad069e2ad76bce3a3de1f8f41f129697f..9ae9c14db3fcfd10c41ccf13d2dca4e7835ea244 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
@@ -274,7 +274,7 @@ class TestDistTraning(unittest.TestCase):
         seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
-        seed = 1025
+        seed = 100
         set_random_seed(seed)
 
         rank_id = dist.get_rank()
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index dbeb5a430377f7199ca30220114676651cf530a2..d5fa944802a4763c9686811fdb85cb0be1586f59 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -333,7 +333,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             lr_ratio=simple_lr_fun)
 
         loss_ref = np.array(
-            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
+            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)
@@ -379,7 +379,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
         exe.run(startup)
 
         loss_ref = np.array(
-            [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626])
+            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 121dcbb3cdc12a3554939896997614b31cfc8a92..70ab1cc523507edcdfb361beaaa7b44742ba10cd 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -287,5 +287,49 @@ class TestStandardNormalDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPUs have a different number of threads, which results in different
+        # random values. Only test on V100 GPU here.
+ if not "V100" in paddle.device.cuda.get_device_name(): + return + + def _check_random_value(dtype, expect, expect_mean, expect_std): + x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) + actual = x.numpy() + self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) + self.assertTrue(np.mean(actual), expect_mean) + self.assertTrue(np.std(actual), expect_std) + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + expect = [ + -0.79037829, -0.54411126, -0.32266671, 0.35791815, 1.44169267, + -0.87785644, -1.23909874, -2.18194139, 0.49489656, 0.40703062 + ] + expect_mean = -0.0000053026194133403266873214888799115129813799285329878330230713 + expect_std = 0.99999191058126390974081232343451119959354400634765625 + _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, + expect_std) + + expect = [ + -0.7988942, 1.8644791, 0.02782744, 1.3692524, 0.6419724, 0.12436751, + 0.12058455, -1.9984808, 1.5635862, 0.18506318 + ] + expect_mean = -0.00004762359094456769526004791259765625 + expect_std = 0.999975681304931640625 + _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, + expect_std) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 6de36c02bee05b35413b465987459187f641bb84..a84c3b20da26c9f7ab8792caae038e1c432c6659 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -562,5 +562,49 @@ class TestUniformDtype(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Note(zhouwei): The Number of threads is determined by + # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different + # GPU have different number of threads, which result in different + # random value. Only test on V100 GPU here. + if not "V100" in paddle.device.cuda.get_device_name(): + return + + def _check_random_value(dtype, expect, expect_mean, expect_std): + x = paddle.rand([32, 3, 1024, 1024], dtype=dtype) + actual = x.numpy() + self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) + self.assertEqual(np.mean(actual), expect_mean) + self.assertEqual(np.std(actual), expect_std) + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + expect = [ + 0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608, + 0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187 + ] + expect_mean = 0.50000454338820143895816272561205551028251647949218750 + expect_std = 0.28867379167297479991560749112977646291255950927734375 + _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, + expect_std) + + expect = [ + 0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257, 0.46352342, + 0.97228295, 0.12771158, 0.286525, 0.9810645 + ] + expect_mean = 0.50002604722976684570312500 + expect_std = 0.2886914908885955810546875 + _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, + expect_std) + paddle.enable_static() + + if __name__ == "__main__": unittest.main()