diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index 49e1c82482c0f14a665380e1b55e8f7bd67b1e30..f0793fb9d27db68f22bc2bc27978844072c61616 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -21,12 +21,11 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif
 
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/generator.h"
-
-#include "paddle/phi/kernels/funcs/index_impl.cu.h"
+#include "paddle/phi/core/hostdevice.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
@@ -40,7 +39,7 @@
 #endif
 
 namespace phi {
-namespace distribution {
+namespace funcs {
 
 /********************* Transformation Function **********************/
 template <typename T>
@@ -64,8 +63,9 @@ struct exponential_transform {
 };
 
 template <typename T>
-struct uniform_transform {
-  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
+struct uniform_real_transform {
+  explicit uniform_real_transform(T min, T max)
+      : range_(max - min), min_(min) {}
 
   HOSTDEVICE inline T operator()(T val) const {
     if (UNLIKELY(val == static_cast<T>(1.0))) {
@@ -80,6 +80,22 @@ struct uniform_transform {
   T min_;
 };
 
+template <typename T, typename R>
+struct uniform_int_transform {
+  explicit uniform_int_transform(int min, int max) {
+    range_ = static_cast<uint32_t>(max - min);
+    min_ = min;
+  }
+
+  HOSTDEVICE inline T operator()(R rand) const {
+    return static_cast<T>(static_cast<int>(rand % range_) + min_);
+  }
+
+ private:
+  uint32_t range_;
+  int min_;
+};
+
 template <typename T>
 struct normal_transform {
   explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
@@ -120,6 +136,27 @@ struct uniform_distribution<double> {
   static constexpr int kReturnsCount = 2;
 };
 
+template <>
+struct uniform_distribution<uint32_t> {
+  __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const {
+    return curand4(state);
+  }
+  static constexpr int kReturnsCount = 4;
+};
+
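+// The uint64_t specialization below assembles each 64-bit sample from two
+// 32-bit Philox outputs: high word from rand.x/rand.z, low word from
+// rand.y/rand.w.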
+template <>
+struct uniform_distribution<uint64_t> {
+  __device__ inline ulonglong2 operator()(
+      curandStatePhilox4_32_10_t *state) const {
+    ulonglong2 result;
+    uint4 rand = curand4(state);
+    result.x = (uint64_t)rand.x << 32 | rand.y;
+    result.y = (uint64_t)rand.z << 32 | rand.w;
+    return result;
+  }
+  static constexpr int kReturnsCount = 2;
+};
+
 template <>
 struct normal_distribution<float> {
   __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
@@ -156,6 +193,27 @@ struct uniform_distribution<double> {
   static constexpr int kReturnsCount = 2;
 };
 
+template <>
+struct uniform_distribution<uint32_t> {
+  __device__ inline uint4 operator()(hiprandStatePhilox4_32_10_t *state) const {
+    return hiprand4(state);
+  }
+  static constexpr int kReturnsCount = 4;
+};
+
+template <>
+struct uniform_distribution<uint64_t> {
+  __device__ inline ulonglong2 operator()(
+      hiprandStatePhilox4_32_10_t *state) const {
+    ulonglong2 result;
+    uint4 rand = hiprand4(state);
+    result.x = (uint64_t)rand.x << 32 | rand.y;
+    result.y = (uint64_t)rand.z << 32 | rand.w;
+    return result;
+  }
+  static constexpr int kReturnsCount = 2;
+};
+
 template <>
 struct normal_distribution<float> {
   __device__ inline float4 operator()(
@@ -209,19 +267,21 @@ __global__ void DistributionKernel(size_t size,
 }
 
 template <typename T, typename DistOp, typename TransformOp>
-void distribution_and_transform(const GPUContext &dev_ctx,
+void distribution_and_transform(const GPUContext &ctx,
                                 DenseTensor *out,
                                 DistOp dist,
                                 TransformOp trans) {
-  T *out_data = dev_ctx.template Alloc<T>(out);
+  T *out_data = ctx.template Alloc<T>(out);
   auto size = out->numel();
-
-  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
-  auto gen_cuda = dev_ctx.GetGenerator();
+  if (size == 0) return;
+  auto gen_cuda = ctx.GetGenerator();
 
   size_t block_size = 256;
   size_t expect_grid_size = (size + block_size - 1) / block_size;
-  const auto &prop = backends::gpu::GetDeviceProperties(device_id);
+
+  int64_t device_id = ctx.GetPlace().GetDeviceId();
+  const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
+
   size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
                          prop.multiProcessorCount;
   size_t grid_size =
@@ -237,13 +297,13 @@ void distribution_and_transform(const GPUContext &dev_ctx,
   uint64_t seed = seed_offset.first;
   uint64_t offset = seed_offset.second;
 
-  DistributionKernel<
-      T,
-      DistOp,
-      TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+  DistributionKernel<T,
+                     DistOp,
+                     TransformOp><<<grid_size, block_size, 0, ctx.stream()>>>(
       size, seed, offset, dist, trans, out_data, total_thread);
 }
 
 #endif
-}  // namespace distribution
+
+}  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index ac69d398b8ac44513625d2caeac2d80d5578ea6a..2b6140d2fde0d3bcef3f15c4414444f1d2099b2e 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -29,9 +29,9 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bernoulli_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/platform/transform.h"
 
 DECLARE_bool(use_curand);
@@ -77,7 +77,7 @@ __global__ void bernoulli_cuda_kernel(
   size_t total_thread = gridDim.x * blockDim.x;
 
   for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) {
-    paddle::distribution::uniform_distribution<float> dist;
+    funcs::uniform_distribution<float> dist;
     float4 rand = dist(&state);
 #pragma unroll
     for (size_t j = 0; j < 4; j++) {
diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu
index 66dc5f72a5c7067a08127bce65740851b123efd3..d4cbd5c73feae26b04ff6c73f505f1b60a80138d 100644
--- a/paddle/phi/kernels/gpu/randint_kernel.cu
+++ b/paddle/phi/kernels/gpu/randint_kernel.cu
@@ -18,10 +18,13 @@
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 
+DECLARE_bool(use_curand);
+
 namespace phi {
 
 template <typename T, typename Context>
@@ -32,34 +35,39 @@ void RandintRawKernel(const Context& dev_ctx,
                       DataType dtype,
                       int seed,
                       DenseTensor* out) {
-  DenseTensor tmp;
-  tmp.Resize(phi::make_ddim(shape.GetData()));
-  T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);
-
-  out->Resize(tmp.dims());
+  out->Resize(phi::make_ddim(shape.GetData()));
   T* data = dev_ctx.template Alloc<T>(out);
-
-  std::shared_ptr<std::mt19937_64> engine;
-  if (seed) {
-    engine = std::make_shared<std::mt19937_64>();
-    engine->seed(seed);
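+  // With FLAGS_use_curand, draw samples directly on the GPU with the Philox
+  // generator; otherwise keep the old host-generate-then-copy path.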
+  if (FLAGS_use_curand) {
+    funcs::uniform_distribution<uint32_t> dist;
+    funcs::uniform_int_transform<T, uint32_t> trans(low, high);
+    funcs::distribution_and_transform<T>(dev_ctx, out, dist, trans);
   } else {
-    engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
-  }
+    DenseTensor tmp;
+    tmp.Resize(phi::make_ddim(shape.GetData()));
+    T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);
 
-  std::uniform_int_distribution<T> dist(low, high - 1);
-  auto numel = out->numel();
-  for (int64_t i = 0; i < numel; ++i) {
-    tmp_data[i] = dist(*engine);
-  }
+    std::shared_ptr<std::mt19937_64> engine;
+    if (seed) {
+      engine = std::make_shared<std::mt19937_64>();
+      engine->seed(seed);
+    } else {
+      engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
+    }
+
+    std::uniform_int_distribution<T> dist(low, high - 1);
+    auto numel = out->numel();
+    for (int64_t i = 0; i < numel; ++i) {
+      tmp_data[i] = dist(*engine);
+    }
 
-  paddle::memory::Copy(
-      out->place(),
-      data,
-      tmp.place(),
-      tmp_data,
-      numel * paddle::experimental::SizeOf(out->dtype()),
-      0);
+    paddle::memory::Copy(
+        out->place(),
+        data,
+        tmp.place(),
+        tmp_data,
+        numel * paddle::experimental::SizeOf(out->dtype()),
+        0);
+  }
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu
index 7f24a6667e562e64d8b523dd3ab1883af27bed5a..cdab9faf6aafe35045060a011e1354c11a4c9375 100644
--- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu
@@ -116,9 +116,9 @@ void UniformRandomRawKernel(const Context& dev_ctx,
   if (generator->GetIsInitPy() && seed_flag) {
     if (FLAGS_use_curand) {
       using MT = typename kps::details::MPTypeTrait<T>::Type;
-      distribution::uniform_distribution<MT> dist;
-      distribution::uniform_transform<MT> trans(min, max);
-      distribution::distribution_and_transform<T>(dev_ctx, out, dist, trans);
+      funcs::uniform_distribution<MT> dist;
+      funcs::uniform_real_transform<MT> trans(min, max);
+      funcs::distribution_and_transform<T>(dev_ctx, out, dist, trans);
     } else {
       auto seed_offset = generator->IncrementOffset(1);
       int64_t gen_offset = size * seed_offset.second;
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
index 686e738b8e0781ccc8d49edd551b5e8d64704181..6976019210283208e9583762ddd77867d1779e1a 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
@@ -93,11 +93,11 @@ class TestGeneratorSeed(unittest.TestCase):
 
         fluid.enable_dygraph()
 
-        gen = paddle.seed(12312321111)
+        paddle.seed(12312321111)
         x = paddle.randint(low=10, shape=[10], dtype="int32")
-        st1 = gen.get_state()
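+        # The CUDA RNG state is saved and restored through the global
+        # paddle.get/set_cuda_rng_state API rather than the Generator object.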
+        st1 = paddle.get_cuda_rng_state()
         x1 = paddle.randint(low=10, shape=[10], dtype="int32")
-        gen.set_state(st1)
+        paddle.set_cuda_rng_state(st1)
         x2 = paddle.randint(low=10, shape=[10], dtype="int32")
         paddle.seed(12312321111)
         x3 = paddle.randint(low=10, shape=[10], dtype="int32")
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py
index 82bfb88d54d51e4c17e37abef66d23e3a093feda..5f58054d7efc94c9fcc234396dfefdff7f17facc 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
@@ -20,6 +20,9 @@ from op_test import OpTest
 import paddle
 from paddle.fluid import core
 from paddle.static import program_guard, Program
+import os
+
+paddle.enable_static()
 
 
 def output_hist(out):
@@ -156,5 +159,47 @@ class TestRandintImperative(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test the fixed random numbers generated on GPU by 'curandStatePhilox4_32_10_t'.
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Different GPUs generate different random values; only test on V100 here.
+        if "V100" not in paddle.device.cuda.get_device_name():
+            return
+
+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
+        print("Test Fixed Random number on GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(100)
+
+        x = paddle.randint(
+            -10000, 10000, [32, 3, 1024, 1024], dtype='int32').numpy()
+        self.assertAlmostEqual(x.mean(), -0.7517569760481516)
+        self.assertAlmostEqual(x.std(), 5773.696619107639)
+        expect = [2535, 2109, 5916, -5011, -261]
+        self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect))
+        expect = [3465, 7206, -8660, -9628, -6574]
+        self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect))
+        expect = [881, 1560, 1100, 9664, 1669]
+        self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect))
+
+        x = paddle.randint(
+            -10000, 10000, [32, 3, 1024, 1024], dtype='int64').numpy()
+        self.assertAlmostEqual(x.mean(), -1.461287518342336)
+        self.assertAlmostEqual(x.std(), 5773.023477548159)
+        expect = [7213, -9597, 754, 8129, -1158]
+        self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect))
+        expect = [-7159, 8054, 7675, 6980, 8506]
+        self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect))
+        expect = [3581, 3420, -8027, -5237, -2436]
+        self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect))
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
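A minimal sketch of the reproducibility this patch pins down, assuming a CUDA build of Paddle running with `FLAGS_use_curand=1`; the seed and shape here are illustrative, not recorded values:

```python
import paddle

paddle.set_device('gpu')  # requires a CUDA build

paddle.seed(100)
a = paddle.randint(-10000, 10000, [4, 4], dtype='int32')

paddle.seed(100)  # re-seeding replays the same Philox stream
b = paddle.randint(-10000, 10000, [4, 4], dtype='int32')

# Same seed on the same device yields identical samples.
assert (a.numpy() == b.numpy()).all()
```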