diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d4d90cac917a2c35e26eca0d57d1c5349b878599..92948bf47c9345e53d1fea54d2be4fd8efbc6f96 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -14,37 +14,161 @@ #include "paddle/phi/kernels/randperm_kernel.h" +#ifdef __NVCC__ +#include +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#include +namespace cub = hipcub; +#endif +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/randint_kernel.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { +template +__global__ void SwapRepeatKernel( + int* key, T* data, int n, uint64_t seed, uint64_t offset) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < n) return; + + bool first_repeat = false; + if (data[idx] == data[idx + 1]) { + if (idx == 0) { + first_repeat = true; + } else if (data[idx] != data[idx - 1]) { + first_repeat = true; + } + } + + if (!first_repeat) return; + + int repeat_size = 1; + for (int i = idx; i < n; ++i) { + if (data[i] == data[i + 1]) { + ++repeat_size; + } else { + break; + } + } + +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = curand(&state) % (i + 1); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = hiprand(&state) % (i + 1); +#endif + if (r != i) { + T tmp = data[idx + i]; + data[idx + i] = data[idx + r]; + data[idx + r] = tmp; + } + } +} + template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + ScalarArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + + DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed, offset); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = dev_ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); + } } template diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 4361a45f1568f5f047ee03090bd3ef28a8d6654f..2380ccb14aaeede7474c99ccbd2d5805849c2118 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core from paddle.static import program_guard, Program +import os def check_randperm_out(n, data_np): @@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main()