Unverified commit 813f61d2 authored by zhouweiwei2014 and committed by GitHub

change CUDA implementation of randperm OP (#40464)

Parent 6b7d4845
@@ -14,37 +14,161 @@
#include "paddle/phi/kernels/randperm_kernel.h"
#ifdef __NVCC__
#include <curand_kernel.h>
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
DECLARE_bool(use_curand);
namespace phi {
template <typename T>
__global__ void SwapRepeatKernel(
int* key, T* data, int n, uint64_t seed, uint64_t offset) {
  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  // Each thread inspects the key pair (idx, idx + 1), so the last valid
  // thread index is n - 2.
  if (idx >= static_cast<size_t>(n - 1)) return;

  // Only the thread sitting at the first element of a run of equal keys
  // handles that run.
  bool first_repeat = false;
  if (key[idx] == key[idx + 1]) {
    if (idx == 0) {
      first_repeat = true;
    } else if (key[idx] != key[idx - 1]) {
      first_repeat = true;
    }
  }

  if (!first_repeat) return;

  // Measure the length of the run of equal keys starting at idx.
  int repeat_size = 1;
  for (int i = idx; i < n - 1; ++i) {
    if (key[i] == key[i + 1]) {
      ++repeat_size;
    } else {
      break;
    }
  }

  // Fisher-Yates shuffle of the data inside the run, driven by a Philox
  // counter-based RNG so each thread gets an independent stream.
#ifdef __NVCC__
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = curand(&state) % (i + 1);
#elif __HIPCC__
  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = hiprand(&state) % (i + 1);
#endif
    if (r != i) {
      T tmp = data[idx + i];
      data[idx + i] = data[idx + r];
      data[idx + r] = tmp;
    }
  }
}
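After sorting by random keys, a run of equal keys leaves the relative order of its values to the whims of the radix sort, so the kernel above re-shuffles each such run. A minimal host-side sketch of the same repair, with std::mt19937 standing in for the per-thread Philox state (ShuffleEqualKeyRuns is a name invented for this sketch, not part of the change):

#include <random>
#include <utility>
#include <vector>

// Illustrative stand-in for SwapRepeatKernel: after (key, data) pairs are
// sorted by key, give every run of equal keys an unbiased Fisher-Yates
// shuffle so the final permutation stays uniform.
void ShuffleEqualKeyRuns(const std::vector<int>& key,
                         std::vector<int>* data,
                         std::mt19937* rng) {
  const size_t n = key.size();
  size_t i = 0;
  while (i < n) {
    size_t j = i + 1;
    while (j < n && key[j] == key[i]) ++j;  // [i, j) is one run of equal keys
    for (size_t k = j - 1; k > i; --k) {    // Fisher-Yates within the run
      std::uniform_int_distribution<size_t> pick(i, k);
      std::swap((*data)[k], (*data)[pick(*rng)]);
    }
    i = j;
  }
}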
template <typename T, typename Context>
void RandpermRawKernel(
const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) {
  if (FLAGS_use_curand) {
    // Draw one random 32-bit key per element, then radix-sort an iota
    // sequence by those keys to obtain the permutation.
    DenseTensor key;
    RandintKernel<int, Context>(dev_ctx,
                                std::numeric_limits<int>::min(),
                                std::numeric_limits<int>::max(),
                                ScalarArray({n}),
                                phi::DataType::INT32,
                                &key);
    DenseTensor key_out = Empty<int, Context>(dev_ctx, ScalarArray({n}));

    DenseTensor range = Empty<T, Context>(dev_ctx, ScalarArray({n}));
    T* range_data = range.data<T>();
    funcs::ForRange<Context> for_range(dev_ctx, n);
    for_range([range_data] __device__(size_t idx) {
      range_data[idx] = static_cast<T>(idx);
    });

    out->Resize(phi::make_ddim({n}));
    T* out_data = dev_ctx.template Alloc<T>(out);

    // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to
    // improve performance of radix sort.
    double n_d = static_cast<double>(n);
    int begin_bit = 0;
    int end_bit =
        std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9))));

    // CUB convention: the first call (null temp storage) only computes the
    // required scratch size; the second call performs the sort.
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs<int, T>(nullptr,
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());

    auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs<int, T>(d_temp_storage->ptr(),
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());

    // Duplicate keys leave their values in sort-dependent order; shuffle
    // those runs to remove the bias. This path takes its randomness from
    // the device generator, not from the `seed` argument.
    auto gen_cuda = dev_ctx.GetGenerator();
    auto seed_offset = gen_cuda->IncrementOffset(n);
    uint64_t rng_seed = seed_offset.first;
    uint64_t rng_offset = seed_offset.second;

    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
    SwapRepeatKernel<T><<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
                          dev_ctx.stream()>>>(
        key_out.data<int>(), out_data, n, rng_seed, rng_offset);
  } else {
    // Fallback path: build and shuffle the permutation on the host, then
    // copy it to the device.
    DenseTensor tmp;
    tmp.Resize(phi::make_ddim({n}));
    T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);

    std::shared_ptr<std::mt19937_64> engine;
    if (seed) {
      engine = std::make_shared<std::mt19937_64>();
      engine->seed(seed);
    } else {
      engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
    }

    for (int i = 0; i < n; ++i) {
      tmp_data[i] = static_cast<T>(i);
    }
    std::shuffle(tmp_data, tmp_data + n, *engine);

    T* out_data = dev_ctx.template Alloc<T>(out);
    auto size = out->numel() * paddle::experimental::SizeOf(out->dtype());
    paddle::memory::Copy<phi::GPUPlace, phi::Place>(
        out->place(), out_data, tmp.place(), tmp_data, size, 0);
  }
}
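The end_bit expression caps how many low-order key bits the radix sort has to visit: fewer bits mean fewer sort passes, at the price of occasional duplicate keys, which SwapRepeatKernel then repairs. A quick host-side check of the same expression (EndBit and main are illustrative names for this sketch, not part of the change):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Same formula as in RandpermRawKernel: the number of low-order key bits
// the radix sort must examine to keep duplicate keys rare (see the note
// linked in the kernel comment), clamped to the full 32-bit key width.
int EndBit(int n) {
  double n_d = static_cast<double>(n);
  int end_bit = static_cast<int>(
      std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))));
  return std::min(end_bit, 32);
}

int main() {
  // Small n needs only a prefix of the key bits; by n = 30000 the bound
  // already reaches the full 32 bits.
  std::printf("%d %d\n", EndBit(1000), EndBit(30000));  // prints: 23 32
  return 0;
}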
template <typename T, typename Context>
......
@@ -18,6 +18,7 @@ from op_test import OpTest
import paddle
import paddle.fluid.core as core
from paddle.static import program_guard, Program
import os
def check_randperm_out(n, data_np):
@@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase):
paddle.enable_static()
class TestRandomValue(unittest.TestCase):
def test_fixed_random_number(self):
# Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
if not paddle.is_compiled_with_cuda():
return
if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
return
print("Test Fixed Random number on GPU------>")
paddle.disable_static()
paddle.set_device('gpu')
paddle.seed(2021)
x = paddle.randperm(30000, dtype='int32').numpy()
expect = [
24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413
]
self.assertTrue(np.array_equal(x[0:10], expect))
expect = [
29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096
]
self.assertTrue(np.array_equal(x[10000:10010], expect))
expect = [
298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343
]
self.assertTrue(np.array_equal(x[20000:20010], expect))
x = paddle.randperm(30000, dtype='int64').numpy()
expect = [
6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171
]
self.assertTrue(np.array_equal(x[0:10], expect))
expect = [
23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818
]
self.assertTrue(np.array_equal(x[10000:10010], expect))
expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839]
self.assertTrue(np.array_equal(x[20000:20010], expect))
x = paddle.randperm(30000, dtype='float32').numpy()
expect = [
5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144.,
22906., 10705.
]
self.assertTrue(np.array_equal(x[0:10], expect))
expect = [
1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564.,
26991.
]
self.assertTrue(np.array_equal(x[10000:10010], expect))
expect = [
25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339.,
4662.
]
self.assertTrue(np.array_equal(x[20000:20010], expect))
x = paddle.randperm(30000, dtype='float64').numpy()
expect = [
19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147.,
9163.
]
self.assertTrue(np.array_equal(x[0:10], expect))
expect = [
15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945.,
17624.
]
self.assertTrue(np.array_equal(x[10000:10010], expect))
expect = [
10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202.,
21404.
]
self.assertTrue(np.array_equal(x[20000:20010], expect))
paddle.enable_static()
if __name__ == "__main__":
unittest.main()