From 589709954ce73c064c638ee1df7cd92f6bc46efd Mon Sep 17 00:00:00 2001
From: Zhou Wei <1183042833@qq.com>
Date: Wed, 23 Mar 2022 11:43:41 +0800
Subject: [PATCH] change CUDA implementation of multinomial OP (#40752)

---
 .../phi/kernels/funcs/distribution_helper.h   |  12 +-
 paddle/phi/kernels/funcs/inclusive_scan.h     | 274 ++++++++++++++++++
 paddle/phi/kernels/gpu/multinomial_kernel.cu  | 191 +++++++++---
 .../tests/unittests/test_multinomial_op.py    |  55 ++++
 4 files changed, 488 insertions(+), 44 deletions(-)
 create mode 100644 paddle/phi/kernels/funcs/inclusive_scan.h

diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index f752ec0c5cf..68e986c334e 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -50,11 +50,15 @@ struct exponential_transform {
 
   HOSTDEVICE inline T operator()(T val) const {
 #if defined(__NVCC__) || defined(__HIPCC__)
-    if (std::is_same<T, double>::value) {
-      return static_cast<T>(-1.0) / lambda_ * log(val);
-    } else {
-      return static_cast<T>(-1.0) / lambda_ * __logf(val);
+    T log = -std::numeric_limits<T>::epsilon() / 2;
+    if (val < static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2) {
+      if (std::is_same<T, double>::value) {
+        log = logf(val);
+      } else {
+        log = __logf(val);
+      }
     }
+    return static_cast<T>(-1.0) / lambda_ * log;
 #else
     return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
 #endif
diff --git a/paddle/phi/kernels/funcs/inclusive_scan.h b/paddle/phi/kernels/funcs/inclusive_scan.h
new file mode 100644
index 00000000000..b285c5bdbbf
--- /dev/null
+++ b/paddle/phi/kernels/funcs/inclusive_scan.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include "paddle/phi/common/type_traits.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/malloc.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+struct IsComplex : public std::false_type {};
+
+template <>
+struct IsComplex<::phi::dtype::complex<float>> : public std::true_type {};
+
+template <>
+struct IsComplex<::phi::dtype::complex<double>> : public std::true_type {};
+
+template <typename InputIterator, typename OutputIterator, typename BinaryOp>
+static void CubInclusiveScan(InputIterator x_iter,
+                             OutputIterator y_iter,
+                             size_t n,
+                             BinaryOp op,
+                             const phi::GPUContext &dev_ctx) {
+  paddle::memory::allocation::AllocationPtr allocation;
+  void *temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  for (size_t i = 0; i < 2; ++i) {
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cub::DeviceScan::InclusiveScan(temp_storage,
+                                       temp_storage_bytes,
+                                       x_iter,
+                                       y_iter,
+                                       op,
+                                       static_cast<int>(n),
+                                       dev_ctx.stream()));
+    if (i == 0 && temp_storage_bytes > 0) {
+      allocation =
+          paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
+      temp_storage = allocation->ptr();
+    }
+  }
+}
+
+template <typename T>
+static auto MakeThrustReverseIterator(T *x) {
+  return thrust::reverse_iterator<thrust::device_ptr<T>>(
+      thrust::device_pointer_cast(x));
+}
+
+template <typename T, typename BinaryOp, bool kReverse>
+struct InclusiveScanOuterOrMidDimFunctor {
+  HOSTDEVICE InclusiveScanOuterOrMidDimFunctor(
+      const T *x, T *y, size_t mid_dim, size_t inner_dim, T init, BinaryOp op)
+      : x_(x),
+        y_(y),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        init_(init),
+        op_(op) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto outer_idx = idx / inner_dim_;
+    auto inner_idx = idx % inner_dim_;
+    if (kReverse) {
+      idx = outer_idx * mid_dim_ * inner_dim_ + (mid_dim_ - 1) * inner_dim_ +
+            inner_idx;
+    } else {
+      idx = outer_idx * mid_dim_ * inner_dim_ + inner_idx;
+    }
+
+    auto x_ptr = x_ + idx;
+    auto y_ptr = y_ + idx;
+    T acc_value = init_;
+    for (size_t i = 0; i < mid_dim_; ++i) {
+      acc_value = op_(acc_value, *x_ptr);
+      *y_ptr = acc_value;
+      if (kReverse) {
+        x_ptr -= inner_dim_;
+        y_ptr -= inner_dim_;
+      } else {
+        x_ptr += inner_dim_;
+        y_ptr += inner_dim_;
+      }
+    }
+  }
+
+ private:
+  const T *x_;
+  T *y_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T init_;
+  BinaryOp op_;
+};
+
+template <typename T,
+          typename BinaryOp,
+          size_t kThreadNumX,
+          size_t kThreadNumY,
+          bool kReverse>
+static __global__ void InclusiveScanInnerDimCUDAKernel(
+    const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) {
+  using RealT = phi::dtype::Real<T>;
+  constexpr auto kSharedBufferSize =
+      IsComplex<T>::value ? 4 * kThreadNumX : 2 * kThreadNumX;
+  __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize];
+  T *row_buf = reinterpret_cast<T *>(sbuf[threadIdx.y]);
+
+  size_t block_row = static_cast<size_t>(blockIdx.x * kThreadNumY);
+  size_t block_row_stride = static_cast<size_t>(gridDim.x * kThreadNumY);
+  for (; block_row < num_rows; block_row += block_row_stride) {
+    size_t row = block_row + threadIdx.y;
+    T block_total = init;
+
+    const T *row_x = x + row * row_size;
+    T *row_y = y + row * row_size;
+    for (size_t block_col = 0; block_col < row_size;
+         block_col += 2 * kThreadNumX) {
+      size_t col1, col2;
+      if (kReverse) {
+        col1 = row_size - 1 - block_col - threadIdx.x;
+        col2 = col1 - kThreadNumX;
+      } else {
+        col1 = block_col + threadIdx.x;
+        col2 = col1 + kThreadNumX;
+      }
+
+      if (row < num_rows) {
+        if (col1 < row_size) {
+          row_buf[threadIdx.x] = row_x[col1];
+        } else {
+          row_buf[threadIdx.x] = init;
+        }
+
+        if (col2 < row_size) {
+          row_buf[kThreadNumX + threadIdx.x] = row_x[col2];
+        } else {
+          row_buf[kThreadNumX + threadIdx.x] = init;
+        }
+
+        if (threadIdx.x == 0) {
+          row_buf[0] = op(row_buf[0], block_total);
+        }
+      }
+      __syncthreads();
+
+      for (size_t s = kThreadNumX, d = 1; s >= 1; s >>= 1, d <<= 1) {
+        if (row < num_rows && threadIdx.x < s) {
+          size_t offset = (2 * threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      for (size_t s = 2, d = kThreadNumX / 2; d >= 1; s <<= 1, d >>= 1) {
+        if (row < num_rows && threadIdx.x < s - 1) {
+          size_t offset = 2 * (threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      if (row < num_rows) {
+        if (col1 < row_size) row_y[col1] = row_buf[threadIdx.x];
+        if (col2 < row_size) row_y[col2] = row_buf[kThreadNumX + threadIdx.x];
+      }
+      block_total = row_buf[2 * kThreadNumX - 1];
+      __syncthreads();
+    }
+  }
+}
+
+template <typename T, typename BinaryOp>
+static void InclusiveScanInnerDim(const T *x,
+                                  T *y,
+                                  size_t outer_dim,
+                                  size_t inner_dim,
+                                  T init,
+                                  BinaryOp op,
+                                  bool reverse,
+                                  const phi::GPUContext &dev_ctx) {
+  constexpr size_t kThreadNumX = 16;
+  constexpr size_t kThreadNumY = 32;
+
+  size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
+  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
+  dim3 thread_dims(kThreadNumX, kThreadNumY);
+  if (reverse) {
+    InclusiveScanInnerDimCUDAKernel<
+        T,
+        BinaryOp,
+        kThreadNumX,
+        kThreadNumY,
+        /*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+        x, y, outer_dim, inner_dim, init, op);
+  } else {
+    InclusiveScanInnerDimCUDAKernel<
+        T,
+        BinaryOp,
+        kThreadNumX,
+        kThreadNumY,
+        /*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+        x, y, outer_dim, inner_dim, init, op);
+  }
+}
+
+template <typename T, typename BinaryOp>
+void InclusiveScan(const T *x,
+                   T *y,
+                   size_t outer_dim,
+                   size_t mid_dim,
+                   size_t inner_dim,
+                   T init,
+                   BinaryOp op,
+                   bool reverse,
+                   const phi::GPUContext &dev_ctx) {
+  if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
+
+  if (outer_dim == 1 && inner_dim == 1) {
+    if (reverse) {
+      auto x_reverse_iter = MakeThrustReverseIterator(x + mid_dim);
+      auto y_reverse_iter = MakeThrustReverseIterator(y + mid_dim);
+      CubInclusiveScan(x_reverse_iter, y_reverse_iter, mid_dim, op, dev_ctx);
+    } else {
+      CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
+    }
+  } else if (inner_dim != 1) {
+    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx,
+                                                    outer_dim * inner_dim);
+    if (reverse) {
+      for_range(
+          InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
+              x, y, mid_dim, inner_dim, init, op));
+    } else {
+      for_range(
+          InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/false>(
+              x, y, mid_dim, inner_dim, init, op));
+    }
+  } else {
+    InclusiveScanInnerDim(
+        x, y, outer_dim, mid_dim, init, op, reverse, dev_ctx);
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu
index 4918495ff7b..752a91fa481 100644
--- a/paddle/phi/kernels/gpu/multinomial_kernel.cu
+++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu
@@ -23,11 +23,32 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/scan.h>
 
-#include "paddle/fluid/platform/transform.h"
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/arg_min_max_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/inclusive_scan.h"
 #include "paddle/phi/kernels/funcs/multinomial_functor.h"
+#include "paddle/phi/kernels/top_k_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/transform.h"
+
+DECLARE_bool(use_curand);
 
 namespace phi {
 
@@ -57,12 +78,12 @@ template <typename T>
 __global__ void GetCumulativeProbs(T* norm_probs_data,
                                    int64_t num_distributions,
                                    int64_t num_categories,
-                                   T* cumulative_probs) {
+                                   T* cumulative_probs_data) {
   int id = blockIdx.x;
   thrust::inclusive_scan(thrust::device,
                          norm_probs_data + id * num_categories,
                          norm_probs_data + (id + 1) * num_categories,
-                         cumulative_probs + id * num_categories);
+                         cumulative_probs_data + id * num_categories);
 }
 
 template <typename T>
@@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor {
 };
 
 template <typename T>
-__device__ int binarySearchFunctor(T* cumulative_probs,
+__device__ int binarySearchFunctor(T* cumulative_probs_data,
                                    T* norm_probs_data,
                                    int num_categories,
                                    T rng_number) {
@@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs,
   while (right - left > 0) {
     int mid = left + (right - left) / 2;
 
-    T temp_prob = cumulative_probs[mid];
+    T temp_prob = cumulative_probs_data[mid];
     if (temp_prob < rng_number) {
       left = mid + 1;
     } else {
@@ -114,26 +135,35 @@ __global__ void sampleMultinomialWithReplacement(
     int64_t* out_data,
     const int64_t num_distributions,
    const int64_t num_categories,
-    T* cumulative_probs,
-    T* norm_probs_data) {
+    T* cumulative_probs_data,
+    T* norm_probs_data,
+    uint64_t seed,
+    uint64_t offset,
+    bool use_curand) {
   // use binary search to get the selected category sample id.
-  // let cumulative_probs[id-1] < rng_data < cumulative_probs[id].
+  // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id].
+  size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x +
+               threadIdx.x;
 
-  // for every distribution
-  int dist = blockIdx.y;
-  // for every sample
-  int sample = blockIdx.x * blockDim.x + threadIdx.x;
-  if (sample < num_samples) {
-    T rng_number = rng_data[sample + dist * num_samples];
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx, offset, &state);
 
-    // Find the bucket that a uniform random number lies in
-    int selected_category =
-        binarySearchFunctor(cumulative_probs + dist * num_categories,
-                            norm_probs_data + dist * num_categories,
-                            num_categories,
-                            rng_number);
+  int sample = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) {
+    if (sample < num_samples) {
+      T rng_number = rng_data[sample + dist * num_samples];
+      if (use_curand) {
+        rng_number = static_cast<T>(curand_uniform4(&state).x);
+      }
+      // Find the bucket that a uniform random number lies in
+      int selected_category =
+          binarySearchFunctor(cumulative_probs_data + dist * num_categories,
+                              norm_probs_data + dist * num_categories,
+                              num_categories,
+                              rng_number);
 
-    out_data[sample + dist * num_samples] = selected_category;
+      out_data[sample + dist * num_samples] = selected_category;
+    }
   }
 }
 
@@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx,
                in_data_numel * sizeof(T),
                cudaMemcpyDeviceToHost);
 #endif
 
+    if (FLAGS_use_curand) {
+      for (size_t i = 0; i < num_distributions; ++i) {
+        int zero_num = 0;
+        for (size_t j = 0; j < num_categories; ++j) {
+          T weight = cpu_in_data[i * num_distributions + j];
+          PADDLE_ENFORCE_GE(
+              weight,
+              0,
+              errors::InvalidArgument(
+                  "Each element of multinomial's input must be >= 0, but got %f.",
+                  weight));
+          if (weight == static_cast<T>(0)) {
+            zero_num++;
+          }
+        }
+        int valid_samples = num_categories - zero_num;
+        PADDLE_ENFORCE_LE(
+            num_samples,
+            valid_samples,
+            errors::InvalidArgument("When replacement=False, 'num_samples' "
+                                    "must be less than or equal to the number "
+                                    "of positive items of the input"));
+      }
+
+      // Refer to [gumbel softmax algorithm]
+      DenseTensor rand = EmptyLike<T, Context>(dev_ctx, x);
+      T* rand_data = rand.data<T>();
+      funcs::uniform_distribution<T> dist;
+      funcs::exponential_transform<T> trans(1.0);
+      funcs::distribution_and_transform<T>(dev_ctx, &rand, dist, trans);
+
+      funcs::ForRange<Context> for_range(dev_ctx, x.numel());
+      for_range([rand_data, in_data] __device__(size_t idx) {
+        rand_data[idx] = in_data[idx] / rand_data[idx];
+      });
+
+      if (num_samples == 1) {
+        ArgMaxKernel<T, Context>(
+            dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out);
+      } else {
+        std::vector<int64_t> out_dim_vec = vectorize<int64_t>(out->dims());
+        DenseTensor value =
+            Empty<T, Context>(dev_ctx, ScalarArray(out_dim_vec));
+        TopkKernel<T, Context>(
+            dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out);
+      }
+      return;
+    }
     funcs::MultinomialFunctor<T>(dev_ctx,
                                  cpu_out_data,
@@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx,
   auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor);
 
   // number of threads in a block is min(num_categories, 512)
-  dim3 block_norm(num_categories < 512 ? num_categories : 512);
+  int block_size = num_categories < 512 ? num_categories : 512;
+  dim3 block_norm(block_size);
   dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
   NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(
       norm_probs_data,
@@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx,
       num_categories);
 
   // Get cumulative probability of each distribution. It's the same function
-  // of
-  // ``cumsum`` op.
+  // of ``cumsum`` op.
   DenseTensor cumulative_probs_tensor;
   cumulative_probs_tensor.Resize({num_distributions, num_categories});
-  auto* cumulative_probs = dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
-
-  dim3 block_cumsum(1);
-  dim3 grid_cumsum(num_distributions);
-  GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
-      norm_probs_data, num_distributions, num_categories, cumulative_probs);
+  auto* cumulative_probs_data =
+      dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
+
+  if (FLAGS_use_curand) {
+    // 'phi::funcs::InclusiveScan' has higher accuracy than
+    // 'thrust::inclusive_scan'
+    funcs::InclusiveScan<T, std::plus<T>>(
+        /*in*/ norm_probs_data,
+        /*out*/ cumulative_probs_data,
+        /*outer_dim*/ static_cast<size_t>(num_distributions),
+        /*mid_dim*/ static_cast<size_t>(num_categories),
+        /*inner_dim*/ static_cast<size_t>(1),
+        /*init*/ static_cast<T>(0),
+        std::plus<T>(),
+        /*reverse=*/false,
+        dev_ctx);
+  } else {
+    dim3 block_cumsum(1);
+    dim3 grid_cumsum(num_distributions);
+    GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
+        norm_probs_data,
+        num_distributions,
+        num_categories,
+        cumulative_probs_data);
+  }
 
   // Generate random number for each sample.
   std::random_device rd;
@@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx,
       RandomGeneratorCudaFunctor<T>(seed));
 
   // Sample the multinomial distributions.
-  dim3 block_sample(128);
-  dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions);
-  sampleMultinomialWithReplacement<
-      T><<<grid_sample, block_sample, 0, dev_ctx.stream()>>>(rng_data,
-                                                             num_samples,
-                                                             out_data,
-                                                             num_distributions,
-                                                             num_categories,
-                                                             cumulative_probs,
-                                                             norm_probs_data);
+  dim3 block(128);
+  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
+  const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
+  int grid_y = std::min<int64_t>(num_distributions, prop.maxGridSize[1]);
+  dim3 grid((num_samples - 1) / block.x + 1, grid_y);
+
+  auto gen_cuda = dev_ctx.GetGenerator();
+  size_t curand4_loop_times =
+      (num_distributions + 4 * grid_y - 1) / (4 * grid_y);
+  // 'increment' should be a multiple of 4
+  uint64_t increment = curand4_loop_times * 4;
+  auto seed_offset = gen_cuda->IncrementOffset(increment);
+
+  sampleMultinomialWithReplacement<T><<<grid, block, 0, dev_ctx.stream()>>>(
+      rng_data,
+      num_samples,
+      out_data,
+      num_distributions,
+      num_categories,
+      cumulative_probs_data,
+      norm_probs_data,
+      seed_offset.first,
+      seed_offset.second,
+      FLAGS_use_curand);
 }
 
 }  // namespace phi
diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
index cdb89bb9640..a65a1c7e14c 100644
--- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 from op_test import OpTest
 import numpy as np
+import os
 
 
 def sample_output_one_dimension(out, dim):
@@ -216,5 +217,59 @@ class TestMultinomialError(unittest.TestCase):
         self.assertRaises(ValueError, test_dim_less_than_1)
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Different GPUs generate different random values. Only test V100 here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(100)
+
+        x = paddle.randint(0, 100, [1024, 10000]).astype('float32')
+        y = paddle.multinomial(x, 1, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 5187793)
+        self.assertEqual(np.mean(y), 5066.2041015625)
+        expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628]
+        self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect))
+
+        y = paddle.multinomial(x, 5000, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 25603962316)
+        self.assertEqual(np.mean(y), 5000.77388984375)
+        expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916]
+        self.assertTrue(np.array_equal(y[100, 1000:1010], expect))
+
+        y = paddle.multinomial(x, 5000, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 25592855710)
+        self.assertEqual(np.mean(y), 4998.604630859375)
+        expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385]
+        self.assertTrue(np.array_equal(y[300, 3000:3010], expect))
+
+        y = paddle.multinomial(x, 20000, replacement=True).numpy()
+        self.assertEqual(np.sum(y), 102371362581)
+        self.assertEqual(np.mean(y), 4998.60168852539)
+        self.assertEqual(np.std(y), 2886.316308500771)
+        expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156]
+        self.assertTrue(np.array_equal(y[100, 0:10], expect))
+
+        y = paddle.multinomial(x, 20000, replacement=True).numpy()
+        self.assertEqual(np.sum(y), 102400672117)
+        self.assertEqual(np.mean(y), 5000.032818212891)
+        self.assertEqual(np.std(y), 2886.913426124017)
+        expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911]
+        self.assertTrue(np.array_equal(y[100, 0:10], expect))
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
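Background note on the sampling scheme used by the FLAGS_use_curand path of MultinomialKernel: for replacement=False it divides each non-negative weight by an independent Exp(1) draw (funcs::exponential_transform followed by rand_data[idx] = in_data[idx] / rand_data[idx]) and then takes argmax (num_samples == 1) or top-k. This is the exponential-race / Gumbel-max trick. Below is a minimal NumPy sketch of the same idea; the helper name and the example weights are illustrative assumptions, not part of the patch.

    import numpy as np

    def sample_without_replacement(w, num_samples, rng=None):
        # w: [num_distributions, num_categories], non-negative weights.
        # Divide each weight by an independent Exp(1) draw; the k largest
        # keys per row select k distinct categories with probability
        # proportional to w (zero weights get key 0 and are never picked
        # while num_samples <= number of positive weights).
        rng = np.random.default_rng() if rng is None else rng
        keys = w / rng.standard_exponential(size=w.shape)
        return np.argsort(-keys, axis=1)[:, :num_samples]

    w = np.array([[0.1, 0.0, 0.7, 0.2],
                  [0.25, 0.25, 0.25, 0.25]])
    print(sample_without_replacement(w, 2))

Since log(w / Exp(1)) = log(w) + Gumbel(0, 1), taking the top-k of these keys is equivalent to Gumbel top-k sampling, i.e. sequential sampling without replacement; that is why a single elementwise pass plus ArgMaxKernel/TopkKernel suffices on the GPU instead of a per-sample loop over the cumulative distribution.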