From 3c14b09458a31ff3d9d819b36cbb59cc5d1835c9 Mon Sep 17 00:00:00 2001 From: Rayman Date: Wed, 24 Aug 2022 16:59:04 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Hackathon=20No.34=E3=80=91=E4=BC=98?= =?UTF-8?q?=E5=8C=96=20poisson=20op=20(#45160)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 【Hackathon No.34】优化 poisson op * [poisson] code style fix * modify code style * prevent from big number * modify code style * modify code style * modify import * modify import * modify code style --- paddle/phi/backends/gpu/gpu_launch_config.h | 8 ++++ paddle/phi/kernels/gpu/poisson_kernel.cu | 43 ++++++++------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 552f60783c..6ea206178c 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -229,6 +229,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, return config; } +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; + grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2]; +} } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/kernels/gpu/poisson_kernel.cu b/paddle/phi/kernels/gpu/poisson_kernel.cu index 347f70b166..302a9fe5ce 100644 --- a/paddle/phi/kernels/gpu/poisson_kernel.cu +++ b/paddle/phi/kernels/gpu/poisson_kernel.cu @@ -20,6 +20,7 @@ limitations under the License. */ #endif #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/poisson_kernel.h" @@ -27,48 +28,38 @@ limitations under the License. */ namespace phi { template -struct PoissonCudaFunctor { - public: - PoissonCudaFunctor(const T* in, - T* out, - unsigned int seed, - unsigned int offset) - : in_(in), out_(out), seed_(seed), offset_(offset) {} - - __device__ void operator()(int64_t idx) { +__global__ void GetPoisson( + const T* in, T* out, const int N, unsigned int seed, unsigned int offset) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { #ifdef __NVCC__ curandStatePhilox4_32_10_t state; - curand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(curand_poisson(&state, in_[idx])); + curand_init(seed, idx, offset, &state); + out[idx] = static_cast(curand_poisson(&state, in[idx])); #elif __HIPCC__ hiprandStatePhilox4_32_10_t state; - hiprand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); + hiprand_init(seed, idx, offset, &state); + out[idx] = static_cast(hiprand_poisson(&state, in[idx])); #endif } - - private: - const T* in_; - T* out_; - const unsigned int seed_; - const unsigned int offset_; -}; +} template void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { const T* x_data = x.data(); T* out_data = ctx.template Alloc(out); - auto size = x.numel(); + const int size = x.numel(); + const int kMaxBlockDim = 256; + + int block_size = std::min(kMaxBlockDim, ctx.GetMaxThreadsPerBlock()); + dim3 dim_block(block_size); + dim3 dim_grid((size + block_size - 1) / block_size); + phi::backends::gpu::LimitGridDim(ctx, &dim_grid); auto gen_cuda = ctx.GetGenerator(); auto seed_offset = gen_cuda->IncrementOffset(20); uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - - phi::funcs::ForRange for_range(ctx, size); - - PoissonCudaFunctor functor(x_data, out_data, seed, offset); - for_range(functor); + GetPoisson<<>>(x_data, out_data, size, seed, offset); } } // namespace phi -- GitLab