diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 552f60783c8b24278680cfa80326f01a2086a218..6ea206178ca5639ad091826c33ca054c96046b08 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -229,6 +229,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, return config; } +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; + grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2]; +} } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/kernels/gpu/poisson_kernel.cu b/paddle/phi/kernels/gpu/poisson_kernel.cu index 347f70b166657622840fbd3cfb4e62aa1f87eb2a..302a9fe5ce581e3d0557712af7809dda2c700aba 100644 --- a/paddle/phi/kernels/gpu/poisson_kernel.cu +++ b/paddle/phi/kernels/gpu/poisson_kernel.cu @@ -20,6 +20,7 @@ limitations under the License. */ #endif #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/poisson_kernel.h" @@ -27,48 +28,38 @@ limitations under the License. */ namespace phi { template -struct PoissonCudaFunctor { - public: - PoissonCudaFunctor(const T* in, - T* out, - unsigned int seed, - unsigned int offset) - : in_(in), out_(out), seed_(seed), offset_(offset) {} - - __device__ void operator()(int64_t idx) { +__global__ void GetPoisson( + const T* in, T* out, const int N, unsigned int seed, unsigned int offset) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { #ifdef __NVCC__ curandStatePhilox4_32_10_t state; - curand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(curand_poisson(&state, in_[idx])); + curand_init(seed, idx, offset, &state); + out[idx] = static_cast(curand_poisson(&state, in[idx])); #elif __HIPCC__ hiprandStatePhilox4_32_10_t state; - hiprand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); + hiprand_init(seed, idx, offset, &state); + out[idx] = static_cast(hiprand_poisson(&state, in[idx])); #endif } - - private: - const T* in_; - T* out_; - const unsigned int seed_; - const unsigned int offset_; -}; +} template void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { const T* x_data = x.data(); T* out_data = ctx.template Alloc(out); - auto size = x.numel(); + const int size = x.numel(); + const int kMaxBlockDim = 256; + + int block_size = std::min(kMaxBlockDim, ctx.GetMaxThreadsPerBlock()); + dim3 dim_block(block_size); + dim3 dim_grid((size + block_size - 1) / block_size); + phi::backends::gpu::LimitGridDim(ctx, &dim_grid); auto gen_cuda = ctx.GetGenerator(); auto seed_offset = gen_cuda->IncrementOffset(20); uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - - phi::funcs::ForRange for_range(ctx, size); - - PoissonCudaFunctor functor(x_data, out_data, seed, offset); - for_range(functor); + GetPoisson<<>>(x_data, out_data, size, seed, offset); } } // namespace phi