From 8c1548801e198832ced1c8a218138ca96308abc1 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Wed, 30 Aug 2023 14:40:40 +0800 Subject: [PATCH] [ROCM] Remove the constraint with a maximum number of threads per block of 256, P4 (#56702) --- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 7 +------ paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu | 4 ---- paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 4 ---- .../phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h | 4 ---- paddle/phi/kernels/gpu/unpool_grad_kernel.cu | 8 -------- paddle/phi/kernels/gpu/unpool_kernel.cu | 8 -------- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 4 ---- 7 files changed, 1 insertion(+), 38 deletions(-) diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 778f1363430..1a52e57e45f 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -42,11 +42,10 @@ template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; inline static int GetDesiredBlockDim(int64_t block_dim) { + const int kMaxBlockDim = 512; #ifdef __HIPCC__ - const int kMaxBlockDim = 256; const int lwarpSize = 64; #else - const int kMaxBlockDim = 512; const int lwarpSize = 32; #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; @@ -1875,11 +1874,7 @@ static void LayerNormBackward( int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); -#ifdef __HIPCC__ - const int kMaxBlockDim = 256; -#else const int kMaxBlockDim = 512; -#endif const int kMaxBlockNum = 128; int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | ((d_scale != nullptr ? 1 : 0) << 1) | diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index d368c43a297..e455714c508 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -113,11 +113,7 @@ void CalculateXGrad(const Context& ctx, const DenseTensor& out_grad_tensor, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { -#ifdef PADDLE_WITH_HIP - int block = 256; -#else int block = 1024; -#endif int64_t n = slice_size * index_size; int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 764490bd1cb..7274b391e8d 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -101,11 +101,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, const dim3 grid(nbx, nby); const dim3 block(ntx, nty); int64_t input_size = x.dims()[0]; -#ifdef PADDLE_WITH_HIP - int block_ = 256; -#else int block_ = 1024; -#endif if (reduce_op == "SUM" || reduce_op == "MEAN") { GraphSendUERecvSumCUDAFunctor sum_functor; if (message_op == "ADD") { diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h index 3962d86c3e7..307b51a1ca1 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h @@ -35,11 +35,7 @@ namespace cub = hipcub; namespace phi { -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else static constexpr int kNumCUDAThreads = 512; -#endif static constexpr int kNumMaxinumNumBlocks = 4096; static inline int NumBlocks(const int N) { diff --git a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu index 959544cdbb9..7cf08d92401 100644 --- a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu @@ -88,11 +88,7 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad <<>>(input.numel(), @@ -131,11 +127,7 @@ class Unpool3dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMaxGrad <<>>(input.numel(), diff --git a/paddle/phi/kernels/gpu/unpool_kernel.cu b/paddle/phi/kernels/gpu/unpool_kernel.cu index 9365c286195..1e09323642b 100644 --- a/paddle/phi/kernels/gpu/unpool_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_kernel.cu @@ -80,11 +80,7 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = dev_ctx.template Alloc(output); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax <<>>(input.numel(), @@ -117,11 +113,7 @@ class Unpool3dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = dev_ctx.template Alloc(output); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMax <<>>(input.numel(), diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index fb434b5c9cf..a4571b83e39 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -870,11 +870,7 @@ static void GetGridDim( } static void GetBlockDim(int mid_dim, int low_dim, dim3* block) { -#ifdef __HIPCC__ - constexpr int max_num_threads = 256; -#else constexpr int max_num_threads = 1024; -#endif int block_x = 1 << Log2Ceil(low_dim); int block_y = 1 << Log2Ceil(mid_dim); block->x = std::min(block_x, 32); -- GitLab