From d76794262eb3f3fd0b507576ae8047ee20359630 Mon Sep 17 00:00:00 2001
From: ronnywang
Date: Thu, 31 Aug 2023 10:09:52 +0800
Subject: [PATCH] [ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)

---
 paddle/fluid/operators/graph_khop_sampler_op.cu   | 10 +---------
 paddle/fluid/operators/math/unpooling.cu          | 16 ----------------
 .../optimizers/distributed_fused_lamb_op.cu       |  8 --------
 .../platform/device/gpu/gpu_launch_config.h       |  9 ---------
 paddle/phi/backends/gpu/gpu_launch_config.h       |  9 ---------
 paddle/phi/kernels/funcs/cross_entropy.cu         |  4 ----
 paddle/phi/kernels/funcs/elementwise_grad_base.h  |  4 ----
 paddle/phi/kernels/funcs/for_range.h              |  5 +----
 paddle/phi/kernels/gpu/lars_momentum_kernel.cu    |  4 ----
 9 files changed, 2 insertions(+), 67 deletions(-)

diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu
index e533960c8a6..b4e0f511f6d 100644
--- a/paddle/fluid/operators/graph_khop_sampler_op.cu
+++ b/paddle/fluid/operators/graph_khop_sampler_op.cu
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
                    thrust::device_vector* keys,
                    thrust::device_vector* values,
                    thrust::device_vector* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   const auto& dev_ctx = ctx.cuda_device_context();
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_input + block - 1) / block;
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
   subset->resize(unique_items.size());
   thrust::copy(unique_items.begin(), unique_items.end(), subset->begin());

-// Fill outputs with reindex result.
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
+  // Fill outputs with reindex result.
   int block = 1024;
-#endif
   const auto& dev_ctx = ctx.cuda_device_context();
   int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (outputs->size() + block - 1) / block;
diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
index 0ecac6c5fb0..a386772405a 100644
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor {
     const T* input_data = input.data();
     const int* indices_data = indices.data();
     T* output_data = output->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMax
         <<>>(input.numel(),
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor {
     const T* output_data = output.data();
     const T* output_grad_data = output_grad.data();
     T* input_grad_data = input_grad->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMaxGrad
         <<>>(input.numel(),
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor {
     const T* input_data = input.data();
     const int* indices_data = indices.data();
     T* output_data = output->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMax
         <<>>(input.numel(),
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor {
     const T* output_data = output.data();
     const T* output_grad_data = output_grad.data();
     T* input_grad_data = input_grad->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMaxGrad
         <<>>(input.numel(),
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index cad7e38ba1c..fdec898edbe 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,

   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
-#ifdef PADDLE_WITH_HIP
-  constexpr int kBlockDim = 256;
-#else
   constexpr int kBlockDim = 512;
-#endif

   int max_chunk_num = -1;
   int vec_size = 8;
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         phi::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }

-#ifdef PADDLE_WITH_HIP
-  const int block_dim = 256;
-#else
   const int block_dim = 512;
-#endif

   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
index d253a92c986..98c6e379342 100644
--- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h
+++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
@@ -32,14 +32,9 @@

 #include "paddle/fluid/platform/device_context.h"

-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
 /* CUDA performs better as thread_per_block
    num is between [64, 512] */
 #define PREDEFINED_BLOCK_SIZE 512
-#endif

 namespace paddle {
 namespace platform {
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
   n |= (n >> 4);
   n |= (n >> 8);
   n |= (n >> 16);
-#ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
-#else
   return std::min(1024, std::max(32, (n + 1)));
-#endif
 }

 #ifdef WITH_NV_JETSON
diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h
index a7a7ad03ad6..fd712baf754 100644
--- a/paddle/phi/backends/gpu/gpu_launch_config.h
+++ b/paddle/phi/backends/gpu/gpu_launch_config.h
@@ -34,13 +34,8 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/enforce.h"

-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
 // CUDA performs better when thread_per_block is between [64, 512]
 #define PREDEFINED_BLOCK_SIZE 512
-#endif

 namespace phi {
 namespace backends {
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
 inline int64_t RoundToPowerOfTwo(int64_t n) {
   constexpr int64_t min_val = 32;
   int64_t num = RoundToNextHighPowOfTwo(n, min_val);
-#ifdef __HIPCC__
-  int64_t max_val = 256;
-#else
   int64_t max_val = 1024;
-#endif
   return std::min(max_val, num);
 }

diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu
index add838106bf..20a15f9e944 100644
--- a/paddle/phi/kernels/funcs/cross_entropy.cu
+++ b/paddle/phi/kernels/funcs/cross_entropy.cu
@@ -124,11 +124,7 @@ void CrossEntropyFunctor::operator()(

   int batch_size = prob->dims()[0];
   int class_num = prob->dims()[1];
-#ifdef __HIPCC__
-  constexpr int kMaxBlockDim = 256;
-#else
   constexpr int kMaxBlockDim = 512;
-#endif

   if (softLabel) {
     const T* label_data = labels->data();
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index df9e9347444..5ff70c86d5f 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -32,11 +32,7 @@ limitations under the License. */

 #endif

-#ifdef __HIPCC__
-constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
-#else
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
-#endif

 #define BLOCK_X 32
 #define BLOCK_Y 32
diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h
index 9648a7d845f..484fbd21dc7 100644
--- a/paddle/phi/kernels/funcs/for_range.h
+++ b/paddle/phi/kernels/funcs/for_range.h
@@ -65,10 +65,7 @@ struct ForRange {

   template
   inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
+#if WITH_NV_JETSON
     // JETSON_NANO will throw core dump when threads > 128
     int num_thread = 256;
     backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
diff --git a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
index 14b7f1ca328..be8b1ff3796 100644
--- a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
+++ b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
@@ -25,11 +25,7 @@
 #include
 #endif

-#ifdef __HIPCC__
-#define LARS_BLOCK_SIZE 256
-#else
 #define LARS_BLOCK_SIZE 512
-#endif

 #define LARS_MAX_MERGED_OPS 60

--
GitLab