Unverified commit d7679426, authored by ronnywang, committed by GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)

Parent 68f8176b
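
Every hunk below follows the same pattern: the HIP-only branch that pinned kernel launches to 256 threads per block is deleted, and the value already used for CUDA builds (512 or 1024, depending on the file) now applies to ROCm as well. A minimal before/after sketch using the variable from the first hunk:

```cpp
// Before: ROCm (HIP) builds were capped at 256 threads per block.
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024;
#endif

// After: a single value for both CUDA and ROCm builds.
int block = 1024;
```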
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
thrust::device_vector<T>* keys,
thrust::device_vector<T>* values,
thrust::device_vector<int64_t>* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
  int block = 1024;
-#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
subset->resize(unique_items.size());
thrust::copy(unique_items.begin(), unique_items.end(), subset->begin());
// Fill outputs with reindex result.
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
  // Fill outputs with reindex result.
  int block = 1024;
-#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
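
For context, the launch math in FillHashTable and ReindexFunc is the usual ceil-division pattern, with the grid presumably clamped to the device's maximum x-dimension. A hedged sketch (the helper name and the std::min clamp are illustrative; the line that applies the clamp is collapsed out of the hunk):

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical helper mirroring the launch math above: ceil-divide the element
// count by the block size (now 1024 on both CUDA and ROCm), then clamp the grid
// to the limit reported by GetCUDAMaxGridDimSize()[0]. The clamp is an assumption.
inline int64_t ComputeGridDim(int64_t num_elements,
                              int block,
                              int64_t max_grid_dimx) {
  int64_t grid_tmp = (num_elements + block - 1) / block;
  return std::min(grid_tmp, max_grid_dimx);
}
```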
......
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
......
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,
constexpr int kNumTensor = MaxTensorNumPerLaunch;
constexpr int kNumChunk = MaxChunkNumPerLaunch;
-#ifdef PADDLE_WITH_HIP
-  constexpr int kBlockDim = 256;
-#else
  constexpr int kBlockDim = 512;
-#endif
int max_chunk_num = -1;
int vec_size = 8;
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
phi::errors::InvalidArgument("Beta2Pow should be nullptr."));
}
-#ifdef PADDLE_WITH_HIP
-  const int block_dim = 256;
-#else
  const int block_dim = 512;
-#endif
int vec_size = 8;
for (int i = 0; i < n; ++i) {
......
@@ -32,14 +32,9 @@
#include "paddle/fluid/platform/device_context.h"
-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
/* CUDA performs better as thread_per_block
   num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
-#endif
namespace paddle {
namespace platform {
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
-#ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
-#else
  return std::min(1024, std::max(32, (n + 1)));
-#endif
}
#ifdef WITH_NV_JETSON
......
@@ -34,13 +34,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"
-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
// CUDA performs better when thread_per_block is between [64, 512]
#define PREDEFINED_BLOCK_SIZE 512
-#endif
namespace phi {
namespace backends {
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
inline int64_t RoundToPowerOfTwo(int64_t n) {
constexpr int64_t min_val = 32;
int64_t num = RoundToNextHighPowOfTwo(n, min_val);
-#ifdef __HIPCC__
-  int64_t max_val = 256;
-#else
  int64_t max_val = 1024;
-#endif
return std::min(max_val, num);
}
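
RoundToPowerOfTwo rounds a requested thread count up to a power of two and clamps it to [32, 1024] now that the HIP-specific 256 ceiling is gone. A standalone sketch of the bit-smearing variant shown in the platform header above (the initial decrement is not visible in the hunk and is an assumption, included only so that exact powers of two map to themselves):

```cpp
#include <algorithm>

// The OR cascade copies the highest set bit into every lower position,
// so n + 1 is the next power of two; the result is clamped to [32, 1024].
// Before this commit, HIP builds clamped to 256 instead.
inline int RoundToPowerOfTwo(int n) {
  n--;             // assumption: lets exact powers of two round to themselves
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));
}
```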
......
@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
int batch_size = prob->dims()[0];
int class_num = prob->dims()[1];
-#ifdef __HIPCC__
-  constexpr int kMaxBlockDim = 256;
-#else
  constexpr int kMaxBlockDim = 512;
-#endif
if (softLabel) {
const T* label_data = labels->data<T>();
......
@@ -32,11 +32,7 @@ limitations under the License. */
#endif
-#ifdef __HIPCC__
-constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
-#else
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
-#endif
#define BLOCK_X 32
#define BLOCK_Y 32
......
@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> {
template <typename Function>
inline void operator()(Function func) const {
-#ifdef __HIPCC__
-  // HIP will throw core dump when threads > 256
-  constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
+#if WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
......
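
In ForRange<phi::GPUContext> the HIP special case disappears entirely, leaving only the Jetson path with its 128-thread cap. A rough, self-contained sketch of a for-range style launch under that assumption (the kernel body, the 1024 default, and the helper names are illustrative, not Paddle's exact code):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// One thread per index in [0, limit); the block size is no longer capped at 256 on ROCm.
template <typename Function>
__global__ void ForRangeKernel(size_t limit, Function func) {
  size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < limit) {
    func(idx);
  }
}

template <typename Function>
void LaunchForRange(size_t limit, Function func, cudaStream_t stream) {
  if (limit == 0) return;
#if WITH_NV_JETSON
  size_t num_threads = 128;   // Jetson devices fault above 128 threads
#else
  size_t num_threads = 1024;  // assumed default; HIP builds used to force 256
#endif
  size_t block = limit < num_threads ? limit : num_threads;
  size_t grid = (limit + block - 1) / block;
  ForRangeKernel<<<grid, block, 0, stream>>>(limit, func);
}
```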
@@ -25,11 +25,7 @@
#include <cooperative_groups.h>
#endif
-#ifdef __HIPCC__
-#define LARS_BLOCK_SIZE 256
-#else
#define LARS_BLOCK_SIZE 512
-#endif
#define LARS_MAX_MERGED_OPS 60
......