未验证 提交 76b328bc 编写于 作者: ronnywang 提交者: GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P2 (#56700)

上级 593a4428
......@@ -467,11 +467,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
set_constant(ctx, &scale_tmp, static_cast<T>(1));
}
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks);
......
......@@ -106,9 +106,6 @@ void ComputeFullArg(const phi::GPUContext& dev_ctx,
block_size = 32;
else if (col > 8)
block_size = 16;
#ifdef __HIPCC__
block_size = std::min(block_size, 256);
#endif
return block_size;
};
......
......@@ -505,12 +505,7 @@ void CheckNumericsKernel(const Context& ctx,
// Print to the standard output.
char* gpu_str_ptr = GetGpuHintStringPtr<T>(ctx, op_type, var_name, dev_id);
#ifdef __HIPCC__
// HIP will throw GPU memory access fault if threads > 256
const size_t threads = 256;
#else
const size_t threads = 1024;
#endif
size_t blocks =
std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor.numel() + threads - 1) / threads));
......
......@@ -174,11 +174,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims());
const int64_t remain = d / axis_dim;
#ifdef __HIPCC__
int block = 256;
#else
int block = 512;
#endif
auto stream = dev_ctx.stream();
// do not with softmax op, and input is softmax
......
......@@ -90,11 +90,7 @@ __global__ void CrossEntropySoftLabel(T* loss,
const int kDimCeil = 1 << log2_elements;
const int kVSize = sizeof(VecT) / sizeof(T);
#ifdef __HIPCC__
const int kThreadPerBlock = 256;
#else
const int kThreadPerBlock = 512;
#endif
const int kBatchPerBlock = 1;
const int kWarpSize = 32; // (dim < 32) ? dim : 32;
const int kBatchSize = 1;
......@@ -718,11 +714,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
int N,
int dim,
int D) {
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512;
#endif
int64_t block_dim = dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(dim)));
......@@ -799,11 +791,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(dim));
const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1;
int blocks = (N * D + kBatchPerBlock - 1) / kBatchPerBlock;
......@@ -1308,11 +1296,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(axis_dim));
const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1;
int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock;
dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1);
......
......@@ -1256,16 +1256,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
thread = (output_width - 1) / 2 + 1;
else if (output_width > 512 && output_width <= 1024)
thread = output_width;
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min(std::max(thread / output_width, 1), output_height);
threads = dim3(std::min(output_width, thread), blocks, 1);
grid = dim3(output_channels, batch_size, 1);
} else {
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min(
std::max(thread / output_channels, 1),
((output_width + dilate_width - 1) / dilate_width) * dilate_width);
......@@ -1276,11 +1270,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
}
int filter_multiplier = output_channels / input_channels;
int nums_output = output->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512;
#endif
int grid_size = (nums_output + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \
......@@ -1449,11 +1439,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
}
int filter_multiplier = output_channels / input_channels;
int nums_input = input_grad->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512;
#endif
int grid_size = (nums_input + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \
......
......@@ -50,11 +50,7 @@ void FillDiagonalGradKernel(const Context& ctx,
int offset,
bool wrap,
DenseTensor* x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
auto* in_data = ctx.template Alloc<T>(x_grad);
phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
......
......@@ -50,11 +50,7 @@ void FillDiagonalKernel(const Context& ctx,
int offset,
bool wrap,
DenseTensor* out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T* out_data = ctx.template Alloc<T>(out);
......
......@@ -48,11 +48,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
int dim1,
int dim2,
DenseTensor *x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
auto matrows = 1;
if (x_grad) {
......
......@@ -50,11 +50,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
int dim1,
int dim2,
DenseTensor *out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T *out_data = ctx.template Alloc<T>(out);
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册