Unverified · Commit 8c154880 authored by ronnywang, committed by GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P4 (#56702)

Parent 59b2ad39
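Every hunk in this commit applies the same one-line change: the ROCm-only cap of 256 threads per block (guarded by `#ifdef __HIPCC__` or `#ifdef PADDLE_WITH_HIP`) is deleted, and the value previously used only for CUDA builds (512 or 1024, depending on the kernel) is applied unconditionally. A minimal before/after sketch of the pattern (identifier names and values vary per file, as the hunks below show):

```cpp
// Before: ROCm builds capped every launch at 256 threads per block.
#ifdef PADDLE_WITH_HIP
int threads = 256;
#else
int threads = 1024;
#endif

// After: CUDA and ROCm builds use the same block size.
int threads = 1024;
```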
@@ -42,11 +42,10 @@ template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 inline static int GetDesiredBlockDim(int64_t block_dim) {
+  const int kMaxBlockDim = 512;
 #ifdef __HIPCC__
-  const int kMaxBlockDim = 256;
   const int lwarpSize = 64;
 #else
-  const int kMaxBlockDim = 512;
   const int lwarpSize = 32;
 #endif
   return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize;
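The effect on ROCm builds: rows at least 512 elements wide now get the full 512-thread block that CUDA builds always used, while narrower rows still fall back to a single wavefront of 64 lanes. A small host-only illustration with made-up row widths (not part of the patch):

```cpp
// Host-only illustration (hypothetical inputs): GetDesiredBlockDim after the
// change, evaluated for two row widths on a ROCm build.
#include <cstdio>

static int GetDesiredBlockDimAfter(long long block_dim) {
  const int kMaxBlockDim = 512;  // same cap on CUDA and ROCm after this commit
  const int lwarpSize = 64;      // AMD wavefront size (32 on NVIDIA)
  return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize;
}

int main() {
  std::printf("%d\n", GetDesiredBlockDimAfter(1024));  // 512 (was capped at 256 before)
  std::printf("%d\n", GetDesiredBlockDimAfter(48));    // 64, one wavefront (unchanged)
}
```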
@@ -1875,11 +1874,7 @@ static void LayerNormBackward(
     int64_t feature_size,
     const phi::GPUContext &dev_ctx) {
   auto stream = dev_ctx.stream();
-#ifdef __HIPCC__
-  const int kMaxBlockDim = 256;
-#else
   const int kMaxBlockDim = 512;
-#endif
   const int kMaxBlockNum = 128;
   int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                       ((d_scale != nullptr ? 1 : 0) << 1) |
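LayerNormBackward encodes which gradients were requested into a bitmask and presumably dispatches on it further down; only the two highest bits are visible in this hunk. A small illustration of those two bits (the remaining term of the expression is outside the shown context):

```cpp
// Illustration of the visible part of the dispatch mask: bit 2 is set when
// d_x is requested, bit 1 when d_scale is requested. The rest of the
// expression is not shown in the hunk above.
#include <cstdio>

int main() {
  const bool need_dx = true;
  const bool need_dscale = false;
  const int gradient_flag =
      ((need_dx ? 1 : 0) << 2) | ((need_dscale ? 1 : 0) << 1);
  std::printf("gradient_flag = %d\n", gradient_flag);  // prints 4: only d_x
}
```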
@@ -113,11 +113,7 @@ void CalculateXGrad(const Context& ctx,
     const DenseTensor& out_grad_tensor,
     const DenseTensor* dst_count = nullptr,
     const DenseTensor* out = nullptr) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int64_t n = slice_size * index_size;
   int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
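The launch sizing above is a plain ceiling division of n = slice_size * index_size by the block size; fetching GetCUDAMaxGridDimSize()[0] suggests the block count is then clamped to the device's x-dimension grid limit, though that clamp is outside the shown context. A numeric illustration with assumed sizes (not from the patch):

```cpp
// Assumed sizes (not from the patch): grid computation for 1024-thread
// blocks, with the block count clamped to an assumed device grid limit.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int block = 1024;                    // now also used on ROCm
  const int64_t slice_size = 128;
  const int64_t index_size = 50000;
  const int64_t n = slice_size * index_size; // elements to cover
  const int64_t max_grid_dimx = 2147483647;  // assumed device limit
  const int64_t grid_tmp = (n + block - 1) / block;        // ceiling division
  const int64_t grid = std::min(grid_tmp, max_grid_dimx);  // clamp (assumed)
  std::printf("grid = %lld, block = %d\n", static_cast<long long>(grid), block);
}
```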
@@ -101,11 +101,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx,
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
   int64_t input_size = x.dims()[0];
-#ifdef PADDLE_WITH_HIP
-  int block_ = 256;
-#else
   int block_ = 1024;
-#endif
   if (reduce_op == "SUM" || reduce_op == "MEAN") {
     GraphSendUERecvSumCUDAFunctor<T> sum_functor;
     if (message_op == "ADD") {
@@ -35,11 +35,7 @@ namespace cub = hipcub;
 namespace phi {
-#ifdef __HIPCC__
-static constexpr int kNumCUDAThreads = 256;
-#else
 static constexpr int kNumCUDAThreads = 512;
-#endif
 static constexpr int kNumMaxinumNumBlocks = 4096;
 static inline int NumBlocks(const int N) {
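The body of NumBlocks is not part of this hunk; helpers of this shape in Paddle typically do a ceiling division by the per-block thread count and cap the result at kNumMaxinumNumBlocks. A hedged sketch under that assumption:

```cpp
// Hedged sketch (the real body is outside the shown context): a ceiling
// division by kNumCUDAThreads, capped at kNumMaxinumNumBlocks.
#include <algorithm>

static constexpr int kNumCUDAThreads = 512;       // now also used on ROCm
static constexpr int kNumMaxinumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}
```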
@@ -88,11 +88,7 @@ class Unpool2dMaxGradFunctor {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMaxGrad<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
@@ -131,11 +127,7 @@ class Unpool3dMaxGradFunctor {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMaxGrad<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
@@ -80,11 +80,7 @@ class Unpool2dMaxFunctor {
     const T* input_data = input.data<T>();
     const int* indices_data = indices.data<int>();
     T* output_data = dev_ctx.template Alloc<T>(output);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMax<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
@@ -117,11 +113,7 @@ class Unpool3dMaxFunctor {
     const T* input_data = input.data<T>();
     const int* indices_data = indices.data<int>();
     T* output_data = dev_ctx.template Alloc<T>(output);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMax<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
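All four unpool functors now launch 1024-thread blocks on both backends, with the grid sized by ceiling division over input.numel(). For an assumed input of 8 × 64 × 32 × 32 elements (numbers not from the patch), the ROCm launch changes as follows:

```cpp
// Assumed numbers (not from the patch): ROCm launch shape for an unpool
// input of 8 * 64 * 32 * 32 elements, before and after this commit.
#include <cstdio>

int main() {
  const long long numel = 8LL * 64 * 32 * 32;  // 524288 elements
  const int old_threads = 256;                 // previous ROCm cap
  const int new_threads = 1024;                // value now shared with CUDA
  std::printf("before: <<<%lld, %d>>>\n",
              (numel + old_threads - 1) / old_threads, old_threads);  // 2048 blocks
  std::printf("after:  <<<%lld, %d>>>\n",
              (numel + new_threads - 1) / new_threads, new_threads);  // 512 blocks
}
```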
@@ -870,11 +870,7 @@ static void GetGridDim(
 }
 static void GetBlockDim(int mid_dim, int low_dim, dim3* block) {
-#ifdef __HIPCC__
-  constexpr int max_num_threads = 256;
-#else
   constexpr int max_num_threads = 1024;
-#endif
   int block_x = 1 << Log2Ceil(low_dim);
   int block_y = 1 << Log2Ceil(mid_dim);
   block->x = std::min(block_x, 32);
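Only the first half of GetBlockDim is visible above; presumably block->y is clamped afterwards so that block->x * block->y stays within max_num_threads, which this commit raises to 1024 on ROCm. A hedged, host-only sketch under that assumption (Log2Ceil is written out here because its definition is not shown):

```cpp
// Hedged, host-only sketch (the tail of the real function is outside the
// shown context): a 2D block whose x covers the low dimension, capped at 32
// lanes, and whose y takes the remaining threads up to max_num_threads.
#include <algorithm>

static int Log2Ceil(int n) {  // smallest k with (1 << k) >= n; assumed helper body
  int k = 0;
  while ((1 << k) < n) ++k;
  return k;
}

struct Dim3 { int x, y, z; };  // stand-in for dim3 in this host-only sketch

static void GetBlockDimSketch(int mid_dim, int low_dim, Dim3* block) {
  constexpr int max_num_threads = 1024;  // now also the ROCm value
  int block_x = 1 << Log2Ceil(low_dim);
  int block_y = 1 << Log2Ceil(mid_dim);
  block->x = std::min(block_x, 32);
  block->y = std::min(block_y, max_num_threads / block->x);  // assumed clamp
  block->z = 1;
}
```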