Unverified commit 593a4428, authored by ronnywang, committed by GitHub

[ROCM] Remove the constraint of a maximum of 256 threads per block, P3 (#56701)

Parent 41e72a41
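The removed cap was a holdover from early ROCm; the change assumes the target AMD GPUs support the same 1024-thread-per-block limit that CUDA devices report. To confirm the limit on a specific ROCm device, here is a standalone check using the stock HIP runtime API (not part of this commit):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int dev = 0;
  int max_threads = 0;
  hipGetDevice(&dev);
  // Query the hardware per-block thread limit; recent AMD GPUs report 1024.
  hipDeviceGetAttribute(&max_threads, hipDeviceAttributeMaxThreadsPerBlock, dev);
  std::printf("max threads per block: %d\n", max_threads);
  return 0;
}
```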
@@ -58,11 +58,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
                                                int* final_nodes_len) {
   const auto place = dev_ctx.GetPlace();
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_input + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
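Since the grid is clamped to GetCUDAMaxGridDimSize()[0], a kernel launched with these parameters cannot assume one thread per input element; the usual companion is a grid-stride loop. A self-contained sketch of the idiom, with a hypothetical stand-in kernel (not Paddle's actual hash-table kernel):

```cpp
#include <cuda_runtime.h>
#include <cstdint>

// Grid-stride loop: stays correct even when the grid was clamped
// below ceil(num_input / block). The hash insert is a toy placeholder.
__global__ void FillStub(const int64_t* input, int64_t num_input,
                         int64_t table_size, int64_t* keys) {
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < num_input; i += stride) {
    keys[input[i] % table_size] = input[i];
  }
}

void LaunchFillStub(const int64_t* input, int64_t num_input,
                    int64_t table_size, int64_t* keys, int max_grid_dimx,
                    cudaStream_t stream) {
  int block = 1024;  // was 256 under PADDLE_WITH_HIP before this commit
  int grid_tmp = static_cast<int>((num_input + block - 1) / block);
  int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
  FillStub<<<grid, block, 0, stream>>>(input, num_input, table_size, keys);
}
```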
@@ -128,11 +124,7 @@ void FillBufferHashTable(const Context& dev_ctx,
                          thrust::device_vector<T>* unique_items,
                          int* values,
                          int* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_input + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
@@ -167,11 +159,7 @@ void ResetBufferHashTable(const Context& dev_ctx,
                           thrust::device_vector<T>* unique_items,
                           int* values,
                           int* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (unique_items->size() + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
@@ -189,12 +177,8 @@ void ReindexSrc(const Context& dev_ctx,
                 int* values,
                 int64_t num_edges,
                 int64_t table_size) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
-  // Fill outputs with reindex result.
+  // Fill outputs with reindex result.
   int block = 1024;
-#endif
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_edges + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
@@ -289,12 +273,8 @@ void BufferReindex(const Context& dev_ctx,
   out_nodes->resize(unique_nodes.size());
   thrust::copy(unique_nodes.begin(), unique_nodes.end(), out_nodes->begin());
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
-  // Fill outputs with reindex result.
+  // Fill outputs with reindex result.
   int block = 1024;
-#endif
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_edges + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
@@ -336,13 +336,8 @@ void GroupNormGradKernel(const Context& dev_ctx,
     }
   }
-#ifdef __HIPCC__
-  int block_size = std::max(std::min(256, imsize), 64);
-  const int block_dims = 256;
-#else
   int block_size = std::min(1024, imsize);
   const int block_dims = 1024;
-#endif
   dim3 grid(group_size, groups, x_dims[0]);
   dim3 threads(block_size, 1, 1);
   int flags =
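Here the block size follows the per-sample spatial size imsize, capped at 1024; the removed HIP branch capped it at 256 and also floored it at 64. A short sketch of the resulting launch geometry (names mirror the diff; values are illustrative):

```cpp
#include <cuda_runtime.h>
#include <algorithm>

// One block per (channel-in-group, group, batch sample); up to one
// thread per spatial element, capped at the 1024-thread block limit.
void GroupNormLaunchShape(int imsize, int group_size, int groups, int batch,
                          dim3* grid, dim3* threads) {
  int block_size = std::min(1024, imsize);  // HIP used min(256, imsize), floored at 64
  *grid = dim3(group_size, groups, batch);
  *threads = dim3(block_size, 1, 1);
}
```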
@@ -820,11 +820,7 @@ void GroupNormDirectCUDAFunctor<T, AccT>::operator()(
       image_size *= input_ddim[i];
     }
   }
-#ifdef __HIPCC__
-  int block_size = std::max(std::min(256, image_size), 64);
-#else
   int block_size = std::min(1024, image_size);
-#endif
   dim3 grid(group_size, groups, input_ddim[0]);
   dim3 threads(block_size, 1, 1);
   if (data_layout == DataLayout::kNCHW) {
@@ -943,11 +939,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx,
     }
   }
-#ifdef __HIPCC__
-  int block_size = std::max(std::min(256, imsize), 64);
-#else
   int block_size = std::min(1024, imsize);
-#endif
   dim3 grid(group_size, groups, x_dims[0]);
   dim3 threads(block_size, 1, 1);
@@ -1096,11 +1096,7 @@ static void Interpolate2DCUDABwd(
           interp_divmods);
     }
   } else if ("bicubic" == interp_method) {
-#ifdef __HIPCC__
-    constexpr int thread_per_block = 256;
-#else
     constexpr int thread_per_block = 512;
-#endif
     KeBicubicInterpBw<T>
         <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
             input_grad_data,
@@ -985,11 +985,7 @@ static void Interpolate2DCUDAFwd(
           interp_divmods);
     }
   } else if ("bicubic" == interp_method) {
-#ifdef __HIPCC__
-    constexpr int thread_per_block = 256;
-#else
     constexpr int thread_per_block = 512;
-#endif
     KeBicubicInterpFw<T>
         <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
             input_data,
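Both bicubic paths now share one compile-time block size of 512. A minimal sketch of that launch pattern follows; the kernel body is a placeholder, since the real KeBicubicInterpFw/KeBicubicInterpBw signatures are elided from this diff:

```cpp
#include <cuda_runtime.h>

constexpr int thread_per_block = 512;  // was 256 under __HIPCC__

// Placeholder kernel: the real bicubic kernels take geometry and
// scale-ratio arguments that this diff does not show.
template <typename T>
__global__ void KeBicubicStub(T* out, const T* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i];
}

template <typename T>
void LaunchBicubicStub(T* out, const T* in, int n, cudaStream_t stream) {
  int block_per_grid = (n + thread_per_block - 1) / thread_per_block;
  KeBicubicStub<T><<<block_per_grid, thread_per_block, 0, stream>>>(out, in, n);
}
```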
@@ -127,11 +127,7 @@ void LUKernel(const Context& dev_ctx,
               DenseTensor* out,
               DenseTensor* pivots,
               DenseTensor* infos) {
-#ifdef __HIPCC__
-  const int64_t kMaxBlockDim = 256;
-#else
   const int64_t kMaxBlockDim = 512;
-#endif
   *out = Transpose2DTo6D<Context, T>(dev_ctx, x);
@@ -96,11 +96,7 @@ void NormGradKernel(const Context& ctx,
   int pre, n, post;
   funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
-#ifdef __HIPCC__
-  const int block = 256;
-#else
   const int block = 512;
-#endif
   int max_threads = ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(max_blocks, pre * post);
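The norm kernels bound the grid twice: by occupancy (max_threads / block physical blocks) and by the slice count pre * post, so one block may process several slices. A simplified sketch of a kernel written against that contract (atomicAdd stands in for the real shared-memory reduction, and out must be zero-initialized):

```cpp
#include <cuda_runtime.h>
#include <algorithm>

// Each block loops over (pre, post) slices whenever grid < pre * post.
__global__ void SquaredSumStub(const float* x, float* out,
                               int pre, int n, int post) {
  for (int s = blockIdx.x; s < pre * post; s += gridDim.x) {
    int i = s / post, k = s % post;
    float partial = 0.f;
    for (int j = threadIdx.x; j < n; j += blockDim.x) {
      float v = x[(i * n + j) * post + k];  // x laid out as [pre, n, post]
      partial += v * v;
    }
    atomicAdd(&out[s], partial);  // toy reduction across the block
  }
}

void LaunchSquaredSumStub(const float* x, float* out, int pre, int n,
                          int post, int max_threads, cudaStream_t stream) {
  const int block = 512;  // was 256 under __HIPCC__
  const int max_blocks = std::max(max_threads / block, 1);
  int grid = std::min(max_blocks, pre * post);
  SquaredSumStub<<<grid, block, 0, stream>>>(x, out, pre, n, post);
}
```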
@@ -108,11 +108,7 @@ void NormKernel(const Context& ctx,
   int pre, n, post;
   funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
-#ifdef __HIPCC__
-  const int block = 256;
-#else
   const int block = 512;
-#endif
   int max_threads = ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(max_blocks, pre * post);
@@ -63,11 +63,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
   const IndexT* s_index = src_index.data<IndexT>();
   const IndexT* d_index = dst_index.data<IndexT>();
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int64_t n = slice_size * index_size;
   int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
@@ -90,11 +90,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
   const IndexT* s_index = src_index.data<IndexT>();
   const IndexT* d_index = dst_index.data<IndexT>();
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int64_t n = slice_size * index_size;
   int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
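Unlike the graph-reindex hunks above, the send/recv helpers do the grid arithmetic in 64-bit, since n = slice_size * index_size can exceed INT_MAX. The same idiom as a small free-standing helper (illustrative, not Paddle code):

```cpp
#include <cstdint>

// Ceil-divide in 64-bit, then clamp to the device's max grid.x extent.
inline int64_t GridForN(int64_t n, int block, int64_t max_grid_dimx) {
  int64_t grid_tmp = (n + block - 1) / block;
  return grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
}
```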