未验证 提交 593a4428 编写于 作者: R ronnywang 提交者: GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P3 (#56701)

上级 41e72a41
...@@ -58,11 +58,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx, ...@@ -58,11 +58,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
int* final_nodes_len) { int* final_nodes_len) {
const auto place = dev_ctx.GetPlace(); const auto place = dev_ctx.GetPlace();
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block; int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
...@@ -128,11 +124,7 @@ void FillBufferHashTable(const Context& dev_ctx, ...@@ -128,11 +124,7 @@ void FillBufferHashTable(const Context& dev_ctx,
thrust::device_vector<T>* unique_items, thrust::device_vector<T>* unique_items,
int* values, int* values,
int* key_index) { int* key_index) {
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block; int grid_tmp = (num_input + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
...@@ -167,11 +159,7 @@ void ResetBufferHashTable(const Context& dev_ctx, ...@@ -167,11 +159,7 @@ void ResetBufferHashTable(const Context& dev_ctx,
thrust::device_vector<T>* unique_items, thrust::device_vector<T>* unique_items,
int* values, int* values,
int* key_index) { int* key_index) {
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (unique_items->size() + block - 1) / block; int grid_tmp = (unique_items->size() + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
...@@ -189,12 +177,8 @@ void ReindexSrc(const Context& dev_ctx, ...@@ -189,12 +177,8 @@ void ReindexSrc(const Context& dev_ctx,
int* values, int* values,
int64_t num_edges, int64_t num_edges,
int64_t table_size) { int64_t table_size) {
// Fill outputs with reindex result. // Fill outputs with reindex result.
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_edges + block - 1) / block; int grid_tmp = (num_edges + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
...@@ -289,12 +273,8 @@ void BufferReindex(const Context& dev_ctx, ...@@ -289,12 +273,8 @@ void BufferReindex(const Context& dev_ctx,
out_nodes->resize(unique_nodes.size()); out_nodes->resize(unique_nodes.size());
thrust::copy(unique_nodes.begin(), unique_nodes.end(), out_nodes->begin()); thrust::copy(unique_nodes.begin(), unique_nodes.end(), out_nodes->begin());
// Fill outputs with reindex result. // Fill outputs with reindex result.
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_edges + block - 1) / block; int grid_tmp = (num_edges + block - 1) / block;
int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
......
...@@ -336,13 +336,8 @@ void GroupNormGradKernel(const Context& dev_ctx, ...@@ -336,13 +336,8 @@ void GroupNormGradKernel(const Context& dev_ctx,
} }
} }
#ifdef __HIPCC__
int block_size = std::max(std::min(256, imsize), 64);
const int block_dims = 256;
#else
int block_size = std::min(1024, imsize); int block_size = std::min(1024, imsize);
const int block_dims = 1024; const int block_dims = 1024;
#endif
dim3 grid(group_size, groups, x_dims[0]); dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1); dim3 threads(block_size, 1, 1);
int flags = int flags =
......
...@@ -820,11 +820,7 @@ void GroupNormDirectCUDAFunctor<T, AccT>::operator()( ...@@ -820,11 +820,7 @@ void GroupNormDirectCUDAFunctor<T, AccT>::operator()(
image_size *= input_ddim[i]; image_size *= input_ddim[i];
} }
} }
#ifdef __HIPCC__
int block_size = std::max(std::min(256, image_size), 64);
#else
int block_size = std::min(1024, image_size); int block_size = std::min(1024, image_size);
#endif
dim3 grid(group_size, groups, input_ddim[0]); dim3 grid(group_size, groups, input_ddim[0]);
dim3 threads(block_size, 1, 1); dim3 threads(block_size, 1, 1);
if (data_layout == DataLayout::kNCHW) { if (data_layout == DataLayout::kNCHW) {
...@@ -943,11 +939,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, ...@@ -943,11 +939,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx,
} }
} }
#ifdef __HIPCC__
int block_size = std::max(std::min(256, imsize), 64);
#else
int block_size = std::min(1024, imsize); int block_size = std::min(1024, imsize);
#endif
dim3 grid(group_size, groups, x_dims[0]); dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1); dim3 threads(block_size, 1, 1);
......
...@@ -1096,11 +1096,7 @@ static void Interpolate2DCUDABwd( ...@@ -1096,11 +1096,7 @@ static void Interpolate2DCUDABwd(
interp_divmods); interp_divmods);
} }
} else if ("bicubic" == interp_method) { } else if ("bicubic" == interp_method) {
#ifdef __HIPCC__
constexpr int thread_per_block = 256;
#else
constexpr int thread_per_block = 512; constexpr int thread_per_block = 512;
#endif
KeBicubicInterpBw<T> KeBicubicInterpBw<T>
<<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>( <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
input_grad_data, input_grad_data,
......
...@@ -985,11 +985,7 @@ static void Interpolate2DCUDAFwd( ...@@ -985,11 +985,7 @@ static void Interpolate2DCUDAFwd(
interp_divmods); interp_divmods);
} }
} else if ("bicubic" == interp_method) { } else if ("bicubic" == interp_method) {
#ifdef __HIPCC__
constexpr int thread_per_block = 256;
#else
constexpr int thread_per_block = 512; constexpr int thread_per_block = 512;
#endif
KeBicubicInterpFw<T> KeBicubicInterpFw<T>
<<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>( <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
input_data, input_data,
......
...@@ -127,11 +127,7 @@ void LUKernel(const Context& dev_ctx, ...@@ -127,11 +127,7 @@ void LUKernel(const Context& dev_ctx,
DenseTensor* out, DenseTensor* out,
DenseTensor* pivots, DenseTensor* pivots,
DenseTensor* infos) { DenseTensor* infos) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512; const int64_t kMaxBlockDim = 512;
#endif
*out = Transpose2DTo6D<Context, T>(dev_ctx, x); *out = Transpose2DTo6D<Context, T>(dev_ctx, x);
......
...@@ -96,11 +96,7 @@ void NormGradKernel(const Context& ctx, ...@@ -96,11 +96,7 @@ void NormGradKernel(const Context& ctx,
int pre, n, post; int pre, n, post;
funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512; const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post); int grid = std::min(max_blocks, pre * post);
......
...@@ -108,11 +108,7 @@ void NormKernel(const Context& ctx, ...@@ -108,11 +108,7 @@ void NormKernel(const Context& ctx,
int pre, n, post; int pre, n, post;
funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512; const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post); int grid = std::min(max_blocks, pre * post);
......
...@@ -63,11 +63,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( ...@@ -63,11 +63,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
const IndexT* s_index = src_index.data<IndexT>(); const IndexT* s_index = src_index.data<IndexT>();
const IndexT* d_index = dst_index.data<IndexT>(); const IndexT* d_index = dst_index.data<IndexT>();
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block; int64_t grid_tmp = (n + block - 1) / block;
......
...@@ -90,11 +90,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, ...@@ -90,11 +90,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
const IndexT* s_index = src_index.data<IndexT>(); const IndexT* s_index = src_index.data<IndexT>();
const IndexT* d_index = dst_index.data<IndexT>(); const IndexT* d_index = dst_index.data<IndexT>();
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (n + block - 1) / block; int64_t grid_tmp = (n + block - 1) / block;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册