Unverified commit 8c154880, authored by ronnywang, committed by GitHub

[ROCM] Remove the constraint of a maximum of 256 threads per block, P4 (#56702)
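HIP builds of these kernels previously hard-capped every launch at 256 threads per block, a limit inherited from early ROCm hardware. Current AMD GPUs accept up to 1024 threads per block, the same as NVIDIA, so the per-backend #ifdef branches are dropped below and both backends share one block-size constant. A minimal sketch, assuming a HIP toolchain and device 0, of how to confirm the limit at runtime rather than hard-coding it:

// Sketch: query the device's real per-block thread limit (HIP).
// On current AMD GPUs this reports maxThreadsPerBlock = 1024.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  hipDeviceProp_t prop;
  if (hipGetDeviceProperties(&prop, /*device=*/0) != hipSuccess) return 1;
  std::printf("maxThreadsPerBlock = %d, warpSize = %d\n",
              prop.maxThreadsPerBlock, prop.warpSize);
  return 0;
}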

Parent: 59b2ad39
@@ -42,11 +42,10 @@ template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
 inline static int GetDesiredBlockDim(int64_t block_dim) {
+  const int kMaxBlockDim = 512;
 #ifdef __HIPCC__
-  const int kMaxBlockDim = 256;
   const int lwarpSize = 64;
 #else
-  const int kMaxBlockDim = 512;
   const int lwarpSize = 32;
 #endif
   return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize;
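Only the block-size cap is unified here; the warp-size difference stays, since AMD wavefronts are 64 lanes wide versus 32 on NVIDIA. A standalone sketch of the post-change selection logic (our own restatement, with the backend passed as a flag instead of the #ifdef):

#include <cstdint>
#include <cstdio>

static int GetDesiredBlockDimSketch(int64_t block_dim, bool is_hip) {
  const int kMaxBlockDim = 512;            // now identical on CUDA and ROCm
  const int lwarpSize = is_hip ? 64 : 32;  // AMD wavefront vs. NVIDIA warp
  return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize;
}

int main() {
  std::printf("%d\n", GetDesiredBlockDimSketch(2048, /*is_hip=*/true));   // 512
  std::printf("%d\n", GetDesiredBlockDimSketch(100, /*is_hip=*/true));    // 64
  std::printf("%d\n", GetDesiredBlockDimSketch(100, /*is_hip=*/false));   // 32
}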
@@ -1875,11 +1874,7 @@ static void LayerNormBackward(
     int64_t feature_size,
     const phi::GPUContext &dev_ctx) {
   auto stream = dev_ctx.stream();
-#ifdef __HIPCC__
-  const int kMaxBlockDim = 256;
-#else
   const int kMaxBlockDim = 512;
-#endif
   const int kMaxBlockNum = 128;
   int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                       ((d_scale != nullptr ? 1 : 0) << 1) |
...
@@ -113,11 +113,7 @@ void CalculateXGrad(const Context& ctx,
                     const DenseTensor& out_grad_tensor,
                     const DenseTensor* dst_count = nullptr,
                     const DenseTensor* out = nullptr) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   int64_t n = slice_size * index_size;
   int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
...
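The launch shape above is the standard ceil-divide pattern: grid_tmp rounds n / block up, and the grid is then clamped to the device's gridDim.x limit. A worked sketch with illustrative numbers (the constants here are ours, not Paddle's):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int block = 1024;           // post-change block size on both backends
  const int64_t n = 3000000;        // e.g. slice_size * index_size
  const int max_grid_dimx = 65535;  // illustrative; query the device for the real limit
  int64_t grid_tmp = (n + block - 1) / block;  // ceil(3000000 / 1024) = 2930
  int grid = static_cast<int>(std::min<int64_t>(grid_tmp, max_grid_dimx));
  std::printf("grid = %d, block = %d\n", grid, block);  // grid = 2930, block = 1024
}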
@@ -101,11 +101,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx,
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
   int64_t input_size = x.dims()[0];
-#ifdef PADDLE_WITH_HIP
-  int block_ = 256;
-#else
   int block_ = 1024;
-#endif
   if (reduce_op == "SUM" || reduce_op == "MEAN") {
     GraphSendUERecvSumCUDAFunctor<T> sum_functor;
     if (message_op == "ADD") {
...
@@ -35,11 +35,7 @@ namespace cub = hipcub;
 
 namespace phi {
 
-#ifdef __HIPCC__
-static constexpr int kNumCUDAThreads = 256;
-#else
 static constexpr int kNumCUDAThreads = 512;
-#endif
 static constexpr int kNumMaxinumNumBlocks = 4096;
 
 static inline int NumBlocks(const int N) {
...
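kNumCUDAThreads now matches the CUDA value of 512 on both backends; NumBlocks caps the grid at kNumMaxinumNumBlocks so oversized inputs are handled inside the kernel. Its body is elided in this hunk, so the following is our reconstruction of the usual pattern, not a quote of the source:

#include <algorithm>

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;  // spelling as in the source

// ceil(N / kNumCUDAThreads), capped at 4096 blocks.
static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}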
@@ -88,11 +88,7 @@ class Unpool2dMaxGradFunctor {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMaxGrad<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
@@ -131,11 +127,7 @@ class Unpool3dMaxGradFunctor {
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMaxGrad<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
...
@@ -80,11 +80,7 @@ class Unpool2dMaxFunctor {
     const T* input_data = input.data<T>();
     const int* indices_data = indices.data<int>();
     T* output_data = dev_ctx.template Alloc<T>(output);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMax<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
@@ -117,11 +113,7 @@ class Unpool3dMaxFunctor {
     const T* input_data = input.data<T>();
     const int* indices_data = indices.data<int>();
     T* output_data = dev_ctx.template Alloc<T>(output);
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMax<T>
         <<<grid, threads, 0, dev_ctx.stream()>>>(input.numel(),
...
@@ -870,11 +870,7 @@ static void GetGridDim(
 }
 
 static void GetBlockDim(int mid_dim, int low_dim, dim3* block) {
-#ifdef __HIPCC__
-  constexpr int max_num_threads = 256;
-#else
   constexpr int max_num_threads = 1024;
-#endif
   int block_x = 1 << Log2Ceil(low_dim);
   int block_y = 1 << Log2Ceil(mid_dim);
   block->x = std::min(block_x, 32);
...
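GetBlockDim builds a 2-D block: x covers the low (reduction) dimension rounded up to a power of two and capped at one warp, y covers the mid dimension, and the 1024-thread budget now applies on ROCm as well. A worked sketch (Log2Ceil reimplemented here; the y-clamp is our assumption for the elided lines):

#include <algorithm>
#include <cstdio>

static int Log2Ceil(int v) {  // smallest e with (1 << e) >= v
  int e = 0;
  while ((1 << e) < v) ++e;
  return e;
}

int main() {
  constexpr int max_num_threads = 1024;  // post-change cap on both backends
  int mid_dim = 300, low_dim = 20;
  int block_x = 1 << Log2Ceil(low_dim);  // 1 << 5 = 32
  int block_y = 1 << Log2Ceil(mid_dim);  // 1 << 9 = 512
  int bx = std::min(block_x, 32);                    // x capped at 32
  int by = std::min(block_y, max_num_threads / bx);  // min(512, 32) = 32
  std::printf("block = (%d, %d)\n", bx, by);         // block = (32, 32)
}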