未验证 提交 76b328bc 编写于 作者: R ronnywang 提交者: GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P2 (#56700)

上级 593a4428
...@@ -467,11 +467,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, ...@@ -467,11 +467,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
set_constant(ctx, &scale_tmp, static_cast<T>(1)); set_constant(ctx, &scale_tmp, static_cast<T>(1));
} }
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>(); const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512; const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks); int grid = std::min(C, max_blocks);
......
...@@ -106,9 +106,6 @@ void ComputeFullArg(const phi::GPUContext& dev_ctx, ...@@ -106,9 +106,6 @@ void ComputeFullArg(const phi::GPUContext& dev_ctx,
block_size = 32; block_size = 32;
else if (col > 8) else if (col > 8)
block_size = 16; block_size = 16;
#ifdef __HIPCC__
block_size = std::min(block_size, 256);
#endif
return block_size; return block_size;
}; };
......
...@@ -505,12 +505,7 @@ void CheckNumericsKernel(const Context& ctx, ...@@ -505,12 +505,7 @@ void CheckNumericsKernel(const Context& ctx,
// Print to the standard output. // Print to the standard output.
char* gpu_str_ptr = GetGpuHintStringPtr<T>(ctx, op_type, var_name, dev_id); char* gpu_str_ptr = GetGpuHintStringPtr<T>(ctx, op_type, var_name, dev_id);
#ifdef __HIPCC__
// HIP will throw GPU memory access fault if threads > 256
const size_t threads = 256;
#else
const size_t threads = 1024; const size_t threads = 1024;
#endif
size_t blocks = size_t blocks =
std::min(static_cast<size_t>(128), std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor.numel() + threads - 1) / threads)); static_cast<size_t>((tensor.numel() + threads - 1) / threads));
......
...@@ -174,11 +174,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx, ...@@ -174,11 +174,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims());
const int64_t remain = d / axis_dim; const int64_t remain = d / axis_dim;
#ifdef __HIPCC__
int block = 256;
#else
int block = 512; int block = 512;
#endif
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
// do not with softmax op, and input is softmax // do not with softmax op, and input is softmax
......
...@@ -90,11 +90,7 @@ __global__ void CrossEntropySoftLabel(T* loss, ...@@ -90,11 +90,7 @@ __global__ void CrossEntropySoftLabel(T* loss,
const int kDimCeil = 1 << log2_elements; const int kDimCeil = 1 << log2_elements;
const int kVSize = sizeof(VecT) / sizeof(T); const int kVSize = sizeof(VecT) / sizeof(T);
#ifdef __HIPCC__
const int kThreadPerBlock = 256;
#else
const int kThreadPerBlock = 512; const int kThreadPerBlock = 512;
#endif
const int kBatchPerBlock = 1; const int kBatchPerBlock = 1;
const int kWarpSize = 32; // (dim < 32) ? dim : 32; const int kWarpSize = 32; // (dim < 32) ? dim : 32;
const int kBatchSize = 1; const int kBatchSize = 1;
...@@ -718,11 +714,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, ...@@ -718,11 +714,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
int N, int N,
int dim, int dim,
int D) { int D) {
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512; constexpr int kMaxBlockDim = 512;
#endif
int64_t block_dim = dim >= kMaxBlockDim int64_t block_dim = dim >= kMaxBlockDim
? kMaxBlockDim ? kMaxBlockDim
: (1 << static_cast<int>(std::log2(dim))); : (1 << static_cast<int>(std::log2(dim)));
...@@ -799,11 +791,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, ...@@ -799,11 +791,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(dim)); const int kDimLog2 = static_cast<int>(Log2Ceil(dim));
const int kDimCeil = 1 << kDimLog2; const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512; int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1; int kBatchPerBlock = 1;
int blocks = (N * D + kBatchPerBlock - 1) / kBatchPerBlock; int blocks = (N * D + kBatchPerBlock - 1) / kBatchPerBlock;
...@@ -1308,11 +1296,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, ...@@ -1308,11 +1296,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(axis_dim)); const int kDimLog2 = static_cast<int>(Log2Ceil(axis_dim));
const int kDimCeil = 1 << kDimLog2; const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512; int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1; int kBatchPerBlock = 1;
int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock; int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock;
dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1); dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1);
......
...@@ -1256,16 +1256,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> { ...@@ -1256,16 +1256,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
thread = (output_width - 1) / 2 + 1; thread = (output_width - 1) / 2 + 1;
else if (output_width > 512 && output_width <= 1024) else if (output_width > 512 && output_width <= 1024)
thread = output_width; thread = output_width;
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min(std::max(thread / output_width, 1), output_height); blocks = std::min(std::max(thread / output_width, 1), output_height);
threads = dim3(std::min(output_width, thread), blocks, 1); threads = dim3(std::min(output_width, thread), blocks, 1);
grid = dim3(output_channels, batch_size, 1); grid = dim3(output_channels, batch_size, 1);
} else { } else {
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min( blocks = std::min(
std::max(thread / output_channels, 1), std::max(thread / output_channels, 1),
((output_width + dilate_width - 1) / dilate_width) * dilate_width); ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
...@@ -1276,11 +1270,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> { ...@@ -1276,11 +1270,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
int nums_output = output->numel(); int nums_output = output->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512; int block_size = 512;
#endif
int grid_size = (nums_output + block_size - 1) / block_size; int grid_size = (nums_output + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
...@@ -1449,11 +1439,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> { ...@@ -1449,11 +1439,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
int nums_input = input_grad->numel(); int nums_input = input_grad->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512; int block_size = 512;
#endif
int grid_size = (nums_input + block_size - 1) / block_size; int grid_size = (nums_input + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
......
...@@ -50,11 +50,7 @@ void FillDiagonalGradKernel(const Context& ctx, ...@@ -50,11 +50,7 @@ void FillDiagonalGradKernel(const Context& ctx,
int offset, int offset,
bool wrap, bool wrap,
DenseTensor* x_grad) { DenseTensor* x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512; const int64_t kMaxBlockDim = 512;
#endif
auto* in_data = ctx.template Alloc<T>(x_grad); auto* in_data = ctx.template Alloc<T>(x_grad);
phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
......
...@@ -50,11 +50,7 @@ void FillDiagonalKernel(const Context& ctx, ...@@ -50,11 +50,7 @@ void FillDiagonalKernel(const Context& ctx,
int offset, int offset,
bool wrap, bool wrap,
DenseTensor* out) { DenseTensor* out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512; const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out); phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T* out_data = ctx.template Alloc<T>(out); T* out_data = ctx.template Alloc<T>(out);
......
...@@ -48,11 +48,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx, ...@@ -48,11 +48,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
int dim1, int dim1,
int dim2, int dim2,
DenseTensor *x_grad) { DenseTensor *x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512; const int64_t kMaxBlockDim = 512;
#endif
auto matrows = 1; auto matrows = 1;
if (x_grad) { if (x_grad) {
......
...@@ -50,11 +50,7 @@ void FillDiagonalTensorKernel(const Context &ctx, ...@@ -50,11 +50,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
int dim1, int dim1,
int dim2, int dim2,
DenseTensor *out) { DenseTensor *out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512; const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out); phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T *out_data = ctx.template Alloc<T>(out); T *out_data = ctx.template Alloc<T>(out);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册