未验证 提交 76b328bc 编写于 作者: ronnywang 提交者: GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P2 (#56700)

上级 593a4428
......@@ -467,11 +467,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
set_constant(ctx, &scale_tmp, static_cast<T>(1));
}
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks);
......
......@@ -106,9 +106,6 @@ void ComputeFullArg(const phi::GPUContext& dev_ctx,
block_size = 32;
else if (col > 8)
block_size = 16;
#ifdef __HIPCC__
block_size = std::min(block_size, 256);
#endif
return block_size;
};
......
......@@ -505,12 +505,7 @@ void CheckNumericsKernel(const Context& ctx,
// Print to the standard output.
char* gpu_str_ptr = GetGpuHintStringPtr<T>(ctx, op_type, var_name, dev_id);
#ifdef __HIPCC__
// HIP will throw GPU memory access fault if threads > 256
const size_t threads = 256;
#else
const size_t threads = 1024;
#endif
size_t blocks =
std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor.numel() + threads - 1) / threads));
......
......@@ -174,11 +174,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims());
const int64_t remain = d / axis_dim;
#ifdef __HIPCC__
int block = 256;
#else
int block = 512;
#endif
auto stream = dev_ctx.stream();
// do not with softmax op, and input is softmax
......
......@@ -90,11 +90,7 @@ __global__ void CrossEntropySoftLabel(T* loss,
const int kDimCeil = 1 << log2_elements;
const int kVSize = sizeof(VecT) / sizeof(T);
#ifdef __HIPCC__
const int kThreadPerBlock = 256;
#else
const int kThreadPerBlock = 512;
#endif
const int kBatchPerBlock = 1;
const int kWarpSize = 32; // (dim < 32) ? dim : 32;
const int kBatchSize = 1;
......@@ -718,11 +714,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
int N,
int dim,
int D) {
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512;
#endif
int64_t block_dim = dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(dim)));
......@@ -799,11 +791,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(dim));
const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1;
int blocks = (N * D + kBatchPerBlock - 1) / kBatchPerBlock;
......@@ -1308,11 +1296,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
const int kDimLog2 = static_cast<int>(Log2Ceil(axis_dim));
const int kDimCeil = 1 << kDimLog2;
#ifdef __HIPCC__
int kThreadPerBlock = 256;
#else
int kThreadPerBlock = 512;
#endif
int kBatchPerBlock = 1;
int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock;
dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1);
......
......@@ -1256,16 +1256,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
thread = (output_width - 1) / 2 + 1;
else if (output_width > 512 && output_width <= 1024)
thread = output_width;
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min(std::max(thread / output_width, 1), output_height);
threads = dim3(std::min(output_width, thread), blocks, 1);
grid = dim3(output_channels, batch_size, 1);
} else {
#ifdef __HIPCC__
thread = std::min(thread, 256);
#endif
blocks = std::min(
std::max(thread / output_channels, 1),
((output_width + dilate_width - 1) / dilate_width) * dilate_width);
......@@ -1276,11 +1270,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
}
int filter_multiplier = output_channels / input_channels;
int nums_output = output->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512;
#endif
int grid_size = (nums_output + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \
......@@ -1449,11 +1439,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
}
int filter_multiplier = output_channels / input_channels;
int nums_input = input_grad->numel();
#ifdef __HIPCC__
int block_size = 256;
#else
int block_size = 512;
#endif
int grid_size = (nums_input + block_size - 1) / block_size;
#define check_case(c_filter_multiplier, c_stride, c_filter) \
......
......@@ -50,11 +50,7 @@ void FillDiagonalGradKernel(const Context& ctx,
int offset,
bool wrap,
DenseTensor* x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
auto* in_data = ctx.template Alloc<T>(x_grad);
phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
......
......@@ -50,11 +50,7 @@ void FillDiagonalKernel(const Context& ctx,
int offset,
bool wrap,
DenseTensor* out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T* out_data = ctx.template Alloc<T>(out);
......
......@@ -48,11 +48,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
int dim1,
int dim2,
DenseTensor *x_grad) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
auto matrows = 1;
if (x_grad) {
......
......@@ -50,11 +50,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
int dim1,
int dim2,
DenseTensor *out) {
#ifdef __HIPCC__
const int64_t kMaxBlockDim = 256;
#else
const int64_t kMaxBlockDim = 512;
#endif
phi::Copy(ctx, x, ctx.GetPlace(), false, out);
T *out_data = ctx.template Alloc<T>(out);
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册