From 5ac8c0401113424de5ae3b0c389c7a1f5f0f966d Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Mon, 22 May 2023 11:17:39 +0800
Subject: [PATCH] [Paddle Inference] Fix transfer_layout when input size is
 too big (#53881)

* fix transfer_layout when input size is too big

* do not add TransferLayoutKernelGPU

* add int64 and add check
---
 paddle/phi/kernels/funcs/math_function.cu    | 62 ++++++++++++++++----
 paddle/phi/kernels/funcs/math_function.h     |  7 ++-
 paddle/phi/kernels/transfer_layout_kernel.cc | 30 +++++++---
 3 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index 24d9248c371..5b9fecbd43b 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -31,8 +31,12 @@ namespace funcs {
 // Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: BSD-3-Clause
 template <typename T>
-__global__ void batch_transpose_kernel(
-    T* output, const T* input, const int batch, const int M, const int N) {
+__global__ void batch_transpose_kernel(T* output,
+                                       const T* input,
+                                       const int batch,
+                                       const int M,
+                                       const int N,
+                                       int swizzle) {
   const int num = M * N;
   // "+1" to avoid smem bank conflict
   __shared__ T shbuf[32 * (32 + 1)];
@@ -40,8 +44,8 @@ __global__ void batch_transpose_kernel(
   const int32_t wid = tid / 32;
   const int32_t lid = tid % 32;
   const int32_t batch_i = blockIdx.z;
-  const int32_t mi0 = blockIdx.y * 32;
-  const int32_t ni0 = blockIdx.x * 32;
+  const int32_t mi0 = (blockIdx.y * swizzle + blockIdx.x % swizzle) * 32;
+  const int32_t ni0 = blockIdx.x / swizzle * 32;
 
   const size_t input_idx = batch_i * num + (mi0 + wid) * N + ni0;
   const T* A = input + input_idx;
@@ -87,19 +91,55 @@ __global__ void batch_transpose_kernel(
 }
 
 template <typename T>
-void BatchTranspose(T* output, const T* input, int batch, int m, int n) {
-  dim3 grid((n + 31) / 32, (m + 31) / 32, batch);
+void BatchTranspose(T* output,
+                    const T* input,
+                    int64_t batch,
+                    int64_t m,
+                    int64_t n,
+                    const phi::GPUContext* dev_ctx) {
+  int64_t device_id = dev_ctx->GetPlace().GetDeviceId();
+  const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
+  int max_grid_y = prop.maxGridSize[1];
+  int64_t input_num = batch * m * n;
+
+  if (input_num >= std::numeric_limits<int32_t>::max()) {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Unsupported input size, batch: %ld, m: %ld, n: %ld", batch, m, n));
+  }
+
+  dim3 logical_grid((n + 31) / 32, (m + 31) / 32, batch);
   dim3 block(32, 8);
-  batch_transpose_kernel<<<grid, block>>>(output, input, batch, m, n);
+  // We set swizzle to at least 2 by default.
+  int swizzle = (logical_grid.y + max_grid_y - 1) / max_grid_y;
+  swizzle = std::max(swizzle, 2);
+  dim3 physical_grid(logical_grid.x * swizzle,
+                     (logical_grid.y + swizzle - 1) / swizzle,
+                     batch);
+  batch_transpose_kernel<<<physical_grid, block, 0, dev_ctx->stream()>>>(
+      output, input, batch, m, n, swizzle);
 }
 
 using float16 = phi::dtype::float16;
 using bfloat16 = phi::dtype::bfloat16;
 
-template void BatchTranspose(
-    float16* output, const float16* input, int batch, int m, int n);
-template void BatchTranspose(
-    float* output, const float* input, int batch, int m, int n);
+template void BatchTranspose(float16* output,
+                             const float16* input,
+                             int64_t batch,
+                             int64_t m,
+                             int64_t n,
+                             const phi::GPUContext* dev_ctx);
+template void BatchTranspose(float* output,
+                             const float* input,
+                             int64_t batch,
+                             int64_t m,
+                             int64_t n,
+                             const phi::GPUContext* dev_ctx);
+template void BatchTranspose(bfloat16* output,
+                             const bfloat16* input,
+                             int64_t batch,
+                             int64_t m,
+                             int64_t n,
+                             const phi::GPUContext* dev_ctx);
 
 template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
 template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
index d2de413dad5..e2824cf4e26 100644
--- a/paddle/phi/kernels/funcs/math_function.h
+++ b/paddle/phi/kernels/funcs/math_function.h
@@ -26,7 +26,12 @@ namespace phi {
 namespace funcs {
 
 template <typename T>
-void BatchTranspose(T* output, const T* input, int batch, int m, int n);
+void BatchTranspose(T* output,
+                    const T* input,
+                    int64_t batch,
+                    int64_t m,
+                    int64_t n,
+                    const phi::GPUContext* dev_ctx);
 
 template <typename DeviceContext, typename T>
 struct TransposeNormal {
diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc
index 3d0b5950bc9..3dec79b13b5 100644
--- a/paddle/phi/kernels/transfer_layout_kernel.cc
+++ b/paddle/phi/kernels/transfer_layout_kernel.cc
@@ -71,15 +71,16 @@ void TransferLayoutGeneral(const Context& dev_ctx,
 
   out->Resize(phi::make_ddim(dst_dim));
   dev_ctx.Alloc(out, x.dtype());
-
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // In GPU fp16 model, we will insert many transfer_layout ops in
   // conv2d_fusion_layout_transfer_pass, so we optimize this kernel on GPU
   if (std::is_same<Context, phi::GPUContext>::value) {
     std::vector<int> axis_nchw_nhwc = {0, 2, 3, 1};
     std::vector<int> axis_nhwc_nchw = {0, 3, 1, 2};
-    const int batch = src_dim[0];
-    int row_len = src_dim[1];
-    int col_len = src_dim[2] * src_dim[3];
+    auto* gpu_ctx = reinterpret_cast<const phi::GPUContext*>(&dev_ctx);
+    const int64_t batch = src_dim[0];
+    int64_t row_len = src_dim[1];
+    int64_t col_len = src_dim[2] * src_dim[3];
     if (axis == axis_nhwc_nchw) {
       row_len = src_dim[1] * src_dim[2];
       col_len = src_dim[3];
@@ -89,15 +90,28 @@ void TransferLayoutGeneral(const Context& dev_ctx,
                             x.data<phi::dtype::float16>(),
                             batch,
                             row_len,
-                            col_len);
+                            col_len,
+                            gpu_ctx);
       return;
     } else if (x.dtype() == phi::DataType::FLOAT32) {
-      funcs::BatchTranspose(
-          out->data<float>(), x.data<float>(), batch, row_len, col_len);
+      funcs::BatchTranspose(out->data<float>(),
+                            x.data<float>(),
+                            batch,
+                            row_len,
+                            col_len,
+                            gpu_ctx);
+      return;
+    } else if (x.dtype() == phi::DataType::BFLOAT16) {
+      funcs::BatchTranspose(out->data<phi::dtype::bfloat16>(),
+                            x.data<phi::dtype::bfloat16>(),
+                            batch,
+                            row_len,
+                            col_len,
+                            gpu_ctx);
       return;
     }
   }
-
+#endif
   PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] {
                        CastDataLayout<data_t, Context>(dev_ctx, x, axis, out);
                      }));
--
GitLab
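
Reviewer note (not part of the patch): the math_function.cu hunk folds the logical tile grid's y-dimension into x so that gridDim.y never exceeds prop.maxGridSize[1], and each block recovers its logical tile from blockIdx via the swizzle arithmetic in batch_transpose_kernel. The host-side C++ sketch below mirrors that arithmetic under assumed example values (a 7 x 100000 grid of 32x32 tiles, max_grid_y = 65535 standing in for maxGridSize[1]) and checks that every logical tile is still reached; the early `continue` stands in for the bounds checks the kernel is assumed to apply to tiles past the logical range.

// Sketch only: replays the swizzled-grid mapping used by batch_transpose_kernel.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <set>
#include <utility>

int main() {
  // Illustrative values (assumptions, not from the patch): logical tile grid
  // dimensions and the common 65535 limit on gridDim.y.
  const int64_t grid_x = 7;       // ceil(n / 32), tiles along columns
  const int64_t grid_y = 100000;  // ceil(m / 32), tiles along rows
  const int max_grid_y = 65535;   // stands in for prop.maxGridSize[1]

  // Same arithmetic as BatchTranspose in the diff: swizzle is at least 2 and
  // large enough that the physical y-dimension fits the hardware limit.
  int swizzle = static_cast<int>((grid_y + max_grid_y - 1) / max_grid_y);
  swizzle = std::max(swizzle, 2);
  const int64_t phys_x = grid_x * swizzle;
  const int64_t phys_y = (grid_y + swizzle - 1) / swizzle;
  assert(phys_y <= max_grid_y);

  // Replay the kernel's mi0/ni0 mapping (divided by the 32-wide tile) and
  // confirm every logical tile is visited.
  std::set<std::pair<int64_t, int64_t>> seen;
  for (int64_t by = 0; by < phys_y; ++by) {
    for (int64_t bx = 0; bx < phys_x; ++bx) {
      const int64_t mi = by * swizzle + bx % swizzle;  // logical row tile
      const int64_t ni = bx / swizzle;                 // logical column tile
      if (mi >= grid_y) continue;  // assumed masked by the kernel's bounds checks
      seen.insert({mi, ni});
    }
  }
  std::printf("swizzle=%d, physical grid %lld x %lld, covered %zu of %lld tiles\n",
              swizzle,
              static_cast<long long>(phys_x),
              static_cast<long long>(phys_y),
              seen.size(),
              static_cast<long long>(grid_x * grid_y));
  assert(static_cast<int64_t>(seen.size()) == grid_x * grid_y);
  return 0;
}

The floor of 2 on swizzle comes straight from the diff; the ceil division alone already keeps physical_grid.y within the limit, so the minimum of 2 reads as a performance choice rather than a correctness requirement.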