From 7f02407281ec35e3ee20ed029f8651df7e5531ef Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Mon, 30 May 2022 12:35:52 +0800
Subject: [PATCH] perf(dnn): speed up pad kernel

GitOrigin-RevId: 33db700687c04ee4d813e3b3fa24b6e35de4036c
---
 dnn/src/cuda/padding/opr_impl.cpp | 21 ++++-
 dnn/src/cuda/padding/padding.cu   | 99 +++++++++++++++++++++++++++++++
 dnn/src/cuda/padding/padding.cuh  |  5 ++
 3 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/dnn/src/cuda/padding/opr_impl.cpp b/dnn/src/cuda/padding/opr_impl.cpp
index 2565a9b0c..b766ea663 100644
--- a/dnn/src/cuda/padding/opr_impl.cpp
+++ b/dnn/src/cuda/padding/opr_impl.cpp
@@ -7,6 +7,10 @@
 namespace megdnn {
 namespace cuda {
 
+bool is_conv_pad(size_t offsets[MEGDNN_MAX_NDIM * 2]) {
+    return (offsets[0] == offsets[2] && offsets[0] == 0);
+}
+
 void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     forward_check_exec(src.layout, dst.layout);
     SmallVector<size_t> offsets(get_offsets());
@@ -16,6 +20,18 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
             offsets[5],  offsets[6],  offsets[7],  offsets[8],  offsets[9],
             offsets[10], offsets[11], offsets[12], offsets[13]};
     auto stream = cuda_stream(this->handle());
+    if (src.layout.ndim == 4 && is_conv_pad(param_offsets)) {
+#define cb(DType)                                                         \
+    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {           \
+        using ctype = typename DTypeTrait<DType>::ctype;                  \
+        padding::pad4d_forward_proxy<ctype>(                              \
+                src, dst, param_offsets, uint32_t(param().padding_mode),  \
+                param().padding_val, stream);                             \
+    }
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+        MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
+#undef cb
+    } else {
 #define cb(DType)                                                         \
     if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {           \
         using ctype = typename DTypeTrait<DType>::ctype;                  \
@@ -23,9 +39,10 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
                 src, dst, param_offsets, uint32_t(param().padding_mode),  \
                 param().padding_val, stream);                             \
     }
-    MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
-    MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
+        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+        MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
+    }
 }
 
 void PaddingBackwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
diff --git a/dnn/src/cuda/padding/padding.cu b/dnn/src/cuda/padding/padding.cu
index 49253cbb0..35a09c268 100644
--- a/dnn/src/cuda/padding/padding.cu
+++ b/dnn/src/cuda/padding/padding.cu
@@ -1,3 +1,4 @@
+#include <thrust/pair.h>
 #include 
 #include 
 #include 
@@ -94,6 +95,55 @@ __global__ void paddingReflect_kernel(
     }
 }
 
+__device__ inline thrust::pair<int64_t, int64_t> get_index_mapping2d(
+        int64_t input_dim_x, int64_t input_dim_y, int64_t output_dim_x,
+        int64_t output_dim_y, int64_t pad_l, int64_t pad_t, int64_t output_xy,
+        int y_shift, int z_shift, int nplane) {
+    // 3D grid of 1D blocks
+    auto input_offset = ((blockIdx.y + y_shift) + (blockIdx.z + z_shift) * nplane) *
+                        input_dim_x * input_dim_y;
+    auto output_offset = ((blockIdx.y + y_shift) + (blockIdx.z + z_shift) * nplane) *
+                         output_dim_x * output_dim_y;
+
+    auto output_x = output_xy % output_dim_x;
+    auto output_y = output_xy / output_dim_x;
+
+    auto i_start_x = ::max(int64_t(0), -pad_l);
+    auto i_start_y = ::max(int64_t(0), -pad_t);
+    auto o_start_x = ::max(int64_t(0), pad_l);
+    auto o_start_y = ::max(int64_t(0), pad_t);
+
+    auto input_x = ::abs(output_x - pad_l) -
+                   ::abs(output_x - (input_dim_x + pad_l - 1)) - output_x + 2 * pad_l +
+                   input_dim_x - 1 - o_start_x + i_start_x;
+
+    auto input_y = ::abs(output_y - pad_t) -
+                   ::abs(output_y - (input_dim_y + pad_t - 1)) - output_y + 2 * pad_t +
+                   input_dim_y - 1 - o_start_y + i_start_y;
+
+    return thrust::make_pair(
+            input_offset + input_y * input_dim_x + input_x,
+            output_offset + output_y * output_dim_x + output_x);
+}
+
+template <typename T>
+__global__ void reflection_pad4d_kernel(
+        const T* const input, T* const output, int64_t input_dim_x,
+        int64_t input_dim_y, int pad_t, int pad_b, int pad_l, int pad_r,
+        int y_shift, int z_shift, int nplane) {
+    auto output_xy = threadIdx.x + blockIdx.x * blockDim.x;
+    auto output_dim_x = input_dim_x + pad_l + pad_r;
+    auto output_dim_y = input_dim_y + pad_t + pad_b;
+
+    if (output_xy < output_dim_x * output_dim_y) {
+        auto index_pair = get_index_mapping2d(
+                input_dim_x, input_dim_y, output_dim_x, output_dim_y, pad_l, pad_t,
+                output_xy, y_shift, z_shift, nplane);
+
+        output[index_pair.second] = input[index_pair.first];
+    }
+}
+
 template <typename T>
 __global__ void paddingConstBackward_kernel(
         const size_t ndim, const size_t total_in_nr, const T* const src, T* const dst,
@@ -198,6 +248,44 @@ void padding_forward_proxy(
     after_kernel_launch();
 }
 
+template <typename T>
+void pad4d_forward_proxy(
+        const TensorND& src, const TensorND& dst, size_t offsets[MEGDNN_MAX_NDIM * 2],
+        uint32_t mode, const float_t padding_val, cudaStream_t stream) {
+    if (mode == param_enumv::Padding::PaddingMode::REFLECT) {
+        size_t pad_t = offsets[4];
+        size_t pad_b = offsets[5];
+        size_t pad_l = offsets[6];
+        size_t pad_r = offsets[7];
+
+        size_t nbatch = src.layout.shape[0];
+        size_t nplane = src.layout.shape[1];
+        size_t input_h = src.layout.shape[2];
+        size_t input_w = src.layout.shape[3];
+
+        size_t output_plane_size = dst.layout.shape[2] * dst.layout.shape[3];
+        dim3 block_size(output_plane_size > 256 ? 256 : output_plane_size);
+
+        for (size_t block_y = 0; block_y < nplane; block_y += 65535) {
+            size_t block_y_size =
+                    std::min(nplane - block_y, static_cast<size_t>(65535));
+            for (size_t block_z = 0; block_z < nbatch; block_z += 65535) {
+                size_t block_z_size =
+                        std::min(nbatch - block_z, static_cast<size_t>(65535));
+                dim3 grid_size(
+                        DIVUP(output_plane_size, static_cast<size_t>(256)),
+                        block_y_size, block_z_size);
+                reflection_pad4d_kernel<<<grid_size, block_size, 0, stream>>>(
+                        src.ptr<T>(), dst.ptr<T>(), input_w, input_h, pad_t, pad_b,
+                        pad_l, pad_r, block_y, block_z, nplane);
+            }
+        }
+        after_kernel_launch();
+    } else {
+        padding_forward_proxy<T>(src, dst, offsets, mode, padding_val, stream);
+    }
+}
+
 template <typename T>
 void padding_backward_proxy(
         const TensorND& src, const TensorND& dst, size_t offsets[MEGDNN_MAX_NDIM * 2],
@@ -250,6 +338,17 @@
 MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
 #undef INST
+#define INST(T)                                                       \
+    template void pad4d_forward_proxy<T>(                             \
+            const TensorND& src, const TensorND& dst,                 \
+            size_t offsets[MEGDNN_MAX_NDIM * 2], uint32_t mode,       \
+            const float_t padding_val, cudaStream_t stream);
+#define cb(DType) INST(typename DTypeTrait<DType>::ctype)
+MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
+MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
+#undef cb
+#undef INST
+
 #define INST(T)                                                       \
     template void padding_backward_proxy<T>(                          \
             const TensorND& src, const TensorND& dst,                 \
diff --git a/dnn/src/cuda/padding/padding.cuh b/dnn/src/cuda/padding/padding.cuh
index a898a01c6..7994efa96 100644
--- a/dnn/src/cuda/padding/padding.cuh
+++ b/dnn/src/cuda/padding/padding.cuh
@@ -13,6 +13,11 @@ void padding_forward_proxy(
         const TensorND& src, const TensorND& dst, size_t offsets[MEGDNN_MAX_NDIM * 2],
         uint32_t mode, const float_t padding_val, cudaStream_t stream);
 
+template <typename T>
+void pad4d_forward_proxy(
+        const TensorND& src, const TensorND& dst, size_t offsets[MEGDNN_MAX_NDIM * 2],
+        uint32_t mode, const float_t padding_val, cudaStream_t stream);
+
 template <typename T>
 void padding_backward_proxy(
         const TensorND& src, const TensorND& dst, size_t offsets[MEGDNN_MAX_NDIM * 2],
-- 
GitLab
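
Note on the index arithmetic (not part of the patch): get_index_mapping2d folds the three regions of REFLECT padding (mirrored left/top border, interior, mirrored right/bottom border) into one branch-free expression built from two absolute values, so every thread computes its source index without divergence. Below is a minimal host-side C++ sketch checking that equivalence against a piecewise reference. It assumes non-negative padding (as here, where the offsets are size_t) and pad < size (the usual REFLECT-mode requirement); the helper names reflect_reference and reflect_closed_form are illustrative only and do not exist in MegDNN.

    // Sanity-check sketch: closed-form reflection index vs. piecewise reference.
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // Piecewise definition of reflection padding (border element not repeated).
    int64_t reflect_reference(int64_t out, int64_t pad, int64_t size) {
        if (out < pad)
            return pad - out;                 // mirrored left/top border
        if (out < pad + size)
            return out - pad;                 // interior: plain shift
        return 2 * size + pad - 2 - out;      // mirrored right/bottom border
    }

    // Closed form copied from the kernel, with pad >= 0 so i_start = 0 and
    // o_start = pad.
    int64_t reflect_closed_form(int64_t out, int64_t pad, int64_t size) {
        return std::abs(out - pad) - std::abs(out - (size + pad - 1)) - out +
               2 * pad + size - 1 - /*o_start=*/pad + /*i_start=*/0;
    }

    int main() {
        for (int64_t size = 2; size <= 8; ++size)
            for (int64_t pad = 0; pad < size; ++pad)       // REFLECT needs pad < size
                for (int64_t out = 0; out < size + 2 * pad; ++out)
                    assert(reflect_closed_form(out, pad, size) ==
                           reflect_reference(out, pad, size));
        return 0;
    }

Each kernel thread handles one output pixel of one (plane, batch) slice selected via blockIdx.y/blockIdx.z, which is why pad4d_forward_proxy launches the kernel in chunks of at most 65535 along the grid's y and z dimensions (the CUDA limit for those dimensions).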