From e661ae904f82a70fee0fb59fbade1f245c60768a Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 21 May 2021 10:12:28 +0800
Subject: [PATCH] feat(dnn/cuda): add base class for cutlass uint4 and int4
 algos

GitOrigin-RevId: a4d42f032c7e53f2966016092ba52c091575be77
---
 dnn/src/cuda/conv_bias/algo.h                 | 130 ++++++----
 .../implicit_gemm_int4_int4_nchw64_imma.cpp   | 207 ++++-----------
 .../implicit_gemm_int4_nchw64_imma_base.cpp   | 149 +++++++++++
 .../implicit_gemm_uint4_int4_nchw64_imma.cpp  | 243 ++++++------------
 dnn/src/cuda/conv_bias/opr_impl.h             |   1 +
 5 files changed, 366 insertions(+), 364 deletions(-)
 create mode 100644 dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp

diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index aa74e6227..23d0f1890 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -765,7 +765,7 @@ private:
     std::string m_name;
 };
 
-class ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm final
+class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
         : public AlgoBase {
 public:
     struct AlgoParam {
@@ -776,89 +776,121 @@ public:
         int warp_n;
         int warp_k;
     };
+
+    AlgoInt4NCHW64IMMAImplicitGemmBase(AlgoParam algo_param)
+            : m_algo_param(algo_param) {}
+
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
+    }
+    const char* name() const override { return m_name.c_str(); }
+    std::string param() const override;
+
+    bool is_available(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+
+    std::string to_string(AlgoParam algo_param);
+
+protected:
+    virtual DTypeEnum src_dtype() const = 0;
+
+    // return filter_ptr, bias_ptr
+    virtual std::tuple<void*, void*> prepare_filter_bias(
+            const ExecArgs& args) const = 0;
+
+    // return alpha, beta, gamma, delta, theta
+    virtual std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const = 0;
+
+    virtual void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
+                         void* z_ptr, convolution::ConvParam kern_param,
+                         uint32_t nonlinear_mode, float alpha, float beta,
+                         float gamma, float delta, float theta,
+                         cudaStream_t stream) const = 0;
+
+    void reorder_filter(const ExecArgs& args, void* reordered_filter) const;
+
+    std::string m_name;
+    AlgoParam m_algo_param;
+};
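+
+// The two concrete algos below only pick the source dtype and implement the
+// prepare_filter_bias / get_constants / do_exec hooks declared above; the
+// availability checks, filter reordering and kernel-parameter setup are
+// shared in this base class.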
+
+class ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm final
+        : public AlgoInt4NCHW64IMMAImplicitGemmBase {
+public:
+    using Base = AlgoInt4NCHW64IMMAImplicitGemmBase;
+    using AlgoParam = Base::AlgoParam;
+
     AlgoInt4Int4NCHW64IMMAImplicitGemm(AlgoParam algo_param)
-            : m_algo_param{algo_param} {
+            : Base{algo_param} {
         m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                 ssprintf("INT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s",
                          to_string(m_algo_param).c_str()),
                 ConvBias::DirectParam{});
     }
-    bool is_available(const SizeArgs& args) const override;
+
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
-    void exec(const ExecArgs& args) const override;
-    const char* name() const override { return m_name.c_str(); }
-    AlgoAttribute attribute() const override {
-        return AlgoAttribute::REPRODUCIBLE;
-    }
-    static std::string to_string(AlgoParam algo_param);
     size_t get_preprocess_workspace_in_bytes(
             const SizeArgs& args) const override;
     SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
     void exec_preprocess(const ExecArgs& args) const override;
-    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4)
-    std::string param() const override {
-        std::string ret;
-        serialize_write_pod(m_algo_param, ret);
-        return ret;
-    }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4)
 
 private:
-    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
-                                         const SizeArgs& args) const;
+    DTypeEnum src_dtype() const override { return DTypeEnum::QuantizedS4; }
 
-    AlgoParam m_algo_param;
-    std::string m_name;
+    std::tuple<void*, void*> prepare_filter_bias(
+            const ExecArgs& args) const override;
+
+    std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const override;
+
+    void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
+                 void* z_ptr, convolution::ConvParam kern_param,
+                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
+                 float delta, float theta, cudaStream_t stream) const override;
 };
 
 class ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm final
-        : public AlgoBase {
+        : public AlgoInt4NCHW64IMMAImplicitGemmBase {
 public:
-    struct AlgoParam {
-        int threadblock_m;
-        int threadblock_n;
-        int threadblock_k;
-        int warp_m;
-        int warp_n;
-        int warp_k;
-    };
+    using Base = AlgoInt4NCHW64IMMAImplicitGemmBase;
+    using AlgoParam = Base::AlgoParam;
+
     AlgoUInt4Int4NCHW64IMMAImplicitGemm(AlgoParam algo_param)
-            : m_algo_param{algo_param} {
+            : Base{algo_param} {
         m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                 ssprintf("UINT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s",
                          to_string(m_algo_param).c_str()),
                 ConvBias::DirectParam{});
     }
-    bool is_available(const SizeArgs& args) const override;
+
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
-    void exec(const ExecArgs& args) const override;
-    const char* name() const override { return m_name.c_str(); }
-    AlgoAttribute attribute() const override {
-        return AlgoAttribute::REPRODUCIBLE;
-    }
-    static std::string to_string(AlgoParam algo_param);
     size_t get_preprocess_workspace_in_bytes(
             const SizeArgs& args) const override;
     SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
     void exec_preprocess(const ExecArgs& args) const override;
-    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4)
-    std::string param() const override {
-        std::string ret;
-        serialize_write_pod(m_algo_param, ret);
-        return ret;
-    }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4)
 
 private:
-    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
-                                         const SizeArgs& args) const;
-    void reorder_filter_bias(const ExecArgs& args, void* reduce_filter,
-                             void* reordered_filter,
-                             void* reordered_bias) const;
-    AlgoParam m_algo_param;
-    std::string m_name;
+    DTypeEnum src_dtype() const override { return DTypeEnum::Quantized4Asymm; }
+
+    std::tuple<void*, void*> prepare_filter_bias(
+            const ExecArgs& args) const override;
+
+    std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const override;
+
+    void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
+                 void* z_ptr, convolution::ConvParam kern_param,
+                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
+                 float delta, float theta, cudaStream_t stream) const override;
+
+    void update_bias(const ExecArgs& args, void* updated_bias,
+                     void* reduce_filter_ptr, void* reduce_workspace) const;
 };
 #endif
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
index 50ead1514..d2a8d0c38 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
@@ -11,117 +11,59 @@
  */
 
 #include "./algo.h"
-#include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
-#include "src/cuda/convolution_helper/parameter.cuh"
-#include "src/cuda/utils.h" using namespace megdnn; using namespace cuda; using namespace convolution; #if CUDA_VERSION >= 10020 -bool ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::is_available( +size_t +ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::get_workspace_in_bytes( const SizeArgs& args) const { - if (args.bias_layout->ndim <= 0) - return false; - - using Param = param::ConvBias; - using Format = Param::Format; - using Sparse = Param::Sparse; - using Mode = Param::Mode; - using NonlineMode = megdnn::param::ConvBias::NonlineMode; - - auto&& param = args.opr->param(); - - if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) - return false; - - if (param.format != Format::NCHW64 || param.sparse != Sparse::DENSE || - param.mode != Mode::CROSS_CORRELATION) - return false; - - if (param.nonlineMode != NonlineMode::IDENTITY && - param.nonlineMode != NonlineMode::RELU && - param.nonlineMode != NonlineMode::H_SWISH) - return false; - - if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || - args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || - args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 || - args.dst_layout->dtype.enumv() != DTypeEnum::QuantizedS4) - return false; - - if (!is_compute_capability_required(7, 5)) - return false; - - return true; -} - -WorkspaceBundle -ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::get_workspace_bundle( - dt_byte* raw_ptr, const SizeArgs& args) const { if (args.preprocessed_filter) { - return WorkspaceBundle{raw_ptr, {}}; + return 0; } else { - size_t ws_filter = args.filter_layout->span().dist_byte(); - return WorkspaceBundle{raw_ptr, {ws_filter}}; + return args.filter_layout->span().dist_byte(); } } -size_t -ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::get_workspace_in_bytes( - const SizeArgs& args) const { - return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +size_t ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm:: + get_preprocess_workspace_in_bytes(const SizeArgs& args) const { + return 0; +} + +SmallVector ConvBiasForwardImpl:: + AlgoInt4Int4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout( + const SizeArgs& args) const { + return {args.filter_layout->collapse_contiguous()}; } -void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec( +void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess( + const ExecArgs& args) const { + megdnn_assert(args.preprocessed_filter->tensors.size() == 1); + void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; + reorder_filter(args, filter_ptr); +} + +std::tuple +ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::prepare_filter_bias( const ExecArgs& args) const { - auto&& param = args.opr->param(); - auto&& fm = args.filter_meta; - size_t n = args.src_layout->operator[](0), - ci = args.src_layout->operator[](1) * 64, - hi = args.src_layout->operator[](2), - wi = args.src_layout->operator[](3); - size_t co = args.dst_layout->operator[](1) * 64, - ho = args.dst_layout->operator[](2), - wo = args.dst_layout->operator[](3); - UNPACK_CONV_PARAMETER(fm, param); - MARK_USED_VAR - auto&& stream = cuda_stream(args.opr->handle()); - - int8_t* filter_ptr = nullptr; - if (args.preprocessed_filter == nullptr) { - filter_ptr = reinterpret_cast(args.workspace.raw_ptr); - // reformat filter from nchw64 to chwn64 - TensorLayout src{{co, ci / 64, fh, fw, 64}, dtype::QuantizedS4()}; - src.init_contiguous_stride(); - TensorLayout dst = src; - dst.stride[0] = 64; - 
-        dst.stride[1] = co * fh * fw * 64;
-        dst.stride[2] = co * fw * 64;
-        dst.stride[3] = co * 64;
-        dst.stride[4] = 1;
-        TensorND ts_src, ts_dst;
-        ts_src.raw_ptr = args.filter_tensor->raw_ptr;
-        ts_src.layout = src;
-        ts_dst.raw_ptr = args.workspace.raw_ptr;
-        ts_dst.layout = dst;
-        auto&& transpose =
-                args.opr->handle()->create_operator<RelayoutForward>();
-        transpose->exec(ts_src, ts_dst);
+    void* filter_ptr = nullptr;
+    if (args.preprocessed_filter) {
+        megdnn_assert(args.preprocessed_filter->tensors.size() == 1);
+        filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
     } else {
-        filter_ptr = reinterpret_cast<int8_t*>(
-                args.preprocessed_filter->tensors[0].raw_ptr);
+        filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
+        reorder_filter(args, filter_ptr);
     }
+    void* bias_ptr = args.bias_tensor->raw_ptr;
+    return {filter_ptr, bias_ptr};
+}
 
-    ConvParam kern_param;
-    kern_param.n = n, kern_param.co = co, kern_param.ci = ci,
-    kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho,
-    kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw,
-    kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh,
-    kern_param.fw = fw;
-
+std::tuple<float, float, float, float, float>
+ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::get_constants(
+        const ExecArgs& args) const {
     float src_scale = args.src_layout->dtype.param<dtype::QuantizedS4>().scale,
           filter_scale =
                   args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
@@ -130,78 +72,37 @@ void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec(
           dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
 
     float alpha = src_scale * filter_scale / dst_scale,
-          beta = bias_scale / dst_scale;
+          beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;
 
-    int8_t* z_dev_ptr = nullptr;
-    float gamma = 0.f;
     if (args.z_layout->ndim > 0) {
-        z_dev_ptr = reinterpret_cast<int8_t*>(args.z_tensor->raw_ptr);
         float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
         gamma = z_scale / dst_scale;
     }
 
-    uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
-
-    cutlass_wrapper::do_conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64<
-            true>(
-            reinterpret_cast<int8_t*>(args.src_tensor->raw_ptr), filter_ptr,
-            args.bias_tensor->compatible_ptr<dt_int32>(), z_dev_ptr,
-            reinterpret_cast<int8_t*>(args.dst_tensor->raw_ptr), nullptr,
-            kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
-            cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
-                                       m_algo_param.threadblock_n,
-                                       m_algo_param.threadblock_k},
-            cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
-                                       m_algo_param.warp_k},
-            stream);
+    return {alpha, beta, gamma, delta, theta};
 }
 
-std::string ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::to_string(
-        AlgoParam algo_param) {
-    return ssprintf("%uX%uX%u_%uX%uX%u", algo_param.threadblock_m,
-                    algo_param.threadblock_n, algo_param.threadblock_k,
-                    algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
-}
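+// do_exec only casts the prepared pointers and launches the cutlass kernel;
+// the base class exec() has already reordered the filter and computed the
+// epilogue constants passed in here.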
SizeArgs& args) const { - return {args.filter_layout->collapse_contiguous()}; -} + cutlass_wrapper::GemmCoord warp_shape{ + m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}; -void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess( - const ExecArgs& args) const { - auto&& param = args.opr->param(); - auto&& fm = args.filter_meta; - size_t n = args.src_layout->operator[](0), - ci = args.src_layout->operator[](1) * 64, - hi = args.src_layout->operator[](2), - wi = args.src_layout->operator[](3); - size_t co = args.dst_layout->operator[](1) * 64, - ho = args.dst_layout->operator[](2), - wo = args.dst_layout->operator[](3); - UNPACK_CONV_PARAMETER(fm, param); - MARK_USED_VAR - TensorLayout src{{co, ci / 64, fh, fw, 64}, dtype::QuantizedS4()}; - src.init_contiguous_stride(); - TensorLayout dst = src; - dst.stride[0] = 64; - dst.stride[1] = co * fh * fw * 64; - dst.stride[2] = co * fw * 64; - dst.stride[3] = co * 64; - dst.stride[4] = 1; - TensorND ts_src, ts_dst; - ts_src.raw_ptr = args.filter_tensor->raw_ptr; - ts_src.layout = src; - ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr; - ts_dst.layout = dst; - auto&& transpose = args.opr->handle()->create_operator(); - transpose->exec(ts_src, ts_dst); + cutlass_wrapper::do_conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64< + true>(reinterpret_cast(args.src_tensor->raw_ptr), + reinterpret_cast(filter_ptr), + reinterpret_cast(bias_ptr), + reinterpret_cast(z_ptr), + reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + threadblock_shape, warp_shape, stream); } #endif diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp new file mode 100644 index 000000000..033359674 --- /dev/null +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp @@ -0,0 +1,149 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */
+
+#include "./algo.h"
+#include "src/common/conv_bias.h"
+#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
+#include "src/cuda/conv_bias/reduce_filter.cuh"
+#include "src/cuda/convolution_helper/parameter.cuh"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace convolution;
+
+#if CUDA_VERSION >= 10020
+std::string ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::param()
+        const {
+    std::string ret;
+    serialize_write_pod(m_algo_param, ret);
+    return ret;
+}
+
+bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
+        const SizeArgs& args) const {
+    if (args.bias_layout->ndim <= 0)
+        return false;
+
+    using Param = param::ConvBias;
+    using Format = Param::Format;
+    using Sparse = Param::Sparse;
+    using Mode = Param::Mode;
+    using NonlineMode = megdnn::param::ConvBias::NonlineMode;
+
+    auto&& param = args.opr->param();
+
+    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
+        return false;
+
+    if (param.format != Format::NCHW64 || param.sparse != Sparse::DENSE ||
+        param.mode != Mode::CROSS_CORRELATION)
+        return false;
+
+    if (param.nonlineMode != NonlineMode::IDENTITY &&
+        param.nonlineMode != NonlineMode::RELU &&
+        param.nonlineMode != NonlineMode::H_SWISH)
+        return false;
+
+    if (args.src_layout->dtype.enumv() != src_dtype() ||
+        args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
+        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
+        args.dst_layout->dtype.enumv() != src_dtype())
+        return false;
+
+    if (!is_compute_capability_required(7, 5))
+        return false;
+
+    return true;
+}
+
+void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
+        const ExecArgs& args) const {
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    size_t n = args.src_layout->operator[](0),
+           ci = args.src_layout->operator[](1) * 64,
+           hi = args.src_layout->operator[](2),
+           wi = args.src_layout->operator[](3);
+    size_t co = args.dst_layout->operator[](1) * 64,
+           ho = args.dst_layout->operator[](2),
+           wo = args.dst_layout->operator[](3);
+    UNPACK_CONV_PARAMETER(fm, param);
+    MARK_USED_VAR
+
+    void* filter_ptr = nullptr;
+    void* bias_ptr = nullptr;
+    void* z_ptr = nullptr;
+
+    std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args);
+    if (args.z_layout->ndim > 0)
+        z_ptr = args.z_tensor->raw_ptr;
+
+    float alpha, beta, gamma, delta, theta;
+    std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);
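+    // The five constants parameterize the kernel epilogue; roughly
+    //   dst = act(alpha * conv(src, filter) + beta * bias + gamma * z + delta) + theta,
+    // where delta and theta stay zero for the symmetric int4 algo and carry
+    // the z / dst zero-point corrections for the uint4 algo (the exact
+    // ordering lives in the cutlass wrapper).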
+
+    ConvParam kern_param;
+    kern_param.n = n, kern_param.co = co, kern_param.ci = ci,
+    kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho,
+    kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw,
+    kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh,
+    kern_param.fw = fw;
+
+    uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
+
+    cudaStream_t stream = cuda_stream(args.opr->handle());
+
+    do_exec(args, filter_ptr, bias_ptr, z_ptr, kern_param, nonlinear_mode,
+            alpha, beta, gamma, delta, theta, stream);
+}
+
+std::string ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::to_string(
+        AlgoParam algo_param) {
+    return ssprintf("%uX%uX%u_%uX%uX%u", algo_param.threadblock_m,
+                    algo_param.threadblock_n, algo_param.threadblock_k,
+                    algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
+}
+
+void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::reorder_filter(
+        const ExecArgs& args, void* reordered_filter) const {
+    auto&& param = args.opr->param();
+    auto&& fm = args.filter_meta;
+    size_t n = args.src_layout->operator[](0),
+           ci = args.src_layout->operator[](1) * 64,
+           hi = args.src_layout->operator[](2),
+           wi = args.src_layout->operator[](3);
+    size_t co = args.dst_layout->operator[](1) * 64,
+           ho = args.dst_layout->operator[](2),
+           wo = args.dst_layout->operator[](3);
+    UNPACK_CONV_PARAMETER(fm, param);
+    MARK_USED_VAR;
+
+    // filter: KCRS64 => CRSK64
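+    // The transpose is expressed purely through strides: dst keeps the
+    // {K, C/64, R, S, 64} shape, but its strides walk memory in CRSK64
+    // order, so a single RelayoutForward pass emits the reordered filter.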
+    TensorLayout src{{co, ci / 64, fh, fw, 64}, dtype::QuantizedS4()};
+    src.init_contiguous_stride();
+    TensorLayout dst = src;
+    dst.stride[0] = 64;
+    dst.stride[1] = co * fh * fw * 64;
+    dst.stride[2] = co * fw * 64;
+    dst.stride[3] = co * 64;
+    dst.stride[4] = 1;
+    TensorND ts_src, ts_dst;
+    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
+    ts_src.layout = src;
+    ts_dst.raw_ptr = reordered_filter;
+    ts_dst.layout = dst;
+    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
+    transpose->exec(ts_src, ts_dst);
+}
+#endif
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
index 6ca373065..0d4762025 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
@@ -11,10 +11,8 @@
  */
 
 #include "./algo.h"
-#include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
 #include "src/cuda/conv_bias/reduce_filter.cuh"
-#include "src/cuda/convolution_helper/parameter.cuh"
 #include "src/cuda/utils.h"
 
 using namespace megdnn;
@@ -22,85 +20,60 @@ using namespace cuda;
 using namespace convolution;
 
 #if CUDA_VERSION >= 10020
-bool ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::is_available(
-        const SizeArgs& args) const {
-    if (args.bias_layout->ndim <= 0)
-        return false;
-
-    using Param = param::ConvBias;
-    using Format = Param::Format;
-    using Sparse = Param::Sparse;
-    using Mode = Param::Mode;
-    using NonlineMode = megdnn::param::ConvBias::NonlineMode;
-
-    auto&& param = args.opr->param();
-
-    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
-        return false;
-
-    if (param.format != Format::NCHW64 || param.sparse != Sparse::DENSE ||
-        param.mode != Mode::CROSS_CORRELATION)
-        return false;
-
-    if (param.nonlineMode != NonlineMode::IDENTITY &&
-        param.nonlineMode != NonlineMode::RELU &&
-        param.nonlineMode != NonlineMode::H_SWISH)
-        return false;
-
-    if (args.src_layout->dtype.enumv() != DTypeEnum::Quantized4Asymm ||
-        args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
-        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
-        args.dst_layout->dtype.enumv() != DTypeEnum::Quantized4Asymm)
-        return false;
-
-    if (!is_compute_capability_required(7, 5))
-        return false;
-
-    return true;
-}
-
-WorkspaceBundle
-ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::get_workspace_bundle(
-        dt_byte* raw_ptr, const SizeArgs& args) const {
+size_t ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
+        get_workspace_in_bytes(const SizeArgs& args) const {
     if (args.preprocessed_filter) {
-        return WorkspaceBundle{raw_ptr, {}};
+        return 0;
     } else {
         size_t ws_filter = args.filter_layout->span().dist_byte(),
                ws_bias = args.bias_layout->span().dist_byte(),
                ws_reduce_filter = get_preprocess_workspace_in_bytes(args);
-        return WorkspaceBundle{raw_ptr,
-                               {ws_filter + ws_bias + ws_reduce_filter}};
+        return ws_filter + ws_bias + ws_reduce_filter;
     }
 }
 
 size_t ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
-        get_workspace_in_bytes(const SizeArgs& args) const {
-    return get_workspace_bundle(nullptr, args).total_size_in_bytes();
+        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
+    size_t co = args.filter_layout->operator[](0),
+           ci = args.filter_layout->operator[](1) * 64,
+           fh = args.filter_layout->operator[](2),
+           fw = args.filter_layout->operator[](3);
+    size_t ws_size_reduce_filter = co * sizeof(int32_t);
+    size_t A = co, B = ci * fh * fw / 8, C = 1;
+    ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C);
+    return ws_size_reduce_filter;
 }
 
-void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec(
+SmallVector<TensorLayout> ConvBiasForwardImpl::
+        AlgoUInt4Int4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout(
+                const SizeArgs& args) const {
+    return {args.filter_layout->collapse_contiguous(),
+            args.bias_layout->collapse_contiguous()};
+}
+
+void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
         const ExecArgs& args) const {
-    auto&& param = args.opr->param();
-    auto&& fm = args.filter_meta;
-    size_t n = args.src_layout->operator[](0),
-           ci = args.src_layout->operator[](1) * 64,
-           hi = args.src_layout->operator[](2),
-           wi = args.src_layout->operator[](3);
-    size_t co = args.dst_layout->operator[](1) * 64,
-           ho = args.dst_layout->operator[](2),
-           wo = args.dst_layout->operator[](3);
-    UNPACK_CONV_PARAMETER(fm, param);
-    MARK_USED_VAR
-    auto&& stream = cuda_stream(args.opr->handle());
+    megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
+    void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
+    void* bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
+    void* reduce_filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
+    void* reduce_workspace = reinterpret_cast<void*>(
+            args.workspace.raw_ptr + args.bias_layout->span().dist_byte());
+    reorder_filter(args, filter_ptr);
+    update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace);
+}
 
+std::tuple<void*, void*>
+ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::prepare_filter_bias(
+        const ExecArgs& args) const {
     void* filter_ptr = nullptr;
     void* bias_ptr = nullptr;
     if (args.preprocessed_filter) {
         megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
         filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
         bias_ptr = args.preprocessed_filter->tensors[1].raw_ptr;
+        return {filter_ptr, bias_ptr};
     } else {
-        // reorder filter and bias
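+        // workspace layout when no preprocessed filter is given:
+        // | reordered filter | updated bias | reduced filter | reduce scratch |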
         filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
         bias_ptr =
                 reinterpret_cast<void*>(args.workspace.raw_ptr +
@@ -109,16 +82,20 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec(
                                         args.filter_layout->span().dist_byte());
         void* reduce_filter_ptr =
                 reinterpret_cast<void*>(args.workspace.raw_ptr +
                                         args.filter_layout->span().dist_byte() +
                                         args.bias_layout->span().dist_byte());
-        reorder_filter_bias(args, reduce_filter_ptr, filter_ptr, bias_ptr);
+        void* reduce_workspace =
+                reinterpret_cast<void*>(args.workspace.raw_ptr +
+                                        args.filter_layout->span().dist_byte() +
+                                        args.bias_layout->span().dist_byte() +
+                                        args.bias_layout->span().dist_byte());
+        reorder_filter(args, filter_ptr);
+        update_bias(args, bias_ptr, reduce_filter_ptr, reduce_workspace);
     }
+    return {filter_ptr, bias_ptr};
+}
 
-    ConvParam kern_param;
-    kern_param.n = n, kern_param.co = co, kern_param.ci = ci,
-    kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho,
-    kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw,
-    kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh,
-    kern_param.fw = fw;
-
+std::tuple<float, float, float, float, float>
+ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::get_constants(
+        const ExecArgs& args) const {
     float src_scale =
             args.src_layout->dtype.param<dtype::Quantized4Asymm>().scale,
           filter_scale =
@@ -128,125 +105,67 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec(
           dst_scale =
                   args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
 
-    uint8_t src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
-                               .zero_point,
-            dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
-                               .zero_point;
-
-    float alpha = src_scale * filter_scale / dst_scale;
-    float beta = bias_scale / dst_scale;
-    float gamma = 0.f;
-    float delta = 0.f;
-    float theta = dst_zero;
+    uint8_t dst_zero =
+            args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
 
+    float alpha = src_scale * filter_scale / dst_scale,
+          beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
+          theta = dst_zero;
 
-    uint8_t* z_dev_ptr = nullptr;
     if (args.z_layout->ndim > 0) {
-        z_dev_ptr = reinterpret_cast<uint8_t*>(args.z_tensor->raw_ptr);
         float z_scale =
                 args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+        gamma = z_scale / dst_scale;
         uint8_t z_zero =
                 args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
-        gamma = z_scale / dst_scale;
         delta = -z_zero * gamma;
     }
 
-    uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
-
-    cutlass_wrapper::do_conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64<
-            true>(
-            reinterpret_cast<uint8_t*>(args.src_tensor->raw_ptr),
-            reinterpret_cast<int8_t*>(filter_ptr),
-            reinterpret_cast<int32_t*>(bias_ptr), z_dev_ptr,
-            reinterpret_cast<uint8_t*>(args.dst_tensor->raw_ptr), nullptr,
-            kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta,
-            dst_scale, src_zero,
-            cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
-                                       m_algo_param.threadblock_n,
-                                       m_algo_param.threadblock_k},
-            cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
-                                       m_algo_param.warp_k},
-            stream);
+    return {alpha, beta, gamma, delta, theta};
 }
 
-std::string ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::to_string(
-        AlgoParam algo_param) {
-    return ssprintf("%uX%uX%u_%uX%uX%u", algo_param.threadblock_m,
-                    algo_param.threadblock_n, algo_param.threadblock_k,
-                    algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
+void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::do_exec(
+        const ExecArgs& args, void* filter_ptr, void* bias_ptr, void* z_ptr,
+        ConvParam kern_param, uint32_t nonlinear_mode, float alpha, float beta,
+        float gamma, float delta, float theta, cudaStream_t stream) const {
+    float dst_scale =
+            args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+    uint8_t src_zero =
+            args.src_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
+    cutlass_wrapper::GemmCoord threadblock_shape{m_algo_param.threadblock_m,
+                                                 m_algo_param.threadblock_n,
+                                                 m_algo_param.threadblock_k};
+
+    cutlass_wrapper::GemmCoord warp_shape{
+            m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k};
+
+    cutlass_wrapper::do_conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64<
+            true>(reinterpret_cast<uint8_t*>(args.src_tensor->raw_ptr),
+                  reinterpret_cast<int8_t*>(filter_ptr),
+                  reinterpret_cast<int32_t*>(bias_ptr),
+                  reinterpret_cast<uint8_t*>(z_ptr),
+                  reinterpret_cast<uint8_t*>(args.dst_tensor->raw_ptr), nullptr,
+                  kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta,
+                  dst_scale, src_zero, threadblock_shape, warp_shape, stream);
 }
 
-size_t ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
-        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
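+// update_bias folds the uint4 source zero point into the bias: with
+// src = src_q - src_zero, conv(src, w) = conv(src_q, w) - src_zero * sum(w),
+// so each output channel's bias is lowered by src_zero times its reduced
+// filter before the kernel runs.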
+void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::update_bias(
+        const ExecArgs& args, void* updated_bias, void* reduce_filter_ptr,
+        void* reduce_workspace) const {
     size_t co = args.filter_layout->operator[](0),
            ci = args.filter_layout->operator[](1) * 64,
            fh = args.filter_layout->operator[](2),
            fw = args.filter_layout->operator[](3);
-    size_t ws_size_reduce_filter = co * sizeof(int32_t);
-    size_t A = co, B = ci * fh * fw / 8, C = 1;
-    ws_size_reduce_filter += do_dispatch_reduce_workspace_in_bytes(A, B, C);
-    return ws_size_reduce_filter;
-}
-
-SmallVector<TensorLayout> ConvBiasForwardImpl::
-        AlgoUInt4Int4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout(
-                const SizeArgs& args) const {
-    return {args.filter_layout->collapse_contiguous(),
-            args.bias_layout->collapse_contiguous()};
-}
-
-void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
-        const ExecArgs& args) const {
-    megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
-    reorder_filter_bias(args, args.workspace.raw_ptr,
-                        args.preprocessed_filter->tensors[0].raw_ptr,
-                        args.preprocessed_filter->tensors[1].raw_ptr);
-}
-void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::
-        reorder_filter_bias(const ExecArgs& args, void* reduce_filter,
-                            void* reordered_filter,
-                            void* reordered_bias) const {
-    auto&& param = args.opr->param();
-    auto&& fm = args.filter_meta;
-    size_t n = args.src_layout->operator[](0),
-           ci = args.src_layout->operator[](1) * 64,
-           hi = args.src_layout->operator[](2),
-           wi = args.src_layout->operator[](3);
-    size_t co = args.dst_layout->operator[](1) * 64,
-           ho = args.dst_layout->operator[](2),
-           wo = args.dst_layout->operator[](3);
-    UNPACK_CONV_PARAMETER(fm, param);
-    MARK_USED_VAR;
     auto&& stream = cuda_stream(args.opr->handle());
-    // filter: KCRS64 => CRSK64
-    TensorLayout src{{co, ci / 64, fh, fw, 64}, dtype::QuantizedS4()};
-    src.init_contiguous_stride();
-    TensorLayout dst = src;
-    dst.stride[0] = 64;
-    dst.stride[1] = co * fh * fw * 64;
-    dst.stride[2] = co * fw * 64;
-    dst.stride[3] = co * 64;
-    dst.stride[4] = 1;
-    TensorND ts_src, ts_dst;
-    ts_src.raw_ptr = args.filter_tensor->raw_ptr;
-    ts_src.layout = src;
-    ts_dst.raw_ptr = reordered_filter;
-    ts_dst.layout = dst;
-    auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
-    transpose->exec(ts_src, ts_dst);
-
-    // reduce filter and update bias
-    int32_t* workspace = reinterpret_cast<int32_t*>(reordered_bias) +
-                         args.bias_layout->span().dist_byte();
     int src_zero_point =
             args.src_tensor->layout.dtype.param<dtype::Quantized4Asymm>()
                     .zero_point;
     do_dispatch_reduce_filter_and_update_bias_4bit<true>(
             reinterpret_cast<uint8_t*>(args.filter_tensor->raw_ptr),
             args.bias_tensor->compatible_ptr<dt_int32>(), co, ci * fh * fw / 8,
-            reinterpret_cast<int32_t*>(reordered_bias), workspace,
-            src_zero_point, stream);
+            reinterpret_cast<int32_t*>(updated_bias),
+            reinterpret_cast<int32_t*>(reduce_workspace), src_zero_point,
+            stream);
 }
 #endif
diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h
index c8fca2b8f..6b292778e 100644
--- a/dnn/src/cuda/conv_bias/opr_impl.h
+++ b/dnn/src/cuda/conv_bias/opr_impl.h
@@ -64,6 +64,7 @@ public:
     class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
     class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
     class AlgoInt8NCHW32IMMAImplicitGemm;
+    class AlgoInt4NCHW64IMMAImplicitGemmBase;
     class AlgoInt4Int4NCHW64IMMAImplicitGemm;
     class AlgoUInt4Int4NCHW64IMMAImplicitGemm;
     class AlgoBFloat16;
-- 
GitLab