From 09dab38748edcf52688a883ea3447c9d360fd587 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 11 Mar 2022 20:02:59 +0800 Subject: [PATCH] feat(cuda): support int1 simplewq conv GitOrigin-RevId: 9c37c41bc7e450f3df81e6059603101de3f14416 --- dnn/src/common/convolution.cpp | 3 +- dnn/src/cuda/conv_bias/algo.cpp | 3 +- dnn/src/cuda/conv_bias/algo.h | 20 +++ .../conv_bias/cudnn_conv_bias_activation.cpp | 2 + dnn/src/cuda/conv_bias/helper.cpp | 3 + dnn/src/cuda/conv_bias/opr_impl.cpp | 5 + dnn/src/cuda/conv_bias/opr_impl.h | 1 + dnn/src/cuda/conv_bias/simple_int1.cpp | 145 ++++++++++++++++++ dnn/src/cuda/convolution/forward/algos.cpp | 4 + dnn/src/naive/conv_bias/opr_impl.cpp | 3 + dnn/src/naive/convolution/helper.h | 9 ++ dnn/test/cuda/conv_bias.cpp | 26 ++++ 12 files changed, 222 insertions(+), 2 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/simple_int1.cpp diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index 0fc9afbbb..7531ed3f4 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -561,7 +561,8 @@ void ConvolutionBase::check_or_deduce_dtype_fwd( src.enumv() == DTypeEnum::QuantizedS8 || src.enumv() == DTypeEnum::Quantized8Asymm || src.enumv() == DTypeEnum::QuantizedS4 || - src.enumv() == DTypeEnum::Quantized4Asymm) { + src.enumv() == DTypeEnum::Quantized4Asymm || + src.enumv() == DTypeEnum::QuantizedS1) { supported_dst_dtype.push_back(dtype::QuantizedS32(mul_scale(src, filter))); bool cond_dst = dst.valid() && (dst.enumv() == src.enumv() || ((dst.enumv() == DTypeEnum::QuantizedS4 || diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index f1e139d69..602aae54a 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -25,7 +25,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { non_cudnn_algos.push_back(&matmul); non_cudnn_algos.push_back(&matmul8x8x32); non_cudnn_algos.push_back(&batched_matmul); - + 
non_cudnn_algos.push_back(&int1_simple); fill_cudnn_algos(); for (auto&& algo : cudnn_conv_bias_activations) { all_algos.push_back(&algo); @@ -45,6 +45,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { conv_algos.push_back(&matmul8x8x32); conv_algos.push_back(&batched_matmul); conv_algos.push_back(&group); + conv_algos.push_back(&int1_simple); for (auto&& algo : conv_algos) { all_algos.push_back(algo); diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 18daa829d..a404d4835 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -87,6 +87,7 @@ public: CUDA_FALLBACK_NCHW_INT4, CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, + CUDA_SIMPLE_INT1, }; using Mapper = std::unordered_map; @@ -1089,6 +1090,24 @@ private: WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; }; +class ConvBiasForwardImpl::AlgoSimpleInt1 final : public AlgoBase { +public: + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + + std::vector get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const override; + + const char* name() const override { return "CONVBIAS_SIMPLE_INT1"; } + + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } + + MEGDNN_DECL_ALGO_TYPE(CUDA_SIMPLE_INT1) +private: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; +}; + class ConvBiasForwardImpl::AlgoPack : NonCopyableObj { private: AlgoBase::Mapper m_all_algos_map; @@ -1132,6 +1151,7 @@ public: std::vector f16_implicit_bmm; AlgoGroupConvGeneral group; AlgoBFloat16 bfloat16; + AlgoSimpleInt1 int1_simple; AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo); diff --git a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp index 
03c01c4a2..54cf04d92 100644 --- a/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp +++ b/dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp @@ -30,6 +30,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available( return false; } } + if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1) + return false; if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) diff --git a/dnn/src/cuda/conv_bias/helper.cpp b/dnn/src/cuda/conv_bias/helper.cpp index 8953707dc..fab76fc09 100644 --- a/dnn/src/cuda/conv_bias/helper.cpp +++ b/dnn/src/cuda/conv_bias/helper.cpp @@ -134,6 +134,9 @@ void ConvBiasDesc::set_conv( namespace conv_bias { bool is_cudnn_supported(const BiasForwardSizeArgs& args) { + if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1) + return false; + if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 || args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) && args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4) diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index 821f4e1c3..a44faa3cf 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -221,6 +221,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( return &sm_algo_pack.fallback_nchw_qs8; } + if (sm_algo_pack.int1_simple.is_available_attribute( + args, positive_attr, negative_attr, workspace_limit_in_bytes)) { + return &sm_algo_pack.int1_simple; + } + if (args.src_layout->dtype.enumv() != DTypeTrait::enumv) { return megdnn::get_algo_match_attribute( sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index e1a30eccb..98f4890cb 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -72,6 
+72,7 @@ public: class AlgoInt4Int4NHWCIMMAImplicitGemm; class AlgoUInt4Int4NHWCIMMAImplicitGemm; class AlgoBFloat16; + class AlgoSimpleInt1; // The following algorithms are suitable for channel wise convolution class AlgoFloat32NCHWFMAImplicitBatchedGemm; class AlgoFloat16NCHWHMMAImplicitBatchedGemm; diff --git a/dnn/src/cuda/conv_bias/simple_int1.cpp b/dnn/src/cuda/conv_bias/simple_int1.cpp new file mode 100644 index 000000000..ffaffda7e --- /dev/null +++ b/dnn/src/cuda/conv_bias/simple_int1.cpp @@ -0,0 +1,145 @@ +/** + * \file dnn/src/cuda/conv_bias/simple_int1.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/common/algo_base.h" +#include "src/cuda/conv_bias/algo.h" +#include "src/cuda/handle.h" +#include "src/cuda/utils.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; +using namespace conv_bias; + +namespace { +std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config( + const TensorLayoutArray& layouts, const ConvBiasForwardImpl* opr) { + megdnn_assert(layouts.size() >= 5); + std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret; + ret.first = layouts; + auto change_dtype = [](TensorLayout& layout) { + if (layout.dtype.enumv() == DTypeEnum::QuantizedS1 || + layout.dtype.enumv() == DTypeEnum::QuantizedS32) { + layout.dtype = dtype::Float32(); + } + }; + change_dtype(ret.first[0]); + change_dtype(ret.first[1]); + change_dtype(ret.first[2]); + change_dtype(ret.first[3]); + change_dtype(ret.first[4]); + + ret.second = opr->param(); + ret.second.compute_mode = ConvBiasForwardImpl::Param::ComputeMode::DEFAULT; + return ret; +} + +std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr( + const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) { + auto convbias_opr = 
args.handle->create_operator<ConvBiasForward>(); + auto&& config = sub_opr_config( + {*args.src_layout, *args.filter_layout, *args.bias_layout, *args.z_layout, + *args.dst_layout}, + args.opr); + convbias_opr->param() = config.second; + + return {config.first, std::move(convbias_opr)}; +} +} // namespace + +std::vector<Algorithm::SearchItem> ConvBiasForwardImpl::AlgoSimpleInt1::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + auto&& config = + sub_opr_config(layouts, static_cast<const ConvBiasForwardImpl*>(opr)); + + std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}}; +} + +bool ConvBiasForwardImpl::AlgoSimpleInt1::is_available(const SizeArgs& args) const { + if (args.src_layout->dtype.valid() && args.filter_layout->dtype.valid() && + args.bias_layout->dtype.valid() && args.z_layout->dtype.valid() && + args.dst_layout->dtype.valid()) { + auto config = prepare_sub_opr(args); + + return args.src_layout->dtype.enumv() == args.filter_layout->dtype.enumv() && + args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS1 && + get_algorithm( + static_cast<ConvBiasForwardImpl*>(config.second.get()), + config.first[0], config.first[1], config.first[2], + config.first[3], config.first[4]); + } else { + return false; + } +} + +WorkspaceBundle ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + auto config = prepare_sub_opr(args); + + SmallVector<size_t> sizes; + auto get_workspace = [&sizes](const TensorLayout& src, const TensorLayout& dst) { + if (src.dtype != dst.dtype) { + sizes.push_back(dst.span().dist_byte()); + } + }; + get_workspace(*args.src_layout, config.first[0]); + get_workspace(*args.filter_layout, config.first[1]); + get_workspace(*args.bias_layout, config.first[2]); + get_workspace(*args.z_layout, config.first[3]); + get_workspace(*args.dst_layout, config.first[4]); + sizes.push_back(config.second->get_workspace_in_bytes( + config.first[0], config.first[1], 
config.first[2], config.first[3], + config.first[4], nullptr)); + + return {ptr, std::move(sizes)}; +} + +size_t ConvBiasForwardImpl::AlgoSimpleInt1::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvBiasForwardImpl::AlgoSimpleInt1::exec(const ExecArgs& args) const { + TensorND fsrc_tensor = *args.src_tensor; + TensorND ffilter_tensor = *args.filter_tensor; + TensorND fbias_tensor = *args.bias_tensor; + TensorND fz_tensor = *args.z_tensor; + TensorND fdst_tensor = *args.dst_tensor; + auto config = prepare_sub_opr(args); + + auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args); + CompTypeCvter cvter(args.handle, &bundle); + { + cvter.src_to_comp_type(*args.src_tensor, fsrc_tensor) + .src_to_comp_type(*args.filter_tensor, ffilter_tensor); + } + WorkspaceBundle dst_bundle = { + bundle.get(2), + {bundle.get_size(2), bundle.get_size(3), bundle.get_size(4), + bundle.get_size(5)}}; + CompTypeCvter dst_cvter( + args.handle, &dst_bundle); + { + dst_cvter.src_to_comp_type(*args.bias_tensor, fbias_tensor) + .src_to_comp_type(*args.z_tensor, fz_tensor) + .src_to_comp_type(*args.dst_tensor, fdst_tensor); + } + config.second->exec( + fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, fdst_tensor, nullptr, + dst_cvter.workspace()); + + { dst_cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/forward/algos.cpp b/dnn/src/cuda/convolution/forward/algos.cpp index 52a7c347d..ff66e3b38 100644 --- a/dnn/src/cuda/convolution/forward/algos.cpp +++ b/dnn/src/cuda/convolution/forward/algos.cpp @@ -44,6 +44,10 @@ std::pair sub_opr_config( src.dtype.param().scale * filter.dtype.param().scale); + } else if (src.dtype.enumv() == DTypeEnum::QuantizedS1) { + bias_type = dtype::QuantizedS32( + src.dtype.param().scale * + filter.dtype.param().scale); } else { megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT); 
bias_type = src.dtype; diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index 25050c8fb..8460da658 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -278,6 +278,9 @@ void ConvBiasForwardImpl::exec( DISPATCH_RAW( Quantized4Asymm, QuantizedS4, QuantizedS32, QuantizedS32, DEFAULT, (convolution::forward_bias)) + DISPATCH_RAW( + QuantizedS1, QuantizedS1, QuantizedS32, QuantizedS32, FLOAT32, + (convolution::forward_bias)) #if !MEGDNN_DISABLE_FLOAT16 DISPATCH(Float16, Float16) DISPATCH_RAW( diff --git a/dnn/src/naive/convolution/helper.h b/dnn/src/naive/convolution/helper.h index 6693110f5..e4c896faf 100644 --- a/dnn/src/naive/convolution/helper.h +++ b/dnn/src/naive/convolution/helper.h @@ -84,6 +84,15 @@ inline void StrategyFwd::on( d += cast(s) * cast(f); } +template <> +inline void StrategyFwd::on( + dt_qint1& s, dt_qint1& f, dt_qint32& d, DType, DType, DType) { + auto cast = [](const dt_qint1& val) { + return dt_qint32(static_cast(val.as_int8())); + }; + d += cast(s) * cast(f); +} + struct StrategyBwdData { template static void on(st& s, ft& f, dt& d, DType, DType, DType) { diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp index 70356c1b5..19dec2397 100644 --- a/dnn/test/cuda/conv_bias.cpp +++ b/dnn/test/cuda/conv_bias.cpp @@ -133,6 +133,32 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_BF16) { } } +TEST_F(CUDA, CONV_BIAS_FORWARD_QS1) { + require_compute_capability(6, 1); + + UniformIntRNG int_rng{1, 1}; + Checker checker(handle_cuda()); + checker.set_before_exec_callback(AlgoChecker( + ExecutionPolicyAlgoName{"CONVBIAS_SIMPLE_INT1", {{"MATMUL", {}}}})); + + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW; + param.compute_mode = param::Convolution::ComputeMode::FLOAT32; + { + auto src_shape = TensorShape{20, 2, 224, 224}; + auto filter_shape = TensorShape{20, 2, 3, 3}; + checker.set_dtype(0, dtype::QuantizedS1(1.0f)) + .set_dtype(1, 
dtype::QuantizedS1(1.0f)) + .set_dtype(2, dtype::QuantizedS32(1.0f)) + .set_dtype(3, dtype::QuantizedS32(1.0f)) + .set_dtype(4, dtype::QuantizedS32(1.0f)) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_param(param) + .execs({src_shape, filter_shape, {}, {}, {}}); + } +} + TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) { require_compute_capability(6, 1); -- GitLab