From c985204b313e2d1b885c24baaf09aa0dc1b32ce4 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Thu, 16 Apr 2020 20:26:43 +0800
Subject: [PATCH] feat(dnn): add conv1x1 algo and tests

GitOrigin-RevId: 374a62cf12efb74fd92ee5d0ec8df7cfd40addba
---
 dnn/src/fallback/conv_bias/conv1x1/algos.cpp  | 230 +++++++++++++
 dnn/src/fallback/conv_bias/conv1x1/algos.h    |  56 ++++
 .../conv_bias/conv1x1/conv1x1_dispatcher.h    |  99 ++++++
 .../conv_bias/conv1x1/conv1x1_strategy.cpp    | 214 ++++++++++++
 .../conv_bias/conv1x1/conv1x1_strategy.h      | 310 ++++++++++++++++++
 dnn/src/fallback/conv_bias/opr_impl.cpp       |   9 +-
 dnn/src/fallback/conv_bias/opr_impl.h         |   1 +
 dnn/src/x86/matrix_mul/algos.cpp              |   1 -
 dnn/test/common/conv_bias.cpp                 |  76 +++++
 dnn/test/common/conv_bias.h                   |   4 +
 dnn/test/x86/conv_bias.cpp                    |  73 +++++
 11 files changed, 1071 insertions(+), 2 deletions(-)
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos.h
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h

diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
new file mode 100644
index 000000000..995540de8
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
@@ -0,0 +1,230 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "src/fallback/conv_bias/conv1x1/algos.h"
+#include "src/common/opr_delegate.h"
+#include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h"
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+
+#include "megdnn/opr_param_defs.h"
+#include "src/naive/convolution/helper.h"
+
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#endif
+
+#include "midout.h"
+MIDOUT_DECL(megdnn_fallback_conv1x1)
+
+using namespace megdnn;
+using namespace fallback;
+#if MEGDNN_X86
+using namespace x86;
+#endif
+using namespace conv1x1;
+
+size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
+        const NCBKernSizeParam& param) const {
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t OC = param.filter_meta.ocpg;
+    if (OH * OW >= 56 * 56 || OC >= 64)
+        return m_oc_block_size;
+    return div_ceil(OC, param.nr_threads);
+}
+
+size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
+        ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
+
+    auto matmul_param =
+            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
+
+    auto pack_mode = m_matmul_algo->packmode();
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    } else {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    }
+    return 0;
+}
+
+SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
+        ConvBiasImpl* opr, const NCBKernSizeParam& param) const {
+    SmallVector<NCBKern> ret_kern;
+
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t OC = param.filter_meta.ocpg;
+    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
+    size_t GROUP = param.filter_meta.group;
+    size_t BATCH = param.n;
+    size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
+
+    auto matmul_param =
+            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
+    WorkspaceBundle whole_bundle = {nullptr, {}};
+    WorkspaceBundle thread_bundle = {nullptr, {}};
+    WorkspaceBundle matmul_bundle = {nullptr, {}};
+
+    auto pack_mode = m_matmul_algo->packmode();
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
+        }
+        MIDOUT_END();
+    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
+        }
+        MIDOUT_END();
+    } else {
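+        //! NO_PACK: the matmul reads the source and filter directly, so
+        //! only the matmul compute workspace is recorded in matmul_bundle.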
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = {
+                    nullptr,
+                    {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
+        }
+        MIDOUT_END();
+    }
+
+    //! get thread bundle
+    thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
+                                      compt_oc_block_size);
+
+    Conv1x1StrategyBase* conv1x1_strategy =
+            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
+                                                  opr->param().format);
+
+    auto kern_packA = [this, whole_bundle, matmul_bundle, param,
+                       compt_oc_block_size, conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->packA(whole_bundle, matmul_bundle,
+                                compt_oc_block_size, this->m_matmul_algo,
+                                param, ncb_param, std::move(ncb_index));
+    };
+    auto kern_packB = [this, whole_bundle, matmul_bundle, param,
+                       conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->packB(whole_bundle, matmul_bundle,
+                                this->m_matmul_algo, param, ncb_param,
+                                std::move(ncb_index));
+    };
+    auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param,
+                       compt_oc_block_size, conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
+                               compt_oc_block_size, this->m_matmul_algo, param,
+                               ncb_param, std::move(ncb_index));
+    };
+
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
+        pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+            ret_kern.push_back({kern_packB, {1}});
+        }
+    }
+    ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
+
+    return ret_kern;
+}
+
+bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
+                                       const NCBKernSizeParam& param,
+                                       AlgoSelectionStrategy) const {
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
+        //! only support nchw format
+        if (opr->param().format != param::ConvBias::Format::NCHW)
+            return false;
+
+        size_t FH = param.filter_meta.spatial[0],
+               FW = param.filter_meta.spatial[1];
+        size_t PH = param.filter_meta.padding[0],
+               PW = param.filter_meta.padding[1];
+        size_t SH = param.filter_meta.stride[0],
+               SW = param.filter_meta.stride[1];
+
+        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
+            return false;
+
+        //! for 8x8x16 and 8x8x32, the bias mode must be NO_BIAS and the
+        //! nonlineMode must be IDENTITY; otherwise return false, since
+        //! 8x8x32 and 8x8x16 do not support PostProcess
+        if (param.src_type.enumv() == param.filter_type.enumv() &&
+            (param.src_type.enumv() == DTypeEnum::Int8 &&
+             (param.dst_type.enumv() == DTypeEnum::Int16 ||
+              param.dst_type.enumv() == DTypeEnum::Int32)) &&
+            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
+             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
+            return false;
+
+        if (param.src_type.enumv() == param.filter_type.enumv() &&
+            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 ||
+              param.src_type.enumv() == DTypeEnum::Quantized8Asymm) &&
+             param.dst_type.enumv() == DTypeEnum::QuantizedS32) &&
+            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
+             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
+            return false;
+
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(
+                param, OH * OW, get_oc_tile_size_heuristic(param));
+
+        bool matmul_usable = m_matmul_algo->usable(matmul_param);
+        return matmul_usable &&
+               (param.filter_meta.dilation[0] ==
+                        param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
+    }
+    MIDOUT_END();
+    return false;
+}
diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.h b/dnn/src/fallback/conv_bias/conv1x1/algos.h
new file mode 100644
index 000000000..fb3bdb66b
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.h
@@ -0,0 +1,56 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/algos.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/thin/small_vector.h"
+#include "src/common/utils.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+#include "src/fallback/matrix_mul/opr_impl.h"
+
+namespace megdnn {
+namespace fallback {
+
+class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
+public:
+    AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
+            : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}
+
+    bool is_reproducible() const override { return true; }
+
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ssprintf("CONV1x1:%s:%zu", m_matmul_algo->name(),
+                              m_oc_block_size);
+        }
+        return m_name.c_str();
+    }
+
+    bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    SmallVector<NCBKern> dispatch_kerns(
+            ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;
+
+protected:
+    size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;
+
+private:
+    MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    mutable size_t m_oc_block_size = 0;
+};
+
+}  // namespace fallback
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
new file mode 100644
index 000000000..581d2d69b
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
@@ -0,0 +1,99 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+namespace {
+//! get_thread_bundle
+WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                                  size_t matmul_c_size, size_t oc_tile_size) {
+    //! for some cases, the matmul result needs temp space to store
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+    size_t matmul_dst_bytes_per_thread =
+            is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0;
+    return WorkspaceBundle{nullptr,
+                           {matmul_c_size, matmul_dst_bytes_per_thread}};
+}
+}  // anonymous namespace
+
+template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
+class Conv1x1Kerns {
+public:
+    //! get_bundle
+    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                               const MatrixMulImpl::KernSizeParam& matmul_param,
+                               const MatrixMulImpl::AlgoBase* matmul_algo,
+                               size_t oc_tile_size) {
+        size_t GROUP = param.filter_meta.group;
+        size_t OC = param.filter_meta.ocpg;
+        size_t BATCH = param.n;
+
+        //! bundle per thread
+        //! matmul_param records a matmul with M = oc_tile_size, K = IC,
+        //! N = OH * OW; this does not affect the packB size
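+        //! whole bundle layout: slot 0 = packed A (all groups and oc tiles),
+        //! slot 1 = packed B (all batches and groups), slot 2 = per-thread
+        //! compute space.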
+        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
+        auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
+                                               oc_tile_size);
+
+        //! size per thread
+        size_t all_threads_bytes =
+                thread_bundle.total_size_in_bytes() * param.nr_threads;
+
+        //! packa size = GROUP * packa_size_each_group
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t all_packa_bytes =
+                packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
+
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
+            return WorkspaceBundle{nullptr,
+                                   {all_packa_bytes, 0, all_threads_bytes}};
+
+        //! packb size = N * GROUP * packb_size_per_group
+        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+        size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;
+
+        return WorkspaceBundle{
+                nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
+    }
+};
+
+template <>
+class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> {
+public:
+    //! get_bundle
+    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                               const MatrixMulImpl::KernSizeParam& matmul_param,
+                               const MatrixMulImpl::AlgoBase* matmul_algo,
+                               size_t oc_tile_size) {
+        size_t matmul_size = matmul_algo->get_workspace(matmul_param);
+        auto thread_bundle =
+                get_thread_bundle(param, matmul_size, oc_tile_size);
+        //! size per thread
+        size_t all_threads_bytes =
+                thread_bundle.total_size_in_bytes() * param.nr_threads;
+        return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}};
+    }
+};
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
new file mode 100644
index 000000000..05322417c
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
@@ -0,0 +1,214 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include <unordered_map>
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+
+#include "midout.h"
+
+MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy)
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+namespace {
+
+struct StrategyHashParam {
+    ConvBiasImpl::NCBKernSizeParam param;
+    param::ConvBias::Format format;
+    MatrixMulImpl::AlgoBase::PackMode packmode;
+};
+
+struct StrategyHashParamHash {
+    std::size_t operator()(const StrategyHashParam& sparam) const {
+        constexpr size_t base = 1;  //! avoid a hash key of zero
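+        //! combine the enum fields by shifting each one into a distinct bit
+        //! range before XOR-ing, so different field combinations map to
+        //! different keys.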
+        std::size_t result =
+                static_cast<std::size_t>(sparam.param.src_type.enumv()) + base;
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.dst_type.enumv()) +
+                   base)
+                  << 3);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.filter_type.enumv()) +
+                   base)
+                  << 6);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.bias_type.enumv()) +
+                   base)
+                  << 9);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.format) + base) << 12);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.packmode) + base) << 15);
+        return result;
+    };
+};
+
+struct StrategyHashParamEqual {
+    bool operator()(const StrategyHashParam& param1,
+                    const StrategyHashParam& param2) const {
+        bool flags = true;
+        flags = param1.param.src_type == param2.param.src_type && flags;
+        flags = param1.param.filter_type == param2.param.filter_type && flags;
+        flags = param1.param.bias_type == param2.param.bias_type && flags;
+        flags = param1.param.dst_type == param2.param.dst_type && flags;
+        flags = param1.format == param2.format && flags;
+        flags = param1.packmode == param2.packmode && flags;
+        return flags;
+    };
+};
+
+std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode,
+        param::ConvBias::Format format) {
+    MEGDNN_MARK_USED_VAR(format);
+
+#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag) \
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,               \
+                 midout_iv(_midout_tag)) {                               \
+        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) {       \
+            return std::make_unique<                                     \
+                    Conv1x1Strategy<_dt, _dt, _dt, _post_ctype,          \
+                                    _post_ctype, _postprocess_mode,      \
+                                    _packmode>>();                       \
+        }                                                                \
+    }                                                                    \
+    MIDOUT_END()
+
+#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \
+            _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag)       \
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                 \
+                 midout_iv(_midout_tag)) {                                 \
+        if (param.filter_type.enumv() == param.src_type.enumv() &&         \
+            param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv &&    \
+            param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) {    \
+            return std::make_unique<                                       \
+                    Conv1x1Strategy<_src_ctype, _bias_ctype, _dst_ctype,   \
+                                    DTypeTrait<_i_bias_type>::ctype,       \
+                                    DTypeTrait<_i_dst_type>::ctype,        \
+                                    _postprocess_mode, _packmode>>();      \
+        }                                                                  \
+    }                                                                      \
+    MIDOUT_END()
+
+    switch (pack_mode) {
+        case MatrixMulImpl::AlgoBase::PackMode::DEFAULT:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "Default::FLOAT"_hash);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16, __fp16,
+                PostprocessMode::FLOAT, "Default::FLOAT16_FP16"_hash);
+#else
+#if !MEGDNN_DISABLE_FLOAT16
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16,
+                dt_float16, PostprocessMode::NO_PROCESS,
+                "Default::FLOAT16_FLOAT16"_hash);
+#endif
+#endif
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int32,
+                dt_int32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS, "Default::INT8x8x32_INT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int16,
+                dt_int16, dt_int8, dt_int16, dt_int16,
+                PostprocessMode::NO_PROCESS, "Default::INT8x8x16_INT16"_hash);
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
+                dtype::Quantized8Asymm, dtype::QuantizedS32,
+                dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS,
+                "Default::QUINT8x8x32_QINT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
+                dtype::Quantized8Asymm, dtype::QuantizedS32,
+                dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8,
+                PostprocessMode::QUANTIZED,
+                "Default::QUINT8x8x32_QUINT8"_hash);
+#endif
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
+                dt_int32, PostprocessMode::NO_PROCESS,
+                "Default::QINT8x8x32_QINT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
+                dt_int8, PostprocessMode::QUANTIZED,
+                "Default::QINT8x8x32_QINT8"_hash);
+            break;
+
+        case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "OnlyPackA::FLOAT"_hash);
+            break;
+
+        case MatrixMulImpl::AlgoBase::PackMode::NO_PACK:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "NoPack::FLOAT"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int16,
+                dt_int16, dt_int8, dt_int16, dt_int16,
+                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x16_INT16"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int32,
+                dt_int32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
+                dtype::QuantizedS8, dtype::QuantizedS32,
+                dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS,
+                "NoPack::QINT8x8x32_QINT32"_hash);
+            break;
+
+        default:
+            megdnn_throw("Invalid Pack Mode");
+            break;
+    }
+#undef cb1
+#undef cb2
+    megdnn_throw("Invalid Data Type");
+    return nullptr;
+}
+
+class StrategyDelegationStorage {
+public:
+    Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param,
+                             MatrixMulImpl::AlgoBase::PackMode pack_mode,
+                             param::ConvBias::Format format) {
+        MEGDNN_LOCK_GUARD(m_mtx);
+        StrategyHashParam sparam;
+        sparam.param = param;
+        sparam.format = format;
+        sparam.packmode = pack_mode;
+        if (m_map_strategies.find(sparam) == m_map_strategies.end()) {
+            auto strategy = create_conv1x1_strategy(param, pack_mode, format);
+            m_map_strategies[sparam] = std::move(strategy);
+        }
+        return m_map_strategies[sparam].get();
+    }
+
+private:
+    std::mutex m_mtx;
+    std::unordered_map<StrategyHashParam,
+                       std::unique_ptr<Conv1x1StrategyBase>,
+                       StrategyHashParamHash, StrategyHashParamEqual>
+            m_map_strategies;
+};
+
+}  // anonymous namespace
+
+Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode,
+        param::ConvBias::Format format) {
+    static StrategyDelegationStorage storage;
+    return storage.get(param, pack_mode, format);
+}
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
new file mode 100644
index 000000000..7bb6028b4
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
@@ -0,0 +1,310 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "megdnn/opr_param_defs.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#endif
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+#if MEGDNN_X86
+using namespace x86;
+#endif
+
+namespace {
+
+//! get_matmul_kern_param
+MatrixMulImpl::KernSizeParam get_matmul_kern_param(
+        const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) {
+    size_t M = m;
+    size_t N = n;
+    size_t K = param.filter_meta.icpg;  //! K = IC
+    size_t LDA = K, LDB = N, LDC = N;
+    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+    return {param.filter_type,
+            param.src_type,
+            is_dst_8bit ? param.bias_type : param.dst_type,
+            M,
+            N,
+            K,
+            LDA,
+            LDB,
+            LDC,
+            false,
+            false,
+            param::MatrixMul::ComputeMode::DEFAULT,
+            param::MatrixMul::Format::DEFAULT};
+}
+}  // namespace
+
+class Conv1x1StrategyBase {
+public:
+    virtual void packA(WorkspaceBundle& whole_bundle,
+                       WorkspaceBundle& matmul_bundle,
+                       size_t oc_tile_size,
+                       const MatrixMulImpl::AlgoBase* matmul_algo,
+                       const ConvBiasImpl::NCBKernSizeParam& param,
+                       const ConvBiasImpl::NCBKernParam& ncb_param,
+                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+
+    virtual void packB(WorkspaceBundle& whole_bundle,
+                       WorkspaceBundle& matmul_bundle,
+                       const MatrixMulImpl::AlgoBase* matmul_algo,
+                       const ConvBiasImpl::NCBKernSizeParam& param,
+                       const ConvBiasImpl::NCBKernParam& ncb_param,
+                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+
+    virtual void exec(WorkspaceBundle& whole_bundle,
+                      WorkspaceBundle& matmul_bundle,
+                      WorkspaceBundle& thread_bundle,
+                      size_t oc_tile_size,
+                      const MatrixMulImpl::AlgoBase* matmul_algo,
+                      const ConvBiasImpl::NCBKernSizeParam& param,
+                      const ConvBiasImpl::NCBKernParam& ncb_param,
+                      const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+    virtual ~Conv1x1StrategyBase() = default;
+};
+
+template <typename src_ctype, typename bias_ctype, typename dst_ctype,
+          typename op_ctype, typename op_dtype,
+          megdnn::PostprocessMode postprocess_mode,
+          MatrixMulImpl::AlgoBase::PackMode pack_mode>
+class Conv1x1Strategy : public Conv1x1StrategyBase {
+public:
+    void packA(WorkspaceBundle& whole_bundle,
+               WorkspaceBundle& matmul_bundle,
+               size_t oc_tile_size,
+               const MatrixMulImpl::AlgoBase* matmul_algo,
+               const ConvBiasImpl::NCBKernSizeParam& param,
+               const ConvBiasImpl::NCBKernParam& ncb_param,
+               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
+            megdnn_log_error("NoPack mode has no packA kernel");
+            return;
+        }
+
+        whole_bundle.set(ncb_param.workspace_ptr);
+
+        //! packa size per group
+        size_t OC = param.filter_meta.ocpg;
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t packa_bytes_per_group =
+                oc_tiles_per_group * packa_bytes_per_oc_tile;
+
+        size_t group_id = ncb_index.ndrange_id[0];
+        size_t oc_tile_id_in_group = ncb_index.ndrange_id[1];
+
+        size_t oc_start = oc_tile_id_in_group * oc_tile_size;
+        size_t oc_end = oc_start + oc_tile_size;
+        oc_end = (oc_end <= OC ? oc_end : OC);
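+        //! the last tile of a group may be narrower than oc_tile_size, so
+        //! only the valid oc_end - oc_start rows of A are packed.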
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        size_t IC = param.filter_meta.icpg;
+        MatrixMulImpl::KernParam matmul_kern_param;
+        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
+
+        size_t bytes_offset_of_a_panel =
+                group_id * packa_bytes_per_group +
+                oc_tile_id_in_group * packa_bytes_per_oc_tile;
+        size_t numbers_offset_of_filter =
+                oc_tile_size * IC * oc_tile_id_in_group;
+
+        src_ctype* a_panel = reinterpret_cast<src_ctype*>(
+                reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
+                bytes_offset_of_a_panel);
+        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
+                ncb_param.filter<src_ctype>(group_id) +
+                numbers_offset_of_filter);
+        matmul_algo->pack_A(matmul_kern_param, a_panel, 0,
+                            oc_end - oc_start);
+    }
+
+    void packB(WorkspaceBundle& whole_bundle,
+               WorkspaceBundle& matmul_bundle,
+               const MatrixMulImpl::AlgoBase* matmul_algo,
+               const ConvBiasImpl::NCBKernSizeParam& param,
+               const ConvBiasImpl::NCBKernParam& ncb_param,
+               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+            whole_bundle.set(ncb_param.workspace_ptr);
+
+            //! packb size per group
+            size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+
+            size_t GROUP = param.filter_meta.group;
+            size_t BATCH = param.n;
+            size_t SH = param.filter_meta.stride[0];
+            size_t SW = param.filter_meta.stride[1];
+            size_t OH = param.osz[0];
+            size_t OW = param.osz[1];
+            size_t OC = param.filter_meta.ocpg;
+
+            MatrixMulImpl::KernParam matmul_kern_param;
+            static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                    get_matmul_kern_param(param, OH * OW, OC);
+
+            rep(batch, BATCH) {
+                rep(g, GROUP) {
+                    if (SH == 2 && SW == 2)
+                        megdnn_throw("no support for stride = 2");
+
+                    size_t bytes_offset_of_b_panel =
+                            batch * packb_bytes_per_group * GROUP +
+                            g * packb_bytes_per_group;
+                    src_ctype* b_panel = reinterpret_cast<src_ctype*>(
+                            reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
+                            bytes_offset_of_b_panel);
+                    matmul_kern_param.B_ptr = const_cast<src_ctype*>(
+                            ncb_param.src<src_ctype>(batch, g));
+                    matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
+                }
+            }
+        } else {
+            megdnn_log_error(
+                    "OnlyPackA mode and NoPack mode have no packB kernel");
+        }
+    }
+
+    void exec(WorkspaceBundle& whole_bundle,
+              WorkspaceBundle& matmul_bundle,
+              WorkspaceBundle& thread_bundle,
+              size_t oc_tile_size,
+              const MatrixMulImpl::AlgoBase* matmul_algo,
+              const ConvBiasImpl::NCBKernSizeParam& param,
+              const ConvBiasImpl::NCBKernParam& ncb_param,
+              const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        whole_bundle.set(ncb_param.workspace_ptr);
+        size_t OC = param.filter_meta.ocpg;
+        size_t IC = param.filter_meta.icpg;
+
+        //! packa bytes per group
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t packa_bytes_per_group =
+                packa_bytes_per_oc_tile * oc_tiles_per_group;
+
+        //! packb bytes per group
+        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+
+        //! matmul bytes per thread
+        size_t matmul_bytes_per_thread = thread_bundle.get_size(0);
+
+        size_t batch_id = ncb_index.ndrange_id[0];
+        size_t group_id = ncb_index.ndrange_id[1];
+        size_t oc_tile_id_in_group = ncb_index.ndrange_id[2];
+        size_t thread_id = ncb_index.thread_id;
+
+        size_t GROUP = param.filter_meta.group;
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        size_t oc_start = oc_tile_size * oc_tile_id_in_group;
+        size_t oc_end = oc_start + oc_tile_size;
+        oc_end = (oc_end <= OC ? oc_end : OC);
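+        //! each kern instance computes one (batch, group, oc tile) block:
+        //! an (oc_end - oc_start) x (OH * OW) matmul.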
+        MatrixMulImpl::KernParam matmul_kern_param;
+        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
+
+        size_t bytes_offset_of_a_panel =
+                group_id * packa_bytes_per_group +
+                oc_tile_id_in_group * packa_bytes_per_oc_tile;
+        int8_t* a_panel = reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
+                          bytes_offset_of_a_panel;
+
+        size_t bytes_offset_of_b_panel =
+                batch_id * packb_bytes_per_group * GROUP +
+                group_id * packb_bytes_per_group;
+        int8_t* b_panel = reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
+                          bytes_offset_of_b_panel;
+
+        size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id;
+        size_t bytes_offset_of_matmul_dst_this_thread =
+                thread_offset + thread_bundle.get_size(0);
+        int8_t* matmul_temp_dst =
+                reinterpret_cast<int8_t*>(whole_bundle.get(2)) +
+                bytes_offset_of_matmul_dst_this_thread;
+
+        size_t numbers_of_ncb_dst_offset =
+                oc_tile_size * OH * OW * oc_tile_id_in_group;
+        void* conv_bias_dst = static_cast<void*>(
+                ncb_param.dst<dst_ctype>(batch_id, group_id) +
+                numbers_of_ncb_dst_offset);
+
+        size_t numbers_of_ncb_filter_offset =
+                oc_tile_size * IC * oc_tile_id_in_group;
+        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
+                ncb_param.filter<src_ctype>(group_id) +
+                numbers_of_ncb_filter_offset);
+
+        matmul_kern_param.B_ptr = const_cast<src_ctype*>(
+                ncb_param.src<src_ctype>(batch_id, group_id));
+
+        matmul_kern_param.workspace_ptr =
+                reinterpret_cast<int8_t*>(whole_bundle.get(2)) + thread_offset;
+        matmul_kern_param.workspace_size = matmul_bytes_per_thread;
+
+        bool is_dst_8bit =
+                (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                 param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                 param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+        void* matmul_dst = is_dst_8bit ? matmul_temp_dst : conv_bias_dst;
+
+        matmul_kern_param.C_ptr = matmul_dst;
+
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
+            auto matmul_kern = matmul_algo->get_kern(matmul_kern_param);
+            matmul_kern(matmul_kern_param);
+        } else {
+            auto matmul_kern_naked =
+                    matmul_algo->get_kern_naked(matmul_kern_param);
+            matmul_kern_naked(matmul_kern_param, a_panel, b_panel);
+        }
+        //! do postprocess
+        void* bias_ptr = nullptr;
+        if (param.bias_mode == megdnn::BiasMode::BIAS)
+            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
+                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
+                    numbers_of_ncb_dst_offset));
+        else
+            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
+                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
+                    oc_start));
+        PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
+                matmul_dst, bias_ptr, conv_bias_dst, param.bias_mode,
+                param.nonlineMode, param.bias_type, param.dst_type, 1_z,
+                oc_end - oc_start, OH, OW);
+    }
+};
+
+class Conv1x1Factory {
+public:
+    static Conv1x1StrategyBase* make_conv1x1_strategy(
+            const ConvBiasImpl::NCBKernSizeParam& param,
+            MatrixMulImpl::AlgoBase::PackMode pack_mode,
+            param::ConvBias::Format format);
+};
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp
index 4619941df..bbf8155e5 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -15,6 +15,7 @@
 #include "src/common/opr_delegate.h"
 #include "src/common/utils.h"
 #include "src/fallback/conv_bias/algos.h"
+#include "src/fallback/conv_bias/conv1x1/algos.h"
 #include "src/fallback/conv_bias/im2col/algos.h"
 #include "src/fallback/conv_bias/opr_impl.h"
 #include "src/naive/convolution/algorithms.h"
@@ -54,7 +55,13 @@ public:
                 ohw_tile_size));
         all_algos.emplace_back(refhold.back().get());
     }
-#if 1
+    for (size_t oc_tile_size : {24, 48}) {
+        refhold.emplace_back(new AlgoConv1x1(
+                static_cast<MatrixMulImpl::AlgoBase*>(algo),
+                oc_tile_size));
+        all_algos.emplace_back(refhold.back().get());
+    }
+#if 0
     //! As these algos maybe very slow, it will make fastrun search slow, so
     //! we disable it, but for the test of strategyhelper, we just keep it.
     //! FIXME: I do not know a better way to do it.
diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h
index c4d081bc8..76434f56f 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.h
+++ b/dnn/src/fallback/conv_bias/opr_impl.h
@@ -248,6 +248,7 @@ protected:
 private:
     class AlgoNaive;
     class AlgoIm2col;
+    class AlgoConv1x1;
     class AlgoWinogradF32;
     class AlgoWinogradF32_4x4;
     class AlgoWinogradQS8;
diff --git a/dnn/src/x86/matrix_mul/algos.cpp b/dnn/src/x86/matrix_mul/algos.cpp
index af1c5aa8b..6e0d8db21 100644
--- a/dnn/src/x86/matrix_mul/algos.cpp
+++ b/dnn/src/x86/matrix_mul/algos.cpp
@@ -438,7 +438,6 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace(
                    m, n, k, trans_a, trans_b, strategy, cacheline)
             .get_workspace_size();
 }
-
 MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
         AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9,
         x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16);
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 1cc8728bf..c95c340c1 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -875,6 +875,82 @@ std::vector<TestArg> get_conv_bias_args(
     return args;
 }
 
+std::vector<TestArg> get_conv_bias_1x1_args(
+        bool no_bias, bool no_nonlinemode, bool quantized_nlmod,
+        bool only_broadcast_bias) {
+    using namespace conv_bias;
+    using Param = param::ConvBias;
+    using NLMode = param::ConvBias::NonlineMode;
+    using CONVMode = param::ConvBias::Mode;
+    std::vector<TestArg> args;
+
+    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
+                    size_t stride, NLMode nlmode, CONVMode convmode) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = 0;
+        param.pad_w = 0;
+
+        param.mode = convmode;
+        param.nonlineMode = nlmode;
+
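+        //! dense case: src {n, ic, h, w} with filter {oc, ic, 1, 1};
+        //! bias {1, oc, 1, 1} broadcasts over batch and spatial dims.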
+        args.emplace_back(param, TensorShape{n, ic, h, w},
+                          TensorShape{oc, ic, 1, 1}, TensorShape{});
+        if (!no_bias) {
+            args.emplace_back(param, TensorShape{n, ic, h, w},
+                              TensorShape{oc, ic, 1, 1},
+                              TensorShape{1, oc, 1, 1});
+
+            if (!only_broadcast_bias) {
+                args.emplace_back(param, TensorShape{n, ic, h, w},
+                                  TensorShape{oc, ic, 1, 1},
+                                  TensorShape{n, oc, (h - 1) / stride + 1,
+                                              (w - 1) / stride + 1});
+            }
+        }
+
+        param.sparse = param::ConvBias::Sparse::GROUP;
+
+        args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, 1, 1}, TensorShape{});
+        if (!no_bias) {
+            args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                              TensorShape{2, oc, ic, 1, 1},
+                              TensorShape{1, 2 * oc, 1, 1});
+
+            if (!only_broadcast_bias) {
+                args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                                  TensorShape{2, oc, ic, 1, 1},
+                                  TensorShape{n, 2 * oc, (h - 1) / stride + 1,
+                                              (w - 1) / stride + 1});
+            }
+        }
+    };
+
+    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
+    if (!no_nonlinemode) {
+        nonlinemode.emplace_back(NLMode::RELU);
+        nonlinemode.emplace_back(NLMode::H_SWISH);
+        if (!quantized_nlmod) {
+            nonlinemode.emplace_back(NLMode::SIGMOID);
+        }
+    }
+
+    std::vector<CONVMode> convmodes{param::ConvBias::Mode::CONVOLUTION,
+                                    param::ConvBias::Mode::CROSS_CORRELATION};
+
+    for (size_t n : {1, 2})
+        for (size_t oc : {1, 9, 33})
+            for (size_t ic : {1, 16, 64})
+                for (size_t size : {7, 14, 28})
+                    for (auto nlmode : nonlinemode)
+                        for (auto convmode : convmodes) {
+                            pack(n, oc, ic, size, size, 1, nlmode, convmode);
+                        }
+    return args;
+}
+
 void check_conv_bias(std::vector<TestArg> args, Handle* handle,
                      const char* algo_name) {
     using namespace conv_bias;
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index b77222199..9ef78d6dd 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -76,6 +76,10 @@ std::vector<TestArg> get_conv_bias_args(
         bool no_nonlinemode, bool quantized_nlmod = false,
         bool only_broadcast_bias = false);
 
+std::vector<TestArg> get_conv_bias_1x1_args(
+        bool no_bias, bool no_nonlinemode, bool quantized_nlmod = false,
+        bool only_broadcast_bias = false);
+
 void check_conv_bias(std::vector<TestArg> args, megdnn::Handle* handle,
                      const char* algo_name);
 
diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp
index 7511d4d3f..950aab126 100644
--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -919,6 +919,79 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
 #undef cb
 }
 
+
+/**************************** Conv1x1 PackA *************************/
+namespace {
+void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
+                       RNG* rng, float epsilon, DType type0, DType type1,
+                       DType type2, DType type3, const char* algo_name) {
+    using namespace conv_bias;
+
+    Checker<ConvBias> checker(handle);
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
+    checker.set_dtype(0, type0);
+    checker.set_dtype(1, type1);
+    checker.set_dtype(2, type2);
+    checker.set_dtype(4, type3);
+    checker.set_epsilon(epsilon);
+    if (NULL != rng) {
+        checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
+    }
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).execs(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+}  // namespace
+
+#if MEGDNN_X86_WITH_MKL
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
+}
+
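+// CONV1x1 algorithm names follow "CONV1x1:<matmul algo>:<oc_block_size>"
+// (see AlgoConv1x1::name()), so each test pins both the matmul backend and
+// the OC tile size.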
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
+}
+#endif
+
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
+    using namespace conv_bias;
+    UniformIntRNG rng{-50, 50};
+    float epsilon = 0.001;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_MKLDNN:24");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_VNNI:24");
+    }
+#endif
+    if (x86::is_supported(x86::SIMDType::AVX2)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
+    }
+    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                      dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                      "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
+}
+/************************* End Conv1x1 PackA ************************/
+
 #endif
 
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
-- 
GitLab