From c985204b313e2d1b885c24baaf09aa0dc1b32ce4 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Thu, 16 Apr 2020 20:26:43 +0800
Subject: [PATCH] feat(dnn): add conv1x1 algo and tests

GitOrigin-RevId: 374a62cf12efb74fd92ee5d0ec8df7cfd40addba
---
 dnn/src/fallback/conv_bias/conv1x1/algos.cpp  | 230 +++++++++++++
 dnn/src/fallback/conv_bias/conv1x1/algos.h    |  56 ++++
 .../conv_bias/conv1x1/conv1x1_dispatcher.h    |  99 ++++++
 .../conv_bias/conv1x1/conv1x1_strategy.cpp    | 214 ++++++++++++
 .../conv_bias/conv1x1/conv1x1_strategy.h      | 310 ++++++++++++++++++
 dnn/src/fallback/conv_bias/opr_impl.cpp       |   9 +-
 dnn/src/fallback/conv_bias/opr_impl.h         |   1 +
 dnn/src/x86/matrix_mul/algos.cpp              |   1 -
 dnn/test/common/conv_bias.cpp                 |  76 +++++
 dnn/test/common/conv_bias.h                   |   4 +
 dnn/test/x86/conv_bias.cpp                    |  73 +++++
 11 files changed, 1071 insertions(+), 2 deletions(-)
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos.h
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h

diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
new file mode 100644
index 000000000..995540de8
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
@@ -0,0 +1,230 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "src/fallback/conv_bias/conv1x1/algos.h"
+#include "src/common/opr_delegate.h"
+#include "src/fallback/conv_bias/common.h"
+#include "src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h"
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+
+#include "megdnn/opr_param_defs.h"
+#include "src/naive/convolution/helper.h"
+
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#endif
+
+#include "midout.h"
+MIDOUT_DECL(megdnn_fallback_conv1x1)
+
+using namespace megdnn;
+using namespace fallback;
+#if MEGDNN_X86
+using namespace x86;
+#endif
+using namespace conv1x1;
+
+size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
+        const NCBKernSizeParam& param) const {
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t OC = param.filter_meta.ocpg;
+    if (OH * OW >= 56 * 56 || OC >= 64)
+        return m_oc_block_size;
+    return div_ceil(OC, param.nr_threads);
+}
+
+size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
+        ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
+
+    auto matmul_param =
+            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
+
+    auto pack_mode = m_matmul_algo->packmode();
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    } else {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
+                    dispatcher;
+            return dispatcher
+                    .get_bundle(param, matmul_param, m_matmul_algo,
+                                compt_oc_block_size)
+                    .total_size_in_bytes();
+        }
+        MIDOUT_END();
+    }
+    return 0;
+}
+
+SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
+        ConvBiasImpl* opr, const NCBKernSizeParam& param) const {
+    SmallVector<NCBKern> ret_kern;
+
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    size_t OC = param.filter_meta.ocpg;
+    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
+    size_t GROUP = param.filter_meta.group;
+    size_t BATCH = param.n;
+    size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
+
+    auto matmul_param =
+            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
+    WorkspaceBundle whole_bundle = {nullptr, {}};
+    WorkspaceBundle thread_bundle = {nullptr, {}};
+    WorkspaceBundle matmul_bundle = {nullptr, {}};
+
+    auto pack_mode = m_matmul_algo->packmode();
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
+        }
+        MIDOUT_END();
+    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
+        }
+        MIDOUT_END();
+    } else {
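+        //! NO_PACK: the matmul reads the source and filter directly, so
+        //! only the matmul compute workspace is recorded in matmul_bundle.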
+        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
+                    dispatcher;
+            whole_bundle = dispatcher.get_bundle(
+                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
+            matmul_bundle = {
+                    nullptr,
+                    {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
+        }
+        MIDOUT_END();
+    }
+
+    //! get thread bundle
+    thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
+                                      compt_oc_block_size);
+
+    Conv1x1StrategyBase* conv1x1_strategy =
+            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
+                                                  opr->param().format);
+
+    auto kern_packA = [this, whole_bundle, matmul_bundle, param,
+                       compt_oc_block_size, conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->packA(whole_bundle, matmul_bundle,
+                                compt_oc_block_size, this->m_matmul_algo,
+                                param, ncb_param, std::move(ncb_index));
+    };
+    auto kern_packB = [this, whole_bundle, matmul_bundle, param,
+                       conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->packB(whole_bundle, matmul_bundle,
+                                this->m_matmul_algo, param, ncb_param,
+                                std::move(ncb_index));
+    };
+    auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param,
+                       compt_oc_block_size, conv1x1_strategy](
+                              const NCBKernParam& ncb_param,
+                              const NCBKernIndex& ncb_index) mutable {
+        conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
+                               compt_oc_block_size, this->m_matmul_algo, param,
+                               ncb_param, std::move(ncb_index));
+    };
+
+    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
+        pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
+        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+            ret_kern.push_back({kern_packB, {1}});
+        }
+    }
+    ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
+
+    return ret_kern;
+}
+
+bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
+                                       const NCBKernSizeParam& param,
+                                       AlgoSelectionStrategy) const {
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
+        //! only support nchw format
+        if (opr->param().format != param::ConvBias::Format::NCHW)
+            return false;
+
+        size_t FH = param.filter_meta.spatial[0],
+               FW = param.filter_meta.spatial[1];
+        size_t PH = param.filter_meta.padding[0],
+               PW = param.filter_meta.padding[1];
+        size_t SH = param.filter_meta.stride[0],
+               SW = param.filter_meta.stride[1];
+
+        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
+            return false;
+
+        //! for 8x8x16 and 8x8x32, the bias mode must be NO_BIAS and the
+        //! nonlineMode must be IDENTITY; otherwise return false, since
+        //! 8x8x32 and 8x8x16 do not support PostProcess
+        if (param.src_type.enumv() == param.filter_type.enumv() &&
+            (param.src_type.enumv() == DTypeEnum::Int8 &&
+             (param.dst_type.enumv() == DTypeEnum::Int16 ||
+              param.dst_type.enumv() == DTypeEnum::Int32)) &&
+            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
+             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
+            return false;
+
+        if (param.src_type.enumv() == param.filter_type.enumv() &&
+            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 ||
+              param.src_type.enumv() == DTypeEnum::Quantized8Asymm) &&
+             param.dst_type.enumv() == DTypeEnum::QuantizedS32) &&
+            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
+             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
+            return false;
+
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(
+                param, OH * OW, get_oc_tile_size_heuristic(param));
+
+        bool matmul_usable = m_matmul_algo->usable(matmul_param);
+        return matmul_usable &&
+               (param.filter_meta.dilation[0] ==
+                        param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
+    }
+    MIDOUT_END();
+    return false;
+}
diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.h b/dnn/src/fallback/conv_bias/conv1x1/algos.h
new file mode 100644
index 000000000..fb3bdb66b
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.h
@@ -0,0 +1,56 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/algos.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megdnn/thin/small_vector.h"
+#include "src/common/utils.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+#include "src/fallback/matrix_mul/opr_impl.h"
+
+namespace megdnn {
+namespace fallback {
+
+class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
+public:
+    AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
+            : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}
+
+    bool is_reproducible() const override { return true; }
+
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ssprintf("CONV1x1:%s:%zu", m_matmul_algo->name(),
+                              m_oc_block_size);
+        }
+        return m_name.c_str();
+    }
+
+    bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    SmallVector<NCBKern> dispatch_kerns(
+            ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;
+
+protected:
+    size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;
+
+private:
+    MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    mutable size_t m_oc_block_size = 0;
+};
+
+}  // namespace fallback
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
new file mode 100644
index 000000000..581d2d69b
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
@@ -0,0 +1,99 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+namespace {
+//! get_thread_bundle
+WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                                  size_t matmul_c_size, size_t oc_tile_size) {
+    //! for some cases, the matmul result needs temp space to store
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+    size_t matmul_dst_bytes_per_thread =
+            is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0;
+    return WorkspaceBundle{nullptr,
+                           {matmul_c_size, matmul_dst_bytes_per_thread}};
+}
+}  // anonymous namespace
+
+template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
+class Conv1x1Kerns {
+public:
+    //! get_bundle
+    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                               const MatrixMulImpl::KernSizeParam& matmul_param,
+                               const MatrixMulImpl::AlgoBase* matmul_algo,
+                               size_t oc_tile_size) {
+        size_t GROUP = param.filter_meta.group;
+        size_t OC = param.filter_meta.ocpg;
+        size_t BATCH = param.n;
+
+        //! bundle per thread
+        //! matmul_param records a matmul with M = oc_tile_size, K = IC,
+        //! N = OH * OW; this does not affect the packB size
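+        //! whole bundle layout: slot 0 = packed A (all groups and oc tiles),
+        //! slot 1 = packed B (all batches and groups), slot 2 = per-thread
+        //! compute space.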
+        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
+        auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
+                                               oc_tile_size);
+
+        //! size per thread
+        size_t all_threads_bytes =
+                thread_bundle.total_size_in_bytes() * param.nr_threads;
+
+        //! packa size = GROUP * packa_size_each_group
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t all_packa_bytes =
+                packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
+
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
+            return WorkspaceBundle{nullptr,
+                                   {all_packa_bytes, 0, all_threads_bytes}};
+
+        //! packb size = N * GROUP * packb_size_per_group
+        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+        size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;
+
+        return WorkspaceBundle{
+                nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
+    }
+};
+
+template <>
+class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> {
+public:
+    //! get_bundle
+    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
+                               const MatrixMulImpl::KernSizeParam& matmul_param,
+                               const MatrixMulImpl::AlgoBase* matmul_algo,
+                               size_t oc_tile_size) {
+        size_t matmul_size = matmul_algo->get_workspace(matmul_param);
+        auto thread_bundle =
+                get_thread_bundle(param, matmul_size, oc_tile_size);
+        //! size per thread
+        size_t all_threads_bytes =
+                thread_bundle.total_size_in_bytes() * param.nr_threads;
+        return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}};
+    }
+};
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
new file mode 100644
index 000000000..05322417c
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
@@ -0,0 +1,214 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include <unordered_map>
+#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+
+#include "midout.h"
+
+MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy)
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+namespace {
+
+struct StrategyHashParam {
+    ConvBiasImpl::NCBKernSizeParam param;
+    param::ConvBias::Format format;
+    MatrixMulImpl::AlgoBase::PackMode packmode;
+};
+
+struct StrategyHashParamHash {
+    std::size_t operator()(const StrategyHashParam& sparam) const {
+        constexpr size_t base = 1;  //! avoid a hash key of zero
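+        //! combine the enum fields by shifting each one into a distinct bit
+        //! range before XOR-ing, so different field combinations map to
+        //! different keys.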
+        std::size_t result =
+                static_cast<std::size_t>(sparam.param.src_type.enumv()) + base;
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.dst_type.enumv()) +
+                   base)
+                  << 3);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.filter_type.enumv()) +
+                   base)
+                  << 6);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.param.bias_type.enumv()) +
+                   base)
+                  << 9);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.format) + base) << 12);
+        result = result ^
+                 ((static_cast<std::size_t>(sparam.packmode) + base) << 15);
+        return result;
+    };
+};
+
+struct StrategyHashParamEqual {
+    bool operator()(const StrategyHashParam& param1,
+                    const StrategyHashParam& param2) const {
+        bool flags = true;
+        flags = param1.param.src_type == param2.param.src_type && flags;
+        flags = param1.param.filter_type == param2.param.filter_type && flags;
+        flags = param1.param.bias_type == param2.param.bias_type && flags;
+        flags = param1.param.dst_type == param2.param.dst_type && flags;
+        flags = param1.format == param2.format && flags;
+        flags = param1.packmode == param2.packmode && flags;
+        return flags;
+    };
+};
+
+std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode,
+        param::ConvBias::Format format) {
+    MEGDNN_MARK_USED_VAR(format);
+
+#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag) \
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,               \
+                 midout_iv(_midout_tag)) {                               \
+        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) {       \
+            return std::make_unique<                                     \
+                    Conv1x1Strategy<_dt, _dt, _dt, _post_ctype,          \
+                                    _post_ctype, _postprocess_mode,      \
+                                    _packmode>>();                       \
+        }                                                                \
+    }                                                                    \
+    MIDOUT_END()
+
+#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \
+            _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag)       \
+    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                 \
+                 midout_iv(_midout_tag)) {                                 \
+        if (param.filter_type.enumv() == param.src_type.enumv() &&         \
+            param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv &&    \
+            param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) {    \
+            return std::make_unique<                                       \
+                    Conv1x1Strategy<_src_ctype, _bias_ctype, _dst_ctype,   \
+                                    DTypeTrait<_i_bias_type>::ctype,       \
+                                    DTypeTrait<_i_dst_type>::ctype,        \
+                                    _postprocess_mode, _packmode>>();      \
+        }                                                                  \
+    }                                                                      \
+    MIDOUT_END()
+
+    switch (pack_mode) {
+        case MatrixMulImpl::AlgoBase::PackMode::DEFAULT:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "Default::FLOAT"_hash);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16, __fp16,
+                PostprocessMode::FLOAT, "Default::FLOAT16_FP16"_hash);
+#else
+#if !MEGDNN_DISABLE_FLOAT16
+            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16,
+                dt_float16, PostprocessMode::NO_PROCESS,
+                "Default::FLOAT16_FLOAT16"_hash);
+#endif
+#endif
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int32,
+                dt_int32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS, "Default::INT8x8x32_INT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int16,
+                dt_int16, dt_int8, dt_int16, dt_int16,
+                PostprocessMode::NO_PROCESS, "Default::INT8x8x16_INT16"_hash);
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
+                dtype::Quantized8Asymm, dtype::QuantizedS32,
+                dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS,
+                "Default::QUINT8x8x32_QINT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
+                dtype::Quantized8Asymm, dtype::QuantizedS32,
+                dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8,
+                PostprocessMode::QUANTIZED,
+                "Default::QUINT8x8x32_QUINT8"_hash);
+#endif
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
+                dt_int32, PostprocessMode::NO_PROCESS,
+                "Default::QINT8x8x32_QINT32"_hash);
+            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
+                dt_int8, PostprocessMode::QUANTIZED,
+                "Default::QINT8x8x32_QINT8"_hash);
+            break;
+
+        case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "OnlyPackA::FLOAT"_hash);
+            break;
+
+        case MatrixMulImpl::AlgoBase::PackMode::NO_PACK:
+            cb1(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_float32,
+                dt_float32, PostprocessMode::FLOAT, "NoPack::FLOAT"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int16,
+                dt_int16, dt_int8, dt_int16, dt_int16,
+                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x16_INT16"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int32,
+                dt_int32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
+
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
+                dtype::QuantizedS8, dtype::QuantizedS32,
+                dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
+                PostprocessMode::NO_PROCESS,
+                "NoPack::QINT8x8x32_QINT32"_hash);
+            break;
+
+        default:
+            megdnn_throw("Invalid Pack Mode");
+            break;
+    }
+#undef cb1
+#undef cb2
+    megdnn_throw("Invalid Data Type");
+    return nullptr;
+}
+
+class StrategyDelegationStorage {
+public:
+    Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param,
+                             MatrixMulImpl::AlgoBase::PackMode pack_mode,
+                             param::ConvBias::Format format) {
+        MEGDNN_LOCK_GUARD(m_mtx);
+        StrategyHashParam sparam;
+        sparam.param = param;
+        sparam.format = format;
+        sparam.packmode = pack_mode;
+        if (m_map_strategies.find(sparam) == m_map_strategies.end()) {
+            auto strategy = create_conv1x1_strategy(param, pack_mode, format);
+            m_map_strategies[sparam] = std::move(strategy);
+        }
+        return m_map_strategies[sparam].get();
+    }
+
+private:
+    std::mutex m_mtx;
+    std::unordered_map<StrategyHashParam,
+                       std::unique_ptr<Conv1x1StrategyBase>,
+                       StrategyHashParamHash, StrategyHashParamEqual>
+            m_map_strategies;
+};
+
+}  // anonymous namespace
+
+Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode,
+        param::ConvBias::Format format) {
+    static StrategyDelegationStorage storage;
+    return storage.get(param, pack_mode, format);
+}
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
new file mode 100644
index 000000000..7bb6028b4
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
@@ -0,0 +1,310 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "megdnn/opr_param_defs.h"
+#include "src/fallback/conv_bias/opr_impl.h"
+#if MEGDNN_X86
+#include "src/x86/conv_bias/postprocess_helper.h"
+#endif
+
+namespace megdnn {
+namespace fallback {
+namespace conv1x1 {
+
+#if MEGDNN_X86
+using namespace x86;
+#endif
+
+namespace {
+
+//! get_matmul_kern_param
+MatrixMulImpl::KernSizeParam get_matmul_kern_param(
+        const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) {
+    size_t M = m;
+    size_t N = n;
+    size_t K = param.filter_meta.icpg;  //! K = IC
+    size_t LDA = K, LDB = N, LDC = N;
+    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+    return {param.filter_type,
+            param.src_type,
+            is_dst_8bit ? param.bias_type : param.dst_type,
+            M,
+            N,
+            K,
+            LDA,
+            LDB,
+            LDC,
+            false,
+            false,
+            param::MatrixMul::ComputeMode::DEFAULT,
+            param::MatrixMul::Format::DEFAULT};
+}
+}  // namespace
+
+class Conv1x1StrategyBase {
+public:
+    virtual void packA(WorkspaceBundle& whole_bundle,
+                       WorkspaceBundle& matmul_bundle,
+                       size_t oc_tile_size,
+                       const MatrixMulImpl::AlgoBase* matmul_algo,
+                       const ConvBiasImpl::NCBKernSizeParam& param,
+                       const ConvBiasImpl::NCBKernParam& ncb_param,
+                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+
+    virtual void packB(WorkspaceBundle& whole_bundle,
+                       WorkspaceBundle& matmul_bundle,
+                       const MatrixMulImpl::AlgoBase* matmul_algo,
+                       const ConvBiasImpl::NCBKernSizeParam& param,
+                       const ConvBiasImpl::NCBKernParam& ncb_param,
+                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+
+    virtual void exec(WorkspaceBundle& whole_bundle,
+                      WorkspaceBundle& matmul_bundle,
+                      WorkspaceBundle& thread_bundle,
+                      size_t oc_tile_size,
+                      const MatrixMulImpl::AlgoBase* matmul_algo,
+                      const ConvBiasImpl::NCBKernSizeParam& param,
+                      const ConvBiasImpl::NCBKernParam& ncb_param,
+                      const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
+    virtual ~Conv1x1StrategyBase() = default;
+};
+
+template <typename src_ctype, typename bias_ctype, typename dst_ctype,
+          typename op_ctype, typename op_dtype,
+          megdnn::PostprocessMode postprocess_mode,
+          MatrixMulImpl::AlgoBase::PackMode pack_mode>
+class Conv1x1Strategy : public Conv1x1StrategyBase {
+public:
+    void packA(WorkspaceBundle& whole_bundle,
+               WorkspaceBundle& matmul_bundle,
+               size_t oc_tile_size,
+               const MatrixMulImpl::AlgoBase* matmul_algo,
+               const ConvBiasImpl::NCBKernSizeParam& param,
+               const ConvBiasImpl::NCBKernParam& ncb_param,
+               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
+            megdnn_log_error("NoPack mode has no packA kernel");
+            return;
+        }
+
+        whole_bundle.set(ncb_param.workspace_ptr);
+
+        //! packa size per group
+        size_t OC = param.filter_meta.ocpg;
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t packa_bytes_per_group =
+                oc_tiles_per_group * packa_bytes_per_oc_tile;
+
+        size_t group_id = ncb_index.ndrange_id[0];
+        size_t oc_tile_id_in_group = ncb_index.ndrange_id[1];
+
+        size_t oc_start = oc_tile_id_in_group * oc_tile_size;
+        size_t oc_end = oc_start + oc_tile_size;
+        oc_end = (oc_end <= OC ? oc_end : OC);
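+        //! the last tile of a group may be narrower than oc_tile_size, so
+        //! only the valid oc_end - oc_start rows of A are packed.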
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        size_t IC = param.filter_meta.icpg;
+        MatrixMulImpl::KernParam matmul_kern_param;
+        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
+
+        size_t bytes_offset_of_a_panel =
+                group_id * packa_bytes_per_group +
+                oc_tile_id_in_group * packa_bytes_per_oc_tile;
+        size_t numbers_offset_of_filter =
+                oc_tile_size * IC * oc_tile_id_in_group;
+
+        src_ctype* a_panel = reinterpret_cast<src_ctype*>(
+                reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
+                bytes_offset_of_a_panel);
+        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
+                ncb_param.filter<src_ctype>(group_id) +
+                numbers_offset_of_filter);
+        matmul_algo->pack_A(matmul_kern_param, a_panel, 0,
+                            oc_end - oc_start);
+    }
+
+    void packB(WorkspaceBundle& whole_bundle,
+               WorkspaceBundle& matmul_bundle,
+               const MatrixMulImpl::AlgoBase* matmul_algo,
+               const ConvBiasImpl::NCBKernSizeParam& param,
+               const ConvBiasImpl::NCBKernParam& ncb_param,
+               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
+            whole_bundle.set(ncb_param.workspace_ptr);
+
+            //! packb size per group
+            size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+
+            size_t GROUP = param.filter_meta.group;
+            size_t BATCH = param.n;
+            size_t SH = param.filter_meta.stride[0];
+            size_t SW = param.filter_meta.stride[1];
+            size_t OH = param.osz[0];
+            size_t OW = param.osz[1];
+            size_t OC = param.filter_meta.ocpg;
+
+            MatrixMulImpl::KernParam matmul_kern_param;
+            static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                    get_matmul_kern_param(param, OH * OW, OC);
+
+            rep(batch, BATCH) {
+                rep(g, GROUP) {
+                    if (SH == 2 && SW == 2)
+                        megdnn_throw("no support for stride = 2");
+
+                    size_t bytes_offset_of_b_panel =
+                            batch * packb_bytes_per_group * GROUP +
+                            g * packb_bytes_per_group;
+                    src_ctype* b_panel = reinterpret_cast<src_ctype*>(
+                            reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
+                            bytes_offset_of_b_panel);
+                    matmul_kern_param.B_ptr = const_cast<src_ctype*>(
+                            ncb_param.src<src_ctype>(batch, g));
+                    matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
+                }
+            }
+        } else {
+            megdnn_log_error(
+                    "OnlyPackA mode and NoPack mode have no packB kernel");
+        }
+    }
+
+    void exec(WorkspaceBundle& whole_bundle,
+              WorkspaceBundle& matmul_bundle,
+              WorkspaceBundle& thread_bundle,
+              size_t oc_tile_size,
+              const MatrixMulImpl::AlgoBase* matmul_algo,
+              const ConvBiasImpl::NCBKernSizeParam& param,
+              const ConvBiasImpl::NCBKernParam& ncb_param,
+              const ConvBiasImpl::NCBKernIndex& ncb_index) override {
+        whole_bundle.set(ncb_param.workspace_ptr);
+        size_t OC = param.filter_meta.ocpg;
+        size_t IC = param.filter_meta.icpg;
+
+        //! packa bytes per group
+        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
+        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
+        size_t packa_bytes_per_group =
+                packa_bytes_per_oc_tile * oc_tiles_per_group;
+
+        //! packb bytes per group
+        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
+
+        //! matmul bytes per thread
+        size_t matmul_bytes_per_thread = thread_bundle.get_size(0);
+
+        size_t batch_id = ncb_index.ndrange_id[0];
+        size_t group_id = ncb_index.ndrange_id[1];
+        size_t oc_tile_id_in_group = ncb_index.ndrange_id[2];
+        size_t thread_id = ncb_index.thread_id;
+
+        size_t GROUP = param.filter_meta.group;
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        size_t oc_start = oc_tile_size * oc_tile_id_in_group;
+        size_t oc_end = oc_start + oc_tile_size;
+        oc_end = (oc_end <= OC ? oc_end : OC);
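+        //! each kern instance computes one (batch, group, oc tile) block:
+        //! an (oc_end - oc_start) x (OH * OW) matmul.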
+        MatrixMulImpl::KernParam matmul_kern_param;
+        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
+                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
+
+        size_t bytes_offset_of_a_panel =
+                group_id * packa_bytes_per_group +
+                oc_tile_id_in_group * packa_bytes_per_oc_tile;
+        int8_t* a_panel = reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
+                          bytes_offset_of_a_panel;
+
+        size_t bytes_offset_of_b_panel =
+                batch_id * packb_bytes_per_group * GROUP +
+                group_id * packb_bytes_per_group;
+        int8_t* b_panel = reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
+                          bytes_offset_of_b_panel;
+
+        size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id;
+        size_t bytes_offset_of_matmul_dst_this_thread =
+                thread_offset + thread_bundle.get_size(0);
+        int8_t* matmul_temp_dst =
+                reinterpret_cast<int8_t*>(whole_bundle.get(2)) +
+                bytes_offset_of_matmul_dst_this_thread;
+
+        size_t numbers_of_ncb_dst_offset =
+                oc_tile_size * OH * OW * oc_tile_id_in_group;
+        void* conv_bias_dst = static_cast<void*>(
+                ncb_param.dst<dst_ctype>(batch_id, group_id) +
+                numbers_of_ncb_dst_offset);
+
+        size_t numbers_of_ncb_filter_offset =
+                oc_tile_size * IC * oc_tile_id_in_group;
+        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
+                ncb_param.filter<src_ctype>(group_id) +
+                numbers_of_ncb_filter_offset);
+
+        matmul_kern_param.B_ptr = const_cast<src_ctype*>(
+                ncb_param.src<src_ctype>(batch_id, group_id));
+
+        matmul_kern_param.workspace_ptr =
+                reinterpret_cast<int8_t*>(whole_bundle.get(2)) + thread_offset;
+        matmul_kern_param.workspace_size = matmul_bytes_per_thread;
+
+        bool is_dst_8bit =
+                (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+                 param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
+                (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
+                 param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
+        void* matmul_dst = is_dst_8bit ? matmul_temp_dst : conv_bias_dst;
+
+        matmul_kern_param.C_ptr = matmul_dst;
+
+        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
+            auto matmul_kern = matmul_algo->get_kern(matmul_kern_param);
+            matmul_kern(matmul_kern_param);
+        } else {
+            auto matmul_kern_naked =
+                    matmul_algo->get_kern_naked(matmul_kern_param);
+            matmul_kern_naked(matmul_kern_param, a_panel, b_panel);
+        }
+        //! do postprocess
+        void* bias_ptr = nullptr;
+        if (param.bias_mode == megdnn::BiasMode::BIAS)
+            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
+                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
+                    numbers_of_ncb_dst_offset));
+        else
+            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
+                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
+                    oc_start));
+        PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
+                matmul_dst, bias_ptr, conv_bias_dst, param.bias_mode,
+                param.nonlineMode, param.bias_type, param.dst_type, 1_z,
+                oc_end - oc_start, OH, OW);
+    }
+};
+
+class Conv1x1Factory {
+public:
+    static Conv1x1StrategyBase* make_conv1x1_strategy(
+            const ConvBiasImpl::NCBKernSizeParam& param,
+            MatrixMulImpl::AlgoBase::PackMode pack_mode,
+            param::ConvBias::Format format);
+};
+
+}  // namespace conv1x1
+}  // namespace fallback
+}  // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp
index 4619941df..bbf8155e5 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -15,6 +15,7 @@
 #include "src/common/opr_delegate.h"
 #include "src/common/utils.h"
 #include "src/fallback/conv_bias/algos.h"
+#include "src/fallback/conv_bias/conv1x1/algos.h"
 #include "src/fallback/conv_bias/im2col/algos.h"
 #include "src/fallback/conv_bias/opr_impl.h"
 #include "src/naive/convolution/algorithms.h"
@@ -54,7 +55,13 @@ public:
                 ohw_tile_size));
         all_algos.emplace_back(refhold.back().get());
     }
-#if 1
+    for (size_t oc_tile_size : {24, 48}) {
+        refhold.emplace_back(new AlgoConv1x1(
+                static_cast<MatrixMulImpl::AlgoBase*>(algo),
+                oc_tile_size));
+        all_algos.emplace_back(refhold.back().get());
+    }
+#if 0
     //! As these algos maybe very slow, it will make fastrun search slow, so
     //! we disable it, but for the test of strategyhelper, we just keep it.
     //! FIXME: I do not know a better way to do it.
diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h
index c4d081bc8..76434f56f 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.h
+++ b/dnn/src/fallback/conv_bias/opr_impl.h
@@ -248,6 +248,7 @@ protected:
 private:
     class AlgoNaive;
     class AlgoIm2col;
+    class AlgoConv1x1;
     class AlgoWinogradF32;
     class AlgoWinogradF32_4x4;
     class AlgoWinogradQS8;
diff --git a/dnn/src/x86/matrix_mul/algos.cpp b/dnn/src/x86/matrix_mul/algos.cpp
index af1c5aa8b..6e0d8db21 100644
--- a/dnn/src/x86/matrix_mul/algos.cpp
+++ b/dnn/src/x86/matrix_mul/algos.cpp
@@ -438,7 +438,6 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace(
                    m, n, k, trans_a, trans_b, strategy, cacheline)
             .get_workspace_size();
 }
-
 MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
         AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9,
         x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16);
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 1cc8728bf..c95c340c1 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -875,6 +875,82 @@ std::vector<TestArg> get_conv_bias_args(
     return args;
 }
 
+std::vector<TestArg> get_conv_bias_1x1_args(
+        bool no_bias, bool no_nonlinemode, bool quantized_nlmod,
+        bool only_broadcast_bias) {
+    using namespace conv_bias;
+    using Param = param::ConvBias;
+    using NLMode = param::ConvBias::NonlineMode;
+    using CONVMode = param::ConvBias::Mode;
+    std::vector<TestArg> args;
+
+    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
+                    size_t stride, NLMode nlmode, CONVMode convmode) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = 0;
+        param.pad_w = 0;
+
+        param.mode = convmode;
+        param.nonlineMode = nlmode;
+
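+        //! dense case: src {n, ic, h, w} with filter {oc, ic, 1, 1};
+        //! bias {1, oc, 1, 1} broadcasts over batch and spatial dims.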
+        args.emplace_back(param, TensorShape{n, ic, h, w},
+                          TensorShape{oc, ic, 1, 1}, TensorShape{});
+        if (!no_bias) {
+            args.emplace_back(param, TensorShape{n, ic, h, w},
+                              TensorShape{oc, ic, 1, 1},
+                              TensorShape{1, oc, 1, 1});
+
+            if (!only_broadcast_bias) {
+                args.emplace_back(param, TensorShape{n, ic, h, w},
+                                  TensorShape{oc, ic, 1, 1},
+                                  TensorShape{n, oc, (h - 1) / stride + 1,
+                                              (w - 1) / stride + 1});
+            }
+        }
+
+        param.sparse = param::ConvBias::Sparse::GROUP;
+
+        args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, 1, 1}, TensorShape{});
+        if (!no_bias) {
+            args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                              TensorShape{2, oc, ic, 1, 1},
+                              TensorShape{1, 2 * oc, 1, 1});
+
+            if (!only_broadcast_bias) {
+                args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
+                                  TensorShape{2, oc, ic, 1, 1},
+                                  TensorShape{n, 2 * oc, (h - 1) / stride + 1,
+                                              (w - 1) / stride + 1});
+            }
+        }
+    };
+
+    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
+    if (!no_nonlinemode) {
+        nonlinemode.emplace_back(NLMode::RELU);
+        nonlinemode.emplace_back(NLMode::H_SWISH);
+        if (!quantized_nlmod) {
+            nonlinemode.emplace_back(NLMode::SIGMOID);
+        }
+    }
+
+    std::vector<CONVMode> convmodes{param::ConvBias::Mode::CONVOLUTION,
+                                    param::ConvBias::Mode::CROSS_CORRELATION};
+
+    for (size_t n : {1, 2})
+        for (size_t oc : {1, 9, 33})
+            for (size_t ic : {1, 16, 64})
+                for (size_t size : {7, 14, 28})
+                    for (auto nlmode : nonlinemode)
+                        for (auto convmode : convmodes) {
+                            pack(n, oc, ic, size, size, 1, nlmode, convmode);
+                        }
+    return args;
+}
+
 void check_conv_bias(std::vector<TestArg> args, Handle* handle,
                      const char* algo_name) {
     using namespace conv_bias;
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index b77222199..9ef78d6dd 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -76,6 +76,10 @@ std::vector<TestArg> get_conv_bias_args(
         bool no_nonlinemode, bool quantized_nlmod = false,
         bool only_broadcast_bias = false);
 
+std::vector<TestArg> get_conv_bias_1x1_args(
+        bool no_bias, bool no_nonlinemode, bool quantized_nlmod = false,
+        bool only_broadcast_bias = false);
+
 void check_conv_bias(std::vector<TestArg> args, megdnn::Handle* handle,
                      const char* algo_name);
 
diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp
index 7511d4d3f..950aab126 100644
--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -919,6 +919,79 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
 #undef cb
 }
 
+
+/**************************** Conv1x1 PackA *************************/
+namespace {
+void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
+                       RNG* rng, float epsilon, DType type0, DType type1,
+                       DType type2, DType type3, const char* algo_name) {
+    using namespace conv_bias;
+
+    Checker<ConvBias> checker(handle);
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
+    checker.set_dtype(0, type0);
+    checker.set_dtype(1, type1);
+    checker.set_dtype(2, type2);
+    checker.set_dtype(4, type3);
+    checker.set_epsilon(epsilon);
+    if (NULL != rng) {
+        checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
+    }
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).execs(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+}  // namespace
+
+#if MEGDNN_X86_WITH_MKL
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
+}
+
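+// CONV1x1 algorithm names follow "CONV1x1:<matmul algo>:<oc_block_size>"
+// (see AlgoConv1x1::name()), so each test pins both the matmul backend and
+// the OC tile size.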
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
+}
+#endif
+
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
+    using namespace conv_bias;
+    UniformIntRNG rng{-50, 50};
+    float epsilon = 0.001;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_MKLDNN:24");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_VNNI:24");
+    }
+#endif
+    if (x86::is_supported(x86::SIMDType::AVX2)) {
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
+        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                          "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
+    }
+    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
+                      dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
+                      "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
+}
+/************************* End Conv1x1 PackA ************************/
+
 #endif
 
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
-- 
GitLab