From df356635b74d422cd745a01dc4fc3046a02f9950 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 14 Aug 2020 19:58:58 +0800
Subject: [PATCH] fix(mgb/fallback): remove duplicated im2col code and fix the nchw44 usable check

GitOrigin-RevId: 1aa250e9e715639364746144139d712edd610c6e
---
 .../arm_common/conv_bias/postprocess_helper.h |  38 +-
 dnn/src/fallback/conv_bias/im2col/algos.cpp   | 527 +++---------------
 dnn/src/fallback/conv_bias/im2col/factory.h   |  16 +-
 .../fallback/conv_bias/im2col/im2col_kerns.h  | 364 ++++++++++++
 .../conv_bias/im2col/strategy_default.cpp     |   3 +-
 .../im2col/strategy_default_nchw44.cpp        |   3 +-
 .../conv_bias/im2col/strategy_nopack.cpp      |   3 -
 7 files changed, 459 insertions(+), 495 deletions(-)
 create mode 100644 dnn/src/fallback/conv_bias/im2col/im2col_kerns.h

diff --git a/dnn/src/arm_common/conv_bias/postprocess_helper.h b/dnn/src/arm_common/conv_bias/postprocess_helper.h
index bcfa718c..539a105f 100644
--- a/dnn/src/arm_common/conv_bias/postprocess_helper.h
+++ b/dnn/src/arm_common/conv_bias/postprocess_helper.h
@@ -100,6 +100,7 @@ namespace {
             MIDOUT_END();                                                  \
             break;                                                         \
         default:                                                           \
+            megdnn_throw("unknown bias mode");                             \
             break;                                                         \
     }

@@ -282,24 +283,25 @@ struct PostProcess {
             reinterpret_cast<ctype*>(dst_ptr), bias_type, bias_type,       \
             dst_type, N* OC* OH* OW* pack_oc_size);

-#define FOR_BIAS(_bias_mode, OH, OW)                                   \
-    switch (_bias_mode) {                                              \
-        case megdnn::BiasMode::NO_BIAS:                                \
-            break;                                                     \
-        case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS:                 \
-            if (pack_oc_size == 1) {                                   \
-                FOR_BINARY_BROADCAST(CONCAT_OP(AddOp));                \
-            } else {                                                   \
-                megdnn_assert(pack_oc_size == 4,                       \
-                              "Only support nchw44 in ARM");           \
-                FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp));         \
-            }                                                          \
-            break;                                                     \
-        case megdnn::BiasMode::BIAS:                                   \
-            FOR_BINARY(CONCAT_OP(AddOp));                              \
-            break;                                                     \
-        default:                                                       \
-            break;                                                     \
+#define FOR_BIAS(_bias_mode, OH, OW)                                   \
+    switch (_bias_mode) {                                              \
+        case megdnn::BiasMode::NO_BIAS:                                \
+            break;                                                     \
+        case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS:                 \
+            if (pack_oc_size == 1) {                                   \
+                FOR_BINARY_BROADCAST(CONCAT_OP(AddOp));                \
+            } else {                                                   \
+                megdnn_assert(pack_oc_size == 4,                       \
+                              "Only support nchw44 in ARM");           \
+                FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp));         \
+            }                                                          \
+            break;                                                     \
+        case megdnn::BiasMode::BIAS:                                   \
+            FOR_BINARY(CONCAT_OP(AddOp));                              \
+            break;                                                     \
+        default:                                                       \
+            megdnn_throw("unknown bias mode");                         \
+            break;                                                     \
     }

 template
diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp
index 0068ef4c..7d5fafc9 100644
--- a/dnn/src/fallback/conv_bias/im2col/algos.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp
@@ -10,6 +10,7 @@
  */

 #include "src/fallback/conv_bias/im2col/algos.h"
+#include "src/fallback/conv_bias/im2col/im2col_kerns.h"
 #include "src/fallback/conv_bias/im2col/factory.h"
 #include "megdnn/opr_param_defs.h"
 #include "src/common/opr_delegate.h"
@@ -25,278 +26,6 @@ using namespace megdnn;
 using namespace fallback;
 using namespace im2col;

-/*======================== AlgoIm2col=======================*/
-/*!
- * *\brief The index of all parts workspace in im2col workspace bundel
- * *Through witch can convenient get the needed ptr
- */
-struct Im2colBundelIndex {
-    static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
-};
-
-using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode;
-/*!
- * *\brief Im2colKerns collects all the im2col kerns in it
- */
-
-template <Pack_Mode packmode>
-class Im2colKerns;
-
-template <>
-class Im2colKerns<Pack_Mode::DEFAULT> {
-public:
-    //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - size_t pack_oc_size = pack_size(param.filter_meta.format); - size_t im2col = 0, packb = 0, bias_temp = 0; - bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; - megdnn_assert(default_pack, "only support default packa"); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size * - sizeof(param.bias_type); - //! matmul_dst and im2col_dst use the same memory - WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); - packb = wb.get_size(1); - im2col = std::max(im2col_dst_size, matmul_dst_size); - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - return {nullptr, {packb, im2col, bias_temp}}; - } -}; - -template <> -class Im2colKerns { -public: - //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - - size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; - bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; - megdnn_assert(only_packA, "onlysupport onlypackA mode"); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = - oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - //! matmul_dst and im2col_dst use the same memory - WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); - packb = wb.get_size(1); - im2col = im2col_dst_size; - matmul_dst = matmul_dst_size; - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - - return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; - } -}; - -template <> -class Im2colKerns { -public: - //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - size_t ohw = param.osz[0] * param.osz[1]; - - size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; - bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; - megdnn_assert(no_pack, "only support no pack"); - bool is_dst_8bit = - (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = - oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - im2col = im2col_dst_size; - if (is_dst_8bit) { - matmul_dst = matmul_dst_size; - } else { - matmul_dst = ohw_tile_size >= ohw ? 
0 : matmul_dst_size; - } - matmul_compute = matmul_algo->get_workspace(im2col_kern_param); - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - - return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; - } -}; - namespace { static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( const fallback::ConvBiasImpl::NCBKernSizeParam& param, @@ -451,7 +180,6 @@ static WorkspaceBundle get_bundle( MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size, size_t ohw_tile_size) { UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(OC); MEGDNN_MARK_USED_VAR(OH); MEGDNN_MARK_USED_VAR(OW); MEGDNN_MARK_USED_VAR(FH); @@ -506,8 +234,9 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace( m_matmul_algo->matmul_description(); size_t oc_tile_size = 0, ohw_tile_size = 0; choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size, - matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n, - m_ohw_tile_size, matmul_desc.packmode); + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size) .total_size_in_bytes(); } @@ -518,20 +247,13 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace( SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( const NCBKernSizeParam& param) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) { - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(SH); - MEGDNN_MARK_USED_VAR(SW); - MEGDNN_MARK_USED_VAR(IH); - MEGDNN_MARK_USED_VAR(IW); - MEGDNN_MARK_USED_VAR(FH); - MEGDNN_MARK_USED_VAR(FW); - size_t oc_tile_size = 0, ohw_tile_size = 0; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t OC = param.filter_meta.ocpg; size_t ohw = OH * OW; - size_t GROUP = param.filter_meta.group; - bool need_padding = (PH != 0 || PW != 0); + size_t oc_tile_size = 0, ohw_tile_size = 0; - fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = - m_matmul_algo->matmul_description(); + auto matmul_desc = m_matmul_algo->matmul_description(); bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK; @@ -542,12 +264,8 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( matmul_desc.innerblocksize.n, m_ohw_tile_size, matmul_desc.packmode); - WorkspaceBundle bundle = get_bundle(param,m_matmul_algo,oc_tile_size,ohw_tile_size); - size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); - size_t oc_parallel_times = div_ceil(OC, oc_tile_size); size_t packa_parallel_times = 0; size_t pack_oc_size = pack_size(param.filter_meta.format); - if (only_packA) { packa_parallel_times = div_ceil(OC, oc_tile_size); } else if (default_pack) { @@ -558,9 +276,12 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( auto matmul_param = get_matmul_kern_param( param, ohw_tile_size, default_pack ? 
OC : oc_tile_size); + WorkspaceBundle bundle = + get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); WorkspaceBundle bundle_thread = get_thread_bundle(param, m_matmul_algo, matmul_param, matmul_desc, oc_tile_size, ohw_tile_size); + StrategyParam strategyparam; strategyparam.ohw = ohw; strategyparam.is_dst_8bit = @@ -578,138 +299,39 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); SmallVector ret_kern; - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv("ConvBiasImpl::AlgoIm2col::dispatch_kerns"_hash)) { - StrategyBase* im2colstrategy = - Factory::get_im2col_strategy(param, m_matmul_algo); - auto kern_padding = [bundle, im2colstrategy, - pack_oc_size = pack_oc_size]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - im2colstrategy->copy_padding_kern(bundle, param, ncb_index, - pack_oc_size); - }; - - auto kern_packA = [bundle, matmul_algo = m_matmul_algo, - matmul_param, im2colstrategy, - strategyparam = strategyparam, - matmul_desc = matmul_desc]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - - im2colstrategy->packA_kern(bundle, param, matmul_param, - matmul_algo, ncb_index, matmul_desc, - strategyparam); - }; - if (default_pack) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) { - auto kern_compute_default = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - ohw_tile_size = ohw_tile_size, - strategyparam = strategyparam, - matmul_desc = matmul_desc, im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, - matmul_param, matmul_algo, matmul_desc, - strategyparam, ncb_index, ohw_tile_size, - im2colstrategy); - }; - if (!enable_filter_preprocess) { - ret_kern.push_back( - {kern_packA, {GROUP, packa_parallel_times}}); - } - if (need_padding) { - ret_kern.push_back( - {kern_padding, - {param.n, GROUP, IC / pack_oc_size}}); - } - ret_kern.push_back({kern_compute_default, - {N, GROUP, ohw_parallel_times, - oc_parallel_times}}); - return ret_kern; - } - MIDOUT_END(); - return {}; - } else if (only_packA) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) { - auto kern_compute_onlypackA = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - strategyparam = strategyparam, - ohw_tile_size = ohw_tile_size, - matmul_desc = matmul_desc, im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, - matmul_param, matmul_algo, matmul_desc, - strategyparam, ncb_index, ohw_tile_size, - im2colstrategy); - }; - if (!enable_filter_preprocess) { - ret_kern.push_back( - {kern_packA, {GROUP, packa_parallel_times}}); - } - if (need_padding) { - ret_kern.push_back( - {kern_padding, {param.n, GROUP, IC}}); - } - ret_kern.push_back({kern_compute_onlypackA, - {N, GROUP, ohw_parallel_times, - oc_parallel_times}}); - return ret_kern; - } - MIDOUT_END(); - return {}; - } else if (no_pack) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) { - auto kern_compute_nopack = - [bundle, bundle_thread, matmul_param, - matmul_algo = 
m_matmul_algo,
-                         strategyparam = strategyparam,
-                         ohw_tile_size = ohw_tile_size,
-                         matmul_desc = matmul_desc, im2colstrategy](
-                                const NCBKernParam& param,
-                                const NCBKernIndex& ncb_index) mutable {
-                            bundle.set(param.workspace_ptr);
-                            Im2colKerns<Pack_Mode::NO_PACK>::kerns(
-                                    bundle, bundle_thread, param,
-                                    matmul_param, matmul_algo, matmul_desc,
-                                    strategyparam, ncb_index, ohw_tile_size,
-                                    im2colstrategy);
-                        };
-                if (need_padding) {
-                    ret_kern.push_back(
-                            {kern_padding, {param.n, GROUP, IC}});
-                }
-                ret_kern.push_back({kern_compute_nopack,
-                                    {N, GROUP, ohw_parallel_times,
-                                     oc_parallel_times}});
-                return ret_kern;
-            }
-            MIDOUT_END();
-            return {};
-        }
-        MIDOUT_END();
+        StrategyBase* im2colstrategy =
+                Factory::get_im2col_strategy(param, m_matmul_algo);
+        if (default_pack) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_default_pack"_hash)) {
+                return Im2colKerns<Pack_Mode::DEFAULT>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        } else if (only_packA) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_onlypacka"_hash)) {
+                return Im2colKerns<Pack_Mode::ONLY_PACKA>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        } else if (no_pack) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_no_pack"_hash)) {
+                return Im2colKerns<Pack_Mode::NO_PACK>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        }
         return {};
     }
     MIDOUT_END();
@@ -721,23 +343,38 @@ bool ConvBiasImpl::AlgoIm2col::usable(
         AlgoSelectionStrategy /*algo_selection_strategy*/) const {
     MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 2) {
         auto format = param.filter_meta.format;
+        auto matmul_desc = m_matmul_algo->matmul_description();
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7
         if (format != param::ConvBias::Format::NCHW &&
-            format != param::ConvBias::Format::NCHW44_DOT &&
-            format != param::ConvBias::Format::NCHW44) {
+            format != param::ConvBias::Format::NCHW44 &&
+            format != param::ConvBias::Format::NCHW44_DOT) {
             return false;
         }
-
-        if(param.src_type.enumv() != param.filter_type.enumv()) {
+        if (format == param::ConvBias::Format::NCHW44 ||
+            format == param::ConvBias::Format::NCHW44_DOT) {
+            //! current NCHW44 im2col only support DEFAULT mode matmul
+            if (matmul_desc.packmode != Pack_Mode::DEFAULT) {
+                return false;
+                //! nchw44 hybrid mode and channel wise are not supported
+            } else if (param.filter_meta.icpg < 4_z ||
+                       param.filter_meta.icpg == 1 ||
+                       param.filter_meta.ocpg == 1) {
+                return false;
+            }
+        }
+#else
+        if (format != param::ConvBias::Format::NCHW) {
             return false;
         }
-
-        if (param.src_type.enumv() != DTypeEnum::Int8 &&
-            param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
-            param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
+#endif
+        if (param.src_type.enumv() != param.filter_type.enumv() ||
+            (param.src_type.enumv() != DTypeEnum::Int8 &&
+             param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
+             param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
 #if !MEGDNN_DISABLE_FLOAT16
-            param.src_type.enumv() != DTypeEnum::Float16 &&
+             param.src_type.enumv() != DTypeEnum::Float16 &&
 #endif
-            param.src_type.enumv() != DTypeEnum::Float32) {
+             param.src_type.enumv() != DTypeEnum::Float32)) {
             return false;
         }
         //! 
make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode is @@ -750,28 +387,6 @@ bool ConvBiasImpl::AlgoIm2col::usable( return false; } } - fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = - m_matmul_algo->matmul_description(); - //! only matmul's packmode is packa or default support weight preprocess - if (is_enable_filter_preprocess(param) && - (matmul_desc.packmode == - fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { - return false; - } - - if (format == param::ConvBias::Format::NCHW44 || - format == param::ConvBias::Format::NCHW44_DOT) { - //! current NCHW44 im2col only support DEFAULT mode matmul - if (matmul_desc.packmode != Pack_Mode::DEFAULT) { - return false; - //! nchw44 hybird mode and channel wise is not support - } else if (param.filter_meta.icpg < 4_z || - param.filter_meta.icpg == 1 || - param.filter_meta.ocpg == 1) { - return false; - } - } - size_t oc_tile_size = 0, ohw_tile_size = 0; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, matmul_desc.innerblocksize.m, @@ -798,10 +413,8 @@ bool ConvBiasImpl::AlgoIm2col::usable( SmallVector ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout( const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) { + MIDOUT_BEGIN(megdnn_fallback_im2col, + midout_iv("deduce_preprocessed_filter_layout"_hash)) { fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = m_matmul_algo->matmul_description(); @@ -863,8 +476,6 @@ ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns( packa_parallel_times = div_ceil(OC, matmul_desc.innerblocksize.m); } else { - //! if nopack return null so that OprWeightPreprocessProxy can run - //! with nopack mode return {}; } auto matmul_param = get_matmul_kern_param( diff --git a/dnn/src/fallback/conv_bias/im2col/factory.h b/dnn/src/fallback/conv_bias/im2col/factory.h index b48d4f0d..a5024b50 100644 --- a/dnn/src/fallback/conv_bias/im2col/factory.h +++ b/dnn/src/fallback/conv_bias/im2col/factory.h @@ -26,10 +26,9 @@ enum class StrategyType : uint32_t { FLOAT = 0, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC FLOAT_FP16 = 1, -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 FLOAT16_FLOAT16 = 2, -#endif #endif INT8x8x32 = 3, INT8x8x16 = 4, @@ -153,12 +152,10 @@ public: cb1(dt_float32, dt_float32, StrategyType::FLOAT); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16); -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16); #endif -#endif - cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, StrategyType::INT8x8x32); @@ -256,8 +253,7 @@ public: !param.filter_meta.should_flip) { MIDOUT_BEGIN( megdnn_fallback_im2col_factory_make_strategy, - midout_iv( - "DefaultStrategyType::8x12x1_fuse_packb_s2_nchw44"_hash)) { + midout_iv("8x12x1_fuse_packb_s2_nchw44"_hash)) { return std::make_unique< StrategyFuseXx12x1Nchw44K3x3S2< float, float, @@ -284,14 +280,13 @@ public: cb1(NCHW, DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT, "DefaultStrategyType::FLOAT_FP16"_hash); break; -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 case StrategyType::FLOAT16_FLOAT16: cb1(NCHW, DEFAULT, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, "DefaultStrategyType::FLOAT16_FLOAT16"_hash); break; -#endif #endif case StrategyType::INT8x8x32: if (format == param::ConvBias::Format::NCHW) { @@ -472,15 +467,12 @@ public: cb1(NCHW, NO_PACK, dt_float32, dt_float32, PostprocessMode::FLOAT, 
"NoPackStrategyType::FLOAT"_hash); break; -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#else #if !MEGDNN_DISABLE_FLOAT16 case StrategyType::FLOAT16_FLOAT16: cb1(NCHW, NO_PACK, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, "NoPackStrategyType::FLOAT16_FLOAT16"_hash); break; -#endif #endif case StrategyType::INT8x8x16: cb3(NCHW, NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8, diff --git a/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h b/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h new file mode 100644 index 00000000..1e080520 --- /dev/null +++ b/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h @@ -0,0 +1,364 @@ +/** + * \file dnn/src/fallback/conv_bias/im2col/im2col_kerns.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/fallback/conv_bias/opr_impl.h" +#include "src/naive/convolution/helper.h" +#include "src/fallback/conv_bias/im2col/factory.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_im2col) + +namespace megdnn { +namespace fallback { +namespace im2col { + +/*! + * *\brief The index of all parts workspace in im2col workspace bundel + * *Through witch can convenient get the needed ptr + */ +struct Im2colBundelIndex { + static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; +}; + +using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; +/*! + * *\brief Im2colKerns collects all the im2col kerns in it + */ +namespace{ +//! conv kernel +static void kerns( + const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, + const ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + StrategyParam strategyparam, + fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, + StrategyBase* im2colstrategy) { + size_t OC = param.filter_meta.ocpg; + size_t output_block_size = std::min( + ohw_tile_size, + strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); + size_t output_block_oc_size = + std::min(strategyparam.oc_tile_size, + OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); + + bundle_thread.set( + static_cast( + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); + + fallback::MatrixMulImpl::KernParam matmul_param; + static_cast(matmul_param) = + matmul_kernsize_param; + + strategyparam.batch_id = ncb_index.ndrange_id[0]; + strategyparam.group_id = ncb_index.ndrange_id[1]; + strategyparam.oc_cur_index = + ncb_index.ndrange_id[3] * strategyparam.oc_tile_size; + strategyparam.oc_end_index = + strategyparam.oc_cur_index + output_block_oc_size; + strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; + strategyparam.output_block_oc_size = output_block_oc_size; + strategyparam.output_block_size = output_block_size; + + //! 1.Im2col + im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, + matmul_param, matmul_algo); + + //! 2.packb and matmul compute + im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, + matmul_param, matmul_algo, ncb_index, + matmul_desc); + + //! 
3.postprocess and copy dst if need + im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); +} +} // namespace + +template +class Im2colKerns; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + + auto kern_packA = + [bundle, matmul_algo, matmul_param, im2colstrategy, + strategyparam = strategyparam, matmul_desc = matmul_desc]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->packA_kern(bundle, param, matmul_param, + matmul_algo, ncb_index, + matmul_desc, strategyparam); + }; + auto kern_compute_default = + [bundle, bundle_thread, matmul_param, matmul_algo, + ohw_tile_size, strategyparam, matmul_desc = matmul_desc, + im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t packa_parallel_times = + div_ceil(OC, matmul_desc.innerblocksize.m); + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); + } + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_default, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + size_t pack_oc_size = pack_size(param.filter_meta.format); + size_t im2col = 0, packb = 0, bias_temp = 0; + bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; + megdnn_assert(default_pack, "only support default packa"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size * + sizeof(param.bias_type); + //! 
matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = std::max(im2col_dst_size, matmul_dst_size); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + return {nullptr, {packb, im2col, bias_temp}}; + } +}; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + + auto kern_packA = + [bundle, matmul_algo, matmul_param, im2colstrategy, + strategyparam = strategyparam, matmul_desc = matmul_desc]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->packA_kern(bundle, param, matmul_param, + matmul_algo, ncb_index, + matmul_desc, strategyparam); + }; + auto kern_compute_onlypackA = + [bundle, bundle_thread, matmul_param, matmul_algo, + strategyparam, ohw_tile_size, matmul_desc, im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, oc_parallel_times}}); + } + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_onlypackA, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + + size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; + bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; + megdnn_assert(only_packA, "onlysupport onlypackA mode"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + //! 
matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = im2col_dst_size; + matmul_dst = matmul_dst_size; + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + + return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; + } +}; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + auto kern_compute_nopack = + [bundle, bundle_thread, matmul_param, matmul_algo, + strategyparam, ohw_tile_size, matmul_desc, im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_nopack, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + size_t ohw = param.osz[0] * param.osz[1]; + + size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; + bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; + megdnn_assert(no_pack, "only support no pack"); + bool is_dst_8bit = + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + im2col = im2col_dst_size; + if (is_dst_8bit) { + matmul_dst = matmul_dst_size; + } else { + matmul_dst = ohw_tile_size >= ohw ? 
0 : matmul_dst_size; + } + matmul_compute = matmul_algo->get_workspace(im2col_kern_param); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + + return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; + } +}; + +} // namespace im2col +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp index 4b5fb720..911c7d55 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp @@ -192,12 +192,11 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, megdnn::PostprocessMode::FLOAT) -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #if MEGDNN_AARCH64 || MEGDNN_ARMV7 //! x86 do not have uint8 matmul so only armv7 armv8 support uint8 diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp index 213a0193..ff4eab52 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp @@ -108,13 +108,12 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, megdnn::PostprocessMode::FLOAT) -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #if MEGDNN_AARCH64 || MEGDNN_ARMV7 //! x86 do not have uint8 matmul so only armv7 armv8 support uint8 INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp index c3a05d20..cb574b74 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp @@ -165,13 +165,10 @@ INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16, megdnn::PostprocessMode::ADD_BIAS) INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32, megdnn::PostprocessMode::ADD_BIAS) -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#else #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #undef INSTANTIAL_CLASS } // namespace megdnn -- GitLab
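Note on the pattern (not part of the patch above): the refactor replaces three near-identical kernel bodies with one shared function plus thin per-pack-mode template specializations that only differ in the kernel list and workspace layout they build, selected at runtime by the matmul's pack mode. The sketch below illustrates that dispatch pattern in isolation; it is a minimal, self-contained C++ analogue, and every name in it (PackMode, Kerns, shared_kern, dispatch) is a hypothetical stand-in, not a MegDNN API.

// Minimal analogue of the Im2colKerns consolidation; hypothetical names only.
#include <cstdio>
#include <functional>
#include <vector>

enum class PackMode { DEFAULT, ONLY_PACKA, NO_PACK };

using Kern = std::function<void()>;

// The shared kernel body: before the refactor each specialization carried
// its own copy of this sequence; after it, every pack mode calls this one.
static void shared_kern(PackMode mode) {
    std::printf("im2col -> matmul -> postprocess (mode %d)\n",
                static_cast<int>(mode));
}

template <PackMode mode>
struct Kerns;  // primary template: one specialization per pack mode

template <>
struct Kerns<PackMode::DEFAULT> {
    std::vector<Kern> get_kerns() const {
        // DEFAULT additionally schedules a packA kernel up front.
        return {[] { std::printf("packA\n"); },
                [] { shared_kern(PackMode::DEFAULT); }};
    }
};

template <>
struct Kerns<PackMode::NO_PACK> {
    std::vector<Kern> get_kerns() const {
        // NO_PACK skips packing entirely and only runs the shared body.
        return {[] { shared_kern(PackMode::NO_PACK); }};
    }
};

// Runtime dispatch mirroring dispatch_kerns(): pick the specialization that
// matches the matmul algorithm's pack mode.
static std::vector<Kern> dispatch(PackMode mode) {
    switch (mode) {
        case PackMode::DEFAULT:
            return Kerns<PackMode::DEFAULT>().get_kerns();
        case PackMode::NO_PACK:
            return Kerns<PackMode::NO_PACK>().get_kerns();
        default:
            return {};
    }
}

int main() {
    for (const Kern& k : dispatch(PackMode::DEFAULT)) k();
}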