Commit df356635 authored by Megvii Engine Team, committed by Xinran Xu

fix(mgb/fallback): delete im2col duplicate code and fix nchw44 usable

GitOrigin-RevId: 1aa250e9e715639364746144139d712edd610c6e
Parent 4a227083
@@ -100,6 +100,7 @@ namespace {
MIDOUT_END(); \
break; \
default: \
megdnn_throw("unknow biasmode"); \
break; \
}
@@ -282,24 +283,25 @@ struct PostProcess<opctype, opdtype, megdnn::PostprocessMode::QUANTIZED> {
reinterpret_cast<ctype*>(dst_ptr), bias_type, bias_type, \
dst_type, N* OC* OH* OW* pack_oc_size);
#define FOR_BIAS(_bias_mode, OH, OW) \
switch (_bias_mode) { \
case megdnn::BiasMode::NO_BIAS: \
break; \
case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS: \
if (pack_oc_size == 1) { \
FOR_BINARY_BROADCAST(CONCAT_OP(AddOp)); \
} else { \
megdnn_assert(pack_oc_size == 4, \
"Only support nchw44 in ARM"); \
FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp)); \
} \
break; \
case megdnn::BiasMode::BIAS: \
FOR_BINARY(CONCAT_OP(AddOp)); \
break; \
default: \
megdnn_throw("unknow biasmode"); \
break; \
}
template <typename ctype, typename dtype>
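A note on the nchw44 branch in FOR_BIAS above: in the NCHW44 layout the channel axis is blocked by 4 (tensor shape N, C/4, H, W, 4), so a per-channel bias cannot be added plane by plane as in NCHW; each 4-channel pack carries 4 bias lanes that repeat at every spatial position. Below is a minimal scalar sketch of that broadcast, assuming float data and C divisible by 4; the helper name is hypothetical and this is not the vectorized FOR_BINARY_BROADCAST_NCHW44 kernel:

#include <cstddef>

// Hypothetical reference helper, not the MegDNN kernel; assumes C % 4 == 0.
void add_channel_bias_nchw44(float* dst, const float* bias, std::size_t N,
                             std::size_t C, std::size_t H, std::size_t W) {
    const std::size_t PACK = 4;  // pack_oc_size == 4 in the macro above
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t cb = 0; cb < C / PACK; ++cb)   // channel packs
            for (std::size_t hw = 0; hw < H * W; ++hw)  // spatial positions
                for (std::size_t lane = 0; lane < PACK; ++lane)
                    dst[((n * (C / PACK) + cb) * (H * W) + hw) * PACK + lane] +=
                            bias[cb * PACK + lane];
}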
@@ -26,10 +26,9 @@ enum class StrategyType : uint32_t {
FLOAT = 0,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
FLOAT_FP16 = 1,
#endif
#if !MEGDNN_DISABLE_FLOAT16
FLOAT16_FLOAT16 = 2,
#endif
INT8x8x32 = 3,
INT8x8x16 = 4,
@@ -153,12 +152,10 @@ public:
cb1(dt_float32, dt_float32, StrategyType::FLOAT);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16);
#endif
#if !MEGDNN_DISABLE_FLOAT16
cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16);
#endif
cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32,
StrategyType::INT8x8x32);
@@ -256,8 +253,7 @@ public:
!param.filter_meta.should_flip) {
MIDOUT_BEGIN(
megdnn_fallback_im2col_factory_make_strategy,
midout_iv("8x12x1_fuse_packb_s2_nchw44"_hash)) {
return std::make_unique<
StrategyFuseXx12x1Nchw44K3x3S2<
float, float,
@@ -284,14 +280,13 @@ public:
cb1(NCHW, DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT,
"DefaultStrategyType::FLOAT_FP16"_hash);
break;
#endif
#if !MEGDNN_DISABLE_FLOAT16
case StrategyType::FLOAT16_FLOAT16:
cb1(NCHW, DEFAULT, dt_float16, dt_float16,
PostprocessMode::NO_PROCESS,
"DefaultStrategyType::FLOAT16_FLOAT16"_hash);
break;
#endif
case StrategyType::INT8x8x32:
if (format == param::ConvBias::Format::NCHW) {
@@ -472,15 +467,12 @@ public:
cb1(NCHW, NO_PACK, dt_float32, dt_float32,
PostprocessMode::FLOAT, "NoPackStrategyType::FLOAT"_hash);
break;
#if !MEGDNN_DISABLE_FLOAT16
case StrategyType::FLOAT16_FLOAT16:
cb1(NCHW, NO_PACK, dt_float16, dt_float16,
PostprocessMode::NO_PROCESS,
"NoPackStrategyType::FLOAT16_FLOAT16"_hash);
break;
#endif
case StrategyType::INT8x8x16:
cb3(NCHW, NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8,
/**
* \file dnn/src/fallback/conv_bias/im2col/im2col_kerns.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/naive/convolution/helper.h"
#include "src/fallback/conv_bias/im2col/factory.h"
#include "midout.h"
MIDOUT_DECL(megdnn_fallback_im2col)
namespace megdnn {
namespace fallback {
namespace im2col {
/*!
 * \brief The indices of all the workspace parts in the im2col workspace
 * bundle, through which the needed pointers can be conveniently obtained.
 */
struct Im2colBundelIndex {
static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
};
using Pack_Mode = fallback::MatrixMulImpl::AlgoBase::PackMode;
/*!
 * \brief Im2colKerns collects all the im2col kerns in it
 */
namespace {
//! conv kernel
static void kerns(
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size,
StrategyBase* im2colstrategy) {
size_t OC = param.filter_meta.ocpg;
size_t output_block_size = std::min(
ohw_tile_size,
strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size);
size_t output_block_oc_size =
std::min(strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);
bundle_thread.set(
static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
bundle_thread.total_size_in_bytes() * ncb_index.thread_id);
fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmul_kernsize_param;
strategyparam.batch_id = ncb_index.ndrange_id[0];
strategyparam.group_id = ncb_index.ndrange_id[1];
strategyparam.oc_cur_index =
ncb_index.ndrange_id[3] * strategyparam.oc_tile_size;
strategyparam.oc_end_index =
strategyparam.oc_cur_index + output_block_oc_size;
strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size;
strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size;
//! 1.Im2col
im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param,
matmul_param, matmul_algo);
//! 2.packb and matmul compute
im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread,
matmul_param, matmul_algo, ncb_index,
matmul_desc);
//! 3.postprocess and copy dst if needed
im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread);
}
} // namespace
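//! For orientation, step 1 above (exec_im2col) materializes the input of the
//! convolution as a matrix: every output pixel of the current ohw tile becomes
//! one column holding the IC*FH*FW source values under its filter window.
//! A minimal scalar sketch, assuming an NCHW float input that was already
//! zero-padded by copy_padding_kern (hypothetical helper, not the tiled
//! per-layout strategy code):
static inline void naive_im2col_sketch(const float* src, float* dst, size_t IC,
                                       size_t IH, size_t IW, size_t FH,
                                       size_t FW, size_t SH, size_t SW,
                                       size_t OH, size_t OW) {
    for (size_t ic = 0; ic < IC; ++ic)
        for (size_t fh = 0; fh < FH; ++fh)
            for (size_t fw = 0; fw < FW; ++fw)
                for (size_t oh = 0; oh < OH; ++oh)
                    for (size_t ow = 0; ow < OW; ++ow) {
                        size_t row = (ic * FH + fh) * FW + fw;  // matrix row
                        size_t col = oh * OW + ow;              // output pixel
                        dst[row * (OH * OW) + col] =
                                src[(ic * IH + oh * SH + fh) * IW +
                                    (ow * SW + fw)];
                    }
}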
template <Pack_Mode packmode>
class Im2colKerns;
template <>
class Im2colKerns<Pack_Mode::DEFAULT> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};
auto kern_packA =
[bundle, matmul_algo, matmul_param, im2colstrategy,
strategyparam = strategyparam, matmul_desc = matmul_desc](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index,
matmul_desc, strategyparam);
};
auto kern_compute_default =
[bundle, bundle_thread, matmul_param, matmul_algo,
ohw_tile_size, strategyparam, matmul_desc = matmul_desc,
im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t packa_parallel_times =
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m);
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (!is_enable_filter_preprocess(param)) {
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}});
}
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_default,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}
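    //! How kerns() (top of this file) decodes the 4-D ndrange dispatched
    //! above: ndrange_id[0] selects the batch, ndrange_id[1] the group,
    //! ndrange_id[2] the ohw tile (offset ndrange_id[2] * ohw_tile_size) and
    //! ndrange_id[3] the oc tile (offset ndrange_id[3] * oc_tile_size); edge
    //! tiles are clipped against OH*OW and OC with std::min.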
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t pack_oc_size = pack_size(param.filter_meta.format);
size_t im2col = 0, packb = 0, bias_temp = 0;
bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT;
megdnn_assert(default_pack, "only support default packa");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size *
sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = std::max(im2col_dst_size, matmul_dst_size);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}
return {nullptr, {packb, im2col, bias_temp}};
}
};
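//! A hedged sketch of the sizing rule in get_thread_bundle() above: the
//! im2col destination and the matmul destination are live in different phases
//! of kerns(), so one shared per-thread slot of max(...) bytes suffices.
//! The helper name is illustrative and element sizes stand in for DTypes:
static inline size_t shared_im2col_matmul_slot(size_t IC, size_t FH, size_t FW,
                                               size_t ohw_tile, size_t oc_tile,
                                               size_t pack_oc, size_t src_size,
                                               size_t bias_size) {
    size_t im2col_dst = IC * FH * FW * ohw_tile * src_size;       // phase 1
    size_t matmul_dst = pack_oc * oc_tile * ohw_tile * bias_size; // phase 2
    return im2col_dst > matmul_dst ? im2col_dst : matmul_dst;
}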
template <>
class Im2colKerns<Pack_Mode::ONLY_PACKA> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};
auto kern_packA =
[bundle, matmul_algo, matmul_param, im2colstrategy,
strategyparam = strategyparam, matmul_desc = matmul_desc](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index,
matmul_desc, strategyparam);
};
auto kern_compute_onlypackA =
[bundle, bundle_thread, matmul_param, matmul_algo,
strategyparam, ohw_tile_size, matmul_desc, im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (!is_enable_filter_preprocess(param)) {
ret_kern.push_back({kern_packA, {GROUP, oc_parallel_times}});
}
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_onlypackA,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0;
bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
        megdnn_assert(only_packA, "only support onlypackA mode");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = im2col_dst_size;
matmul_dst = matmul_dst_size;
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}
return {nullptr, {packb, im2col, matmul_dst, bias_temp}};
}
};
template <>
class Im2colKerns<Pack_Mode::NO_PACK> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};
auto kern_compute_nopack =
[bundle, bundle_thread, matmul_param, matmul_algo,
strategyparam, ohw_tile_size, matmul_desc, im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_nopack,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t ohw = param.osz[0] * param.osz[1];
size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0;
bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK;
megdnn_assert(no_pack, "only support no pack");
bool is_dst_8bit =
(param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
im2col = im2col_dst_size;
if (is_dst_8bit) {
matmul_dst = matmul_dst_size;
} else {
matmul_dst = ohw_tile_size >= ohw ? 0 : matmul_dst_size;
}
matmul_compute = matmul_algo->get_workspace(im2col_kern_param);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}
return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}};
}
};
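//! Why the NO_PACK bundle above can size matmul_dst to zero: the matmul
//! accumulates in bias_type (e.g. 32-bit for quantized int8), so an 8-bit dst
//! can never receive the result directly and always needs the temporary for
//! the requantizing postprocess; for wider dst types the matmul may write in
//! place once a single tile spans the whole output (ohw_tile_size >= OH * OW),
//! which is exactly the branch computed in get_thread_bundle().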
} // namespace im2col
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -192,12 +192,11 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
megdnn::PostprocessMode::FLOAT)
#endif
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have uint8 matmul, so only armv7/armv8 support uint8
@@ -108,13 +108,12 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
megdnn::PostprocessMode::FLOAT)
#endif
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have uint8 matmul, so only armv7/armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
@@ -165,13 +165,10 @@ INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
megdnn::PostprocessMode::ADD_BIAS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
megdnn::PostprocessMode::ADD_BIAS)
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#undef INSTANTIAL_CLASS
} // namespace megdnn