feat(dnn/fallback): add im2col filterpreprocess function

GitOrigin-RevId: 61c54ad258a42301711d3efdae0caef47d7b0584

feat(dnn/fallback): add im2col filterpreprocess function
GitOrigin-RevId: 61c54ad258a42301711d3efdae0caef47d7b0584
edd7e167 · Megvii Engine Team · Xu Xinran · 9e9e8ca0 · edd7e167 · edd7e167
13 changed files
--- a/dnn/src/fallback/conv_bias/im2col/algos.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp
--- a/dnn/src/fallback/conv_bias/im2col/algos.h
+++ b/dnn/src/fallback/conv_bias/im2col/algos.h
@@ -22,27 +22,6 @@ namespace megdnn {
 namespace fallback {

 class ConvBiasImpl::AlgoIm2col final : public AlgoBase {
-    //! calculate m_oc_tile_size in choice_ohw_oc_block() fucntion,
-    //! when m_oc_tile_size < this value m_oc_tile_size = ohw
-    static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
-    //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads,
-    //! m_oc_tile_size = DEFAULT_OC_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
-    //! when m_oc_tile_size > this value m_oc_tile_size =
-    //! DEFAULT_OC_MAX_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
-    //! when m_oc_tile_size < this value m_oc_tile_size =
-    //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation
-    static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
-    fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param(
-            const NCBKernSizeParam& param, size_t ohw_tile_size,
-            size_t oc_tile_size) const;
-    WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;
-    void choice_ohw_oc_block(
-            const NCBKernSizeParam& param, size_t& oc_tile_size,
-            size_t& ohw_tile_size, size_t block_m, size_t block_n,
-            fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const;
-
 public:
    AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size)
            : m_matmul_algo(matmul_algo),
@@ -59,10 +38,16 @@ public:
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(const NCBKernSizeParam& param) const override;
-    SmallVector<NCBKern> dispatch_kerns(
+    SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const NCBKernSizeParam& param) const override;
+    size_t get_preprocess_workspace(
+            const NCBKernSizeParam& /*param*/) const override {
+        return 0;
+    }
+    SmallVector<NCBKern> dispatch_preprocess_kerns(
            const NCBKernSizeParam& param) const override;
-    bool is_preferred(
-                      const NCBKernSizeParam& param) const override {
+    bool is_preferred(const NCBKernSizeParam& param) const override {
        if (param.src_type.category() == DTypeCategory::QUANTIZED) {
            static CpuOprDelegationStorage<1> storage;
            auto conv_bias_opr = storage.get<ConvBias, 0>();

--- a/dnn/src/fallback/conv_bias/im2col/strategy_base.h
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_base.h
@@ -40,9 +40,11 @@ struct StrategyParam {
    size_t block_n;
    size_t block_k;
    size_t pack_oc_size;
+    size_t packA_group_size;
    bool skip_copy_dst;
    bool is_dst_8bit;
    bool is_ohw_size_bigger;
+    bool enable_filter_preprocess;
 };

 class StrategyBase {
@@ -62,7 +64,7 @@ public:
            const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
            const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
                    matmul_desec,
-            size_t pack_size) = 0;
+            const StrategyParam& sparam) = 0;

    virtual void exec_im2col(
            const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
@@ -296,7 +298,7 @@ public:
            const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
            const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
                    matmul_desc,
-            size_t pack_size) override;
+            const StrategyParam& sparam) override;
    virtual void exec_im2col(
            const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,
            const StrategyParam& sparam,
@@ -375,7 +377,7 @@ public:
            const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
            const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
            const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
-            size_t pack_size) override;
+            const StrategyParam& sparam) override;

    void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
                     const StrategyParam& sparam, const WorkspaceBundle& bundle,
@@ -431,7 +433,7 @@ public:
            const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
            const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
            const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec,
-            size_t pack_size) override;
+            const StrategyParam& sparam) override;

    void exec_im2col(
            const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread,

--- a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
@@ -25,19 +25,23 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
                   const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
                           matmul_desc,
-                   size_t) {
+                   const StrategyParam& sparam) {
    fallback::MatrixMulImpl::KernParam matmul_param;
    size_t group_id = ncb_index.ndrange_id[0];
    static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
            matmulparam;
-    size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
    size_t packed_per_oc_block_size =
            round_up(matmul_param.K, matmul_desc.innerblocksize.k) *
            matmul_desc.innerblocksize.m * matmul_desc.packa_type_size;

    size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size;
-    int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
-                      group_id * packA_group_size + a_panel_offset;
+    int8_t* tmp_ptr =
+            sparam.enable_filter_preprocess
+                    ? static_cast<int8_t*>(
+                              param.preprocessed_filter->tensors[0].raw_ptr)
+                    : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
+    int8_t* a_panel =
+            tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset;
    matmul_param.A_ptr =
            const_cast<src_ctype*>(param.filter<src_ctype>(group_id));
    matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1],
@@ -149,15 +153,20 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
    size_t packA_per_oc_block_size =
            round_up(matmul_param.K, matmul_desc.innerblocksize.k) *
            sparam.oc_tile_size * matmul_desc.packa_type_size;
-    size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
+    size_t packA_group_size = sparam.packA_group_size;
    size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size +
                            ncb_index.ndrange_id[3] * packA_per_oc_block_size;

    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);

-    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
-            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
-            a_panel_offset);
+    int8_t* tmp_ptr =
+           sparam.enable_filter_preprocess
+                    ? static_cast<int8_t*>(
+                              param.preprocessed_filter->tensors[0].raw_ptr)
+                    : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
+
+    src_ctype* a_panel =
+                    reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset);
    src_ctype* b_panel =
            reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
                    bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));

--- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp
@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
                   const fallback::MatrixMulImpl::AlgoBase::
                           MatmulDescription& /*matmul_dsec*/,
-                   size_t) {
+                   const StrategyParam&) {
    MEGDNN_MARK_USED_VAR(bundle);
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(matmulparam);

--- a/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/strategy_onlypacka.cpp
@@ -26,7 +26,7 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
                   const fallback::MatrixMulImpl::AlgoBase::
                           MatmulDescription& /*matmul_desc*/,
-                   size_t) {
+                   const StrategyParam& sparam) {
    fallback::MatrixMulImpl::KernParam matmul_param;
    static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
            matmulparam;
@@ -36,12 +36,17 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
    size_t output_block_oc_size =
            std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size);
    size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size;
-    size_t packA_group_size =
-            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
    size_t a_panel_offset = ncb_index.ndrange_id[1] *
                            matmul_algo->get_bundle(matmul_param).get_size(0);
-    int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
-                      group_id * packA_group_size + a_panel_offset;
+
+    int8_t* tmp_ptr =
+           sparam.enable_filter_preprocess
+                    ? static_cast<int8_t*>(
+                              param.preprocessed_filter->tensors[0].raw_ptr)
+                    : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
+
+    int8_t* a_panel = tmp_ptr +
+                      group_id * sparam.packA_group_size + a_panel_offset;
    matmul_param.A_ptr =
            const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) +
            oc_cur_index * matmul_param.K;
@@ -60,20 +65,22 @@ void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                    const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
-                    const fallback::MatrixMulImpl::AlgoBase::
-                            MatmulDescription& /*matmul_desc*/
-        ) {
-    size_t packA_group_size =
-            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
+                    const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
+                            /*matmul_desc*/) {
    size_t a_panel_offset = ncb_index.ndrange_id[3] *
                            matmul_algo->get_bundle(matmul_param).get_size(0);
-    a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset;
+    a_panel_offset =
+            sparam.group_id * sparam.packA_group_size + a_panel_offset;

    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);

-    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
-            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
-            a_panel_offset);
+    int8_t* tmp_ptr =
+            sparam.enable_filter_preprocess
+                    ? static_cast<int8_t*>(
+                              param.preprocessed_filter->tensors[0].raw_ptr)
+                    : static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX));
+
+    src_ctype* a_panel = reinterpret_cast<src_ctype*>(tmp_ptr + a_panel_offset);
    src_ctype* b_panel = nullptr;

    src_ctype* im2col_dst = static_cast<src_ctype*>(

--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -154,7 +154,8 @@ void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout,
            bias{nullptr, bias_layout};
    auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace,
                                      preprocessed_filter);
-    ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
+    //! do not pass a workspace_size limit, otherwise no matching algo is found
+    ConvBiasImpl::Algorithm* algo = get_algorithm(fparam);
    if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
                                              fparam) <= workspace.size) {
        exec_preprocess_with_ncb_kern(fparam, algo);

--- a/dnn/src/fallback/conv_bias/opr_impl.h
+++ b/dnn/src/fallback/conv_bias/opr_impl.h
@@ -299,6 +299,11 @@ private:
            const PreprocessedFilter* preprocessed_filter);
 };

+inline bool is_enable_filter_preprocess(
+        const ConvBiasImpl::NCBKernSizeParam& param) {
+    const auto filter = param.preprocessed_filter;  //! may be null
+    return filter != nullptr && filter->tensors.size() >= 1;
+}
 }  // namespace fallback
 }  // namespace megdnn


--- a/dnn/src/fallback/convolution/opr_impl.cpp
+++ b/dnn/src/fallback/convolution/opr_impl.cpp
@@ -109,7 +109,9 @@ void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout,
    TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout};
    auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter,
                                      workspace);
-    ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
+
+    //! do not pass a workspace_size limit, otherwise no matching algo is found
+    ConvolutionImpl::Algorithm* algo = get_algorithm(fparam);
    if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
                                              fparam) <= workspace.size) {
        exec_preprocess_with_ncb_kern(fparam, algo);

--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -1118,6 +1118,30 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args,
    }
 }

+//! Run each case of \p args on a Checker built with OprWeightPreprocessProxy;
+//! dtypes 0/1/2/4 are type0..type3, and \p algo_name feeds the algo checker.
+void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
+                                Handle* handle, RNG* rng, float epsilon,
+                                DType type0, DType type1, DType type2,
+                                DType type3, const char* algo_name) {
+    using namespace conv_bias;
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle);
+    checker.set_dtype(0, type0).set_dtype(1, type1);
+    checker.set_dtype(2, type2).set_dtype(4, type3);
+    checker.set_epsilon(epsilon);
+    //! rng is optional: when it is null no set_rng() call is made
+    if (rng != nullptr) {
+        checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
+    }
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
+    for (auto&& arg : args) {
+        checker.set_param(arg.param).execs(
+                {arg.src, arg.filter, arg.bias, {}, {}});
+    }
+}
+

 void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
                              param::ConvBias param, Handle* handle,

--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -58,7 +58,10 @@ std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size);
 std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
                                          bool compute_float32 = false,
                                          bool group_mode = false);
-
+void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
+                                Handle* handle, RNG* rng, float epsilon,
+                                DType type0, DType type1, DType type2,
+                                DType type3, const char* algo_name);
 template <typename Opr>
 using ConvBiasAlgoChecker = AlgoChecker<Opr>;


--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -752,7 +752,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
    }
 }

-TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) {
    using namespace conv_bias;
    std::vector<TestArg> args;

@@ -842,6 +842,98 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
 #undef cb2
 }

+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    //! run(): queue a stride-1 no-bias case unless kernel > padded input
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+    };
+    //! sweep kernel / ic / oc / padding / input size, IDENTITY only
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8})
+                for (size_t p : {0, 2})
+                    for (size_t size : {20, 21, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    //! test OC block: one extra case with OC = 2046
+    run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
+
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle());
+    UniformIntRNG rng{-50, 50};
+#define cb(algo_name)                                                          \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));              \
+    checker.set_dtype(0, dtype::Int8());                                       \
+    checker.set_dtype(1, dtype::Int8());                                       \
+    checker.set_dtype(2, dtype::Int32());                                      \
+    checker.set_dtype(4, dtype::Int32());                                      \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
+    }                                                                          \
+    for (auto&& arg : args) {                                                  \
+        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                         \
+                .set_dtype(1, dtype::QuantizedS8(2.5f))                        \
+                .set_dtype(2, dtype::QuantizedS32(6.25f))                      \
+                .set_dtype(4, {})                                              \
+                .set_rng(0, &rng)                                              \
+                .set_rng(1, &rng)                                              \
+                .set_rng(2, &rng)                                              \
+                .set_param(arg.param)                                          \
+                .execs({arg.src, arg.filter, {}, {}, {}});                     \
+    }
+#define cb2(algo_name)                                                         \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));              \
+    checker.set_dtype(0, dtype::Int8());                                       \
+    checker.set_dtype(1, dtype::Int8());                                       \
+    checker.set_dtype(2, dtype::Int16());                                      \
+    checker.set_dtype(4, dtype::Int16());                                      \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
+    }
+    //! cb: Int8/Int32 dtypes, then QuantizedS8/QuantizedS32; cb2: Int8/Int16
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
+    }
+#endif
+    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
+        cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2");
+    }
+    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
+        cb2("IM2COLMATMUL:X86_INT8X8X16_SSE");
+    }
+
+#undef cb
+#undef cb2
+}
+
+
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
@@ -950,6 +1042,61 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) {

 #undef cb
 }
+
+TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    //! run(): queue stride-1 cases with no, per-channel and output-sized bias
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w},
+                TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                            (w + 2 * p - kernel) / param.stride_w + 1});
+    };
+    //! sweep kernel / ic / oc / padding / input size with IDENTITY and RELU
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16, 300})
+                for (size_t p : {0, 2})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    //! one extra case with a large OC (2046)
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle());
+#define cb(algo_name)                                             \
+    checker.set_before_exec_callback(                             \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
+    for (auto&& arg : args) {                                     \
+        checker.set_param(arg.param).execs(                       \
+                {arg.src, arg.filter, arg.bias, {}, {}});         \
+    }
+    cb("IM2COLMATMUL:X86_F32_BLAS");
+
+#undef cb
+}
+
 #endif


@@ -1020,6 +1167,73 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
 #undef cb
 }

+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    //! run(): queue dense then 2-group cases, each with the three bias shapes
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w},
+                TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                            (w + 2 * p - kernel) / param.stride_w + 1});
+        param.sparse = param::ConvBias::Sparse::GROUP;
+        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, kernel, kernel},
+                          TensorShape{});
+        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, kernel, kernel},
+                          TensorShape{1, oc * 2, 1, 1});
+
+        args.emplace_back(
+                param, TensorShape{1, 2 * ic, h, w},
+                TensorShape{2, oc, ic, kernel, kernel},
+                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
+                            (w + 2 * param.pad_w - kernel) / 1 + 1});
+    };
+    //! sweep kernel / ic / oc / padding / input size with IDENTITY and RELU
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16})
+                for (size_t p : {0, 1})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    //! one extra case with a large OC (2046)
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle());
+#define cb(algo_name)                                             \
+    checker.set_before_exec_callback(                             \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
+    for (auto&& arg : args) {                                     \
+        checker.set_param(arg.param).execs(                       \
+                {arg.src, arg.filter, arg.bias, {}, {}});         \
+    }
+
+    cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
+
+#undef cb
+}
+
 /**************************** Conv1x1 PackA *************************/
 namespace {
 void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
@@ -1169,6 +1383,77 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
 #undef cb
 }

+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+    //! run(): queue a no-bias case (batch 1) and a channel-bias case (batch 2)
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        //! bias channel
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+    };
+    //! sweep kernel / ic / oc / padding / size with IDENTITY, RELU and H_SWISH
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8})
+                for (size_t p : {0, 2})
+                    for (size_t size : {20, 21, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU,
+                              NonlineMode::H_SWISH}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle());
+#define cb(algo_name)                                             \
+    checker.set_before_exec_callback(                             \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); \
+    UniformIntRNG rng{-50, 50};                                   \
+    for (auto&& arg : args) {                                     \
+        checker.set_dtype(0, dtype::QuantizedS8(2.5f))            \
+                .set_dtype(1, dtype::QuantizedS8(2.5f))           \
+                .set_dtype(2, dtype::QuantizedS32(6.25f))         \
+                .set_dtype(4, dtype::QuantizedS8(60.25))          \
+                .set_rng(0, &rng)                                 \
+                .set_rng(1, &rng)                                 \
+                .set_rng(2, &rng)                                 \
+                .set_param(arg.param)                             \
+                .execs({arg.src, arg.filter, {}, {}, {}});        \
+    }
+    //! NOTE(review): cb passes {} as the third execs operand, not arg.bias
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
+    }
+#endif
+    if (x86::is_supported(x86::SIMDType::AVX2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
+    }
+
+#undef cb
+}
+
+
 TEST_F(X86, CONV_BIAS_MATMUL) {
    using namespace conv_bias;
    std::vector<TestArg> args;