diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
index 95e28d0104980378eae9b9cdb661188011c478a3..873f3fcabf9d96e989bedddc20f26685fb2233e6 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "src/fallback/conv_bias/conv1x1/algos.h"
@@ -67,7 +68,8 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
         MIDOUT_END();
     } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
-            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
             return dispatcher
                     .get_bundle(param, matmul_param, m_matmul_algo,
                                 compt_oc_block_size)
@@ -116,7 +118,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
         MIDOUT_END();
     } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
-            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
             whole_bundle = dispatcher.get_bundle(
                     param, matmul_param, m_matmul_algo, compt_oc_block_size);
             matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
@@ -140,7 +143,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
 
     Conv1x1StrategyBase* conv1x1_strategy =
             Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
-                                                  opr->param().format);  
+                                                  opr->param().format);
 
     auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                        compt_oc_block_size, conv1x1_strategy](
@@ -171,8 +174,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
         pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
         if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
-            ret_kern.push_back({kern_packB, {1}});  
-        }  
+            ret_kern.push_back({kern_packB, {1}});
+        }
     }
     ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
 
@@ -230,7 +233,11 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
             param, OH * OW, get_oc_tile_size_heuristic(param));
     bool matmul_usable = m_matmul_algo->usable(matmul_param);
 
-    return matmul_usable &&
+    auto pack_mode = m_matmul_algo->packmode();
+    bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy(
+            param, pack_mode, opr->param().format);
+
+    return matmul_usable && strategy_usable &&
            (param.filter_meta.dilation[0] ==
                     param.filter_meta.dilation[1] &&
            param.filter_meta.dilation[0] == 1) &&
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
index 320da5081cab2ea4e4771f38eca3cde61f76e879..e32f6d3adfe6677b0533aff32bb7a1e325c56fa8 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
@@ -6,11 +6,12 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
-#include
 #include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+#include
 
 #include "midout.h"
 
@@ -157,10 +158,9 @@ std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
                     dt_int32, dt_int8, dt_int32, dt_int32,
                     PostprocessMode::NO_PROCESS,
                     "NoPack::INT8x8x32_INT32"_hash);
-            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
-                dtype::QuantizedS8, dtype::QuantizedS32,
-                dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
-                PostprocessMode::NO_PROCESS,
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
+                dt_int32, PostprocessMode::NO_PROCESS,
                 "NoPack::QINT8x8x32_QINT32"_hash);
 
             break;
@@ -208,6 +208,19 @@ Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
     return storage.get(param, pack_mode, format);
 }
 
+bool Conv1x1Factory::can_make_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode, param::ConvBias::Format) {
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC || !MEGDNN_DISABLE_FLOAT16
+    if ((pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK ||
+         pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) &&
+        param.src_type.enumv() == DTypeTrait<dtype::Float16>::enumv) {
+        return false;
+    }
+#endif
+    return true;
+}
+
 } // namespace conv1x1
 } // namespace fallback
 } // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
index 8e0456de69346755d2e89b5b223a535e8fd72c61..5cb95ab49827873b77736578fadfedae3a325603 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
@@ -320,6 +320,11 @@ public:
             const ConvBiasImpl::NCBKernSizeParam& param,
             MatrixMulImpl::AlgoBase::PackMode pack_mode,
             param::ConvBias::Format format);
+
+    static bool can_make_conv1x1_strategy(
+            const ConvBiasImpl::NCBKernSizeParam& param,
+            MatrixMulImpl::AlgoBase::PackMode pack_mode,
+            param::ConvBias::Format format);
 };
 
 } // namespace conv1x1
diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp
index 995acec3aca0ebc7eee492ee8d73624533a8ebb0..f2466b77e51bf41c58ced3e64828a70305c35570 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -27,7 +27,7 @@ using namespace megdnn;
 using namespace fallback;
 
 size_t megdnn::fallback::get_format_pack_size(param::ConvBias::Format format) {
-    switch(format){
+    switch (format) {
         case param::ConvBias::Format::NCHW44:
         case param::ConvBias::Format::NCHW4:
             return 4_z;
@@ -57,10 +57,18 @@ public:
         auto&& matmul_algos =
                 static_cast<fallback::MatrixMulImpl*>(matmul_opr)->algo_pack();
         for (auto&& algo : matmul_algos) {
+#if MEGDNN_X86
+//! As we don't have direct conv for int8x8x16 yet, if we disable gemv here, it
+//! may fall back to the naive implementation, which may cause very low
+//! performance, so here we just enable im2col for gemv in the x86 backend.
+//! FIXME: remove it when we add direct conv support for int8x8x16
+#else
             if (algo->algoset() ==
                 MatrixMulImpl::AlgoBase::AlgoSet::ALGO_TYPE_GEMV) {
                 continue;
             }
+#endif
+
             for (size_t ohw_tile_size : {192, 384, 96, 48, 24}) {
                 refhold.emplace_back(new AlgoIm2col(
                         static_cast<MatrixMulImpl::AlgoBase*>(algo),
diff --git a/dnn/test/fallback/convolution.cpp b/dnn/test/fallback/convolution.cpp
index 8124b00450954d256455d4a93934952c4c008505..17fc65193f75d121c5199afcd148c27ee87eaedc 100644
--- a/dnn/test/fallback/convolution.cpp
+++ b/dnn/test/fallback/convolution.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "test/fallback/fixture.h"
@@ -73,24 +74,115 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL) {
     profile(3, 3, 112, 112, 3, 1);
 }
 
+TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8832) {
+    using Param = Convolution::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        Benchmarker<Convolution> benchmarker_float(handle());
+        size_t RUN = 50;
+        auto tfloat = benchmarker_float.set_display(false)
+                              .set_dtype(0, dtype::Int8{})
+                              .set_dtype(1, dtype::Int8{})
+                              .set_dtype(2, dtype::Int32{})
+                              .set_times(RUN)
+                              .set_param(param)
+                              .exec(shapes);
+        size_t IC = shapes[1][1];
+        size_t FH = shapes[1][2];
+        size_t FW = shapes[1][3];
+        TensorLayout dst_layout;
+        auto opr = handle()->create_operator<Convolution>();
+        opr->param() = param;
+        opr->deduce_layout({shapes[0], dtype::Float32()},
+                           {shapes[1], dtype::Float32()}, dst_layout);
+        printf("fp32 flops: %.3f mflops\n",
+               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
+                       (tfloat / RUN * 1000));
+    };
+    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                       size_t stride) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = kernel / 2;
+        param.pad_w = kernel / 2;
+        param.pad_h = 0;
+        param.pad_w = 0;
+        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
+               oc, ic, w, h, stride, kernel);
+
+        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
+    };
+
+    profile(48, 128, 56, 88, 1, 1);
+    profile(56, 128, 64, 80, 3, 1);
+    profile(24, 3, 256, 320, 3, 2);
+}
+
+TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8816) {
+    using Param = Convolution::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        Benchmarker<Convolution> benchmarker_float(handle());
+        size_t RUN = 50;
+        auto tfloat = benchmarker_float.set_display(false)
+                              .set_dtype(0, dtype::Int8{})
+                              .set_dtype(1, dtype::Int8{})
+                              .set_dtype(2, dtype::Int16{})
+                              .set_times(RUN)
+                              .set_param(param)
+                              .exec(shapes);
+        size_t IC = shapes[1][1];
+        size_t FH = shapes[1][2];
+        size_t FW = shapes[1][3];
+        TensorLayout dst_layout;
+        auto opr = handle()->create_operator<Convolution>();
+        opr->param() = param;
+        opr->deduce_layout({shapes[0], dtype::Float32()},
+                           {shapes[1], dtype::Float32()}, dst_layout);
+        printf("fp32 flops: %.3f mflops\n",
+               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
+                       (tfloat / RUN * 1000));
+    };
+    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                       size_t stride) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = kernel / 2;
+        param.pad_w = kernel / 2;
+        param.pad_h = 0;
+        param.pad_w = 0;
+        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
+               oc, ic, w, h, stride, kernel);
+
+        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
+    };
+
+    profile(48, 128, 56, 88, 1, 1);
+    profile(48, 128, 56, 88, 1, 2);
+    profile(56, 128, 64, 80, 3, 1);
+    profile(24, 3, 256, 320, 3, 2);
+}
+
 TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
     using Param = ConvolutionBackwardData::Param;
     auto run = [&](const TensorLayoutArray& tensors, Param param) {
         Benchmarker<ConvolutionBackwardData> benchmarker_fallback(handle());
         size_t RUN = 500;
         benchmarker_fallback.set_display(false)
-            .set_dtype(0, dtype::Float32{})
-            .set_dtype(1, dtype::Float32{})
-            .set_times(RUN)
-            .set_param(param);
-        auto tmatmul = benchmarker_fallback.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>(
-                "DeconvMatmul"))
-            .exec(tensors);
-        auto tdirect = benchmarker_fallback.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>(
-                "DeconvDirect"))
-            .exec(tensors);
+                .set_dtype(0, dtype::Float32{})
+                .set_dtype(1, dtype::Float32{})
+                .set_times(RUN)
+                .set_param(param);
+        auto tmatmul = benchmarker_fallback
+                               .set_before_exec_callback(
+                                       AlgoChecker<ConvolutionBackwardData>(
+                                               "DeconvMatmul"))
+                               .exec(tensors);
+        auto tdirect = benchmarker_fallback
+                               .set_before_exec_callback(
+                                       AlgoChecker<ConvolutionBackwardData>(
+                                               "DeconvDirect"))
+                               .exec(tensors);
         size_t IC = tensors[0][1];
         size_t FH = tensors[0][2];
         size_t FW = tensors[0][3];
@@ -98,8 +190,8 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
         printf("Direct_time: %.3f ms Direct_flops: %.3f mflops\n", tdirect,
                total_flops / (tdirect / RUN * 1000));
         printf("Matmul_time: %.3f ms Matmul_flops: %.3f mflops\n", tmatmul,
-               total_flops / (tmatmul/ RUN * 1000));
-        printf("speedup: %.3f\n", tdirect/tmatmul);
+               total_flops / (tmatmul / RUN * 1000));
+        printf("speedup: %.3f\n", tdirect / tmatmul);
     };
 
     auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
@@ -154,6 +246,51 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL) {
     run(1, 3, 3, 112, 112, 3, 1);
     run(1, 1, 1, 1, 1, 3, 3);
 }
+
+#if MEGDNN_X86
+TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_8816) {
+    Checker<Convolution> checker(handle());
+    using Param = Convolution::Param;
+    checker.set_before_exec_callback(
+            AlgoChecker<ConvolutionForward>(".+FB_GEMV.+"));
+    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc,
+                   size_t fh, size_t fw, size_t pad, size_t stride,
+                   size_t group) {
+        Param param;
+        param.sparse = group > 1 ? param::Convolution::Sparse::GROUP
+                                 : param::Convolution::Sparse::DENSE;
+        param.pad_h = param.pad_w = pad;
+        param.stride_h = param.stride_w = stride;
+        checker.set_param(param);
+        if (group > 1) {
+            checker.execl(
+                    {{{n, ic, ih, iw}, dtype::Int8()},
+                     {{group, oc / group, ic / group, fh, fw}, dtype::Int8()},
+                     {{}, dtype::Int16()}});
+        } else {
+            checker.execl({{{n, ic, ih, iw}, dtype::Int8()},
+                           {{oc, ic, fh, fw}, dtype::Int8()},
+                           {{}, dtype::Int16()}});
+        }
+    };
+
+    for (auto n : {1, 2})
+        for (auto ic : {3, 4, 8, 12, 16})
+            for (auto oc : {4, 8, 16, 32})
+                for (auto ih : {7, 14, 15, 22})
+                    for (auto iw : {7, 13, 11, 32})
+                        for (auto filter : {1, 2, 3, 5, 7})
+                            for (auto stride : {1, 2})
+                                for (auto pad : {0, filter / 2}) {
+                                    run(n, ic, ih, iw, oc, filter, filter, pad,
+                                        stride, 1);
+                                    if (ic == oc) {
+                                        run(n, ic, ih, iw, oc, filter, filter,
+                                            pad, stride, ic);
+                                    }
+                                }
+}
+#endif
+
 TEST_F(FALLBACK, CONVOLUTION_NAIVE_ALGO_FP16) {
     Checker<Convolution> checker(handle());
     using Param = Convolution::Param;
@@ -222,7 +359,7 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_NAIVE_ALGO) {
         TensorShape src{n, ic, ih, iw},
                 filter{group, oc / group, ic / group, fh, fw};
         checker.set_param(param).set_dtype(2, {});
-        //!float32
+        //! float32
         checker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
         checker.execs({src, filter, {}});
         //! float16
@@ -257,10 +394,10 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_SINT8) {
         param.pad_h = param.pad_w = 1;
         param.stride_h = param.stride_w = 1;
         checker.set_param(param)
-            .set_dtype(0, dtype::QuantizedS8(0.2f))
-            .set_dtype(1, dtype::QuantizedS8(0.2f))
+                .set_dtype(0, dtype::QuantizedS8(0.2f))
+                .set_dtype(1, dtype::QuantizedS8(0.2f))
                 // Use inferred output dtype.
-            .set_dtype(2, {});
+                .set_dtype(2, {});
         checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
     };