diff --git a/dnn/scripts/opr_param_defs.py b/dnn/scripts/opr_param_defs.py
index 7c214a8bc06f6d267e1600c8c3fe53fa0f6619b8..d60153d400227e768c73d45f17a99da58c3cf9d3 100755
--- a/dnn/scripts/opr_param_defs.py
+++ b/dnn/scripts/opr_param_defs.py
@@ -445,7 +445,8 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
       'uint32',
       Doc('output_block_size', 'output block size, detail meaning see winograd '
           'in convbias, equals to the meaning of m in F(m, r)'), 0).
-  add_enum_alias('Format', 'MatrixMul')
+  add_enum_alias('Format', 'MatrixMul').
+  add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode')
  )
 
 (pdef('SVD').
diff --git a/dnn/src/arm_common/conv_bias/int8/algos.cpp b/dnn/src/arm_common/conv_bias/int8/algos.cpp
index f6d7022e93174a7777a34b30084e08952b60c8f7..348d7faee8d64cb180240676bac87b788e496f7a 100644
--- a/dnn/src/arm_common/conv_bias/int8/algos.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/algos.cpp
@@ -273,5 +273,167 @@ ConvBiasImpl::AlgoS8WinogradF23_8x8::dispatch_kerns(
     MIDOUT_END();
     return {};
 }
+//=========================== input int8 compute float32 =========
+bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable(
+        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8,
+                 midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::usable"_hash)) {
+        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
+            return false;
+        bool is_matmul_usable = false;
+
+        using Strategy = winograd::winograd_2x3_4x4_s8_f32_nchw44;
+        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+        is_matmul_usable = m_matmul_algo->usable(
+                megdnn::winograd::ConvBias<Strategy,
+                                           param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg)
+                        .get_matmul_kern_param(param));
+        return is_matmul_usable &&
+               ((opr->param().format == param::ConvBias::Format::NCHW44 &&
+                 param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
+                ((opr->param().format ==
+                  param::ConvBias::Format::NCHW44_WINOGRAD) &&
+                 opr->param().output_block_size == 2 &&
+                 param.winograd_matmul_format ==
+                         param::MatrixMul::Format::MK4)) &&
+               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
+               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+                param.filter_meta.spatial[0] == 3) &&
+               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+                param.filter_meta.stride[0] == 1) &&
+               (param.filter_meta.dilation[0] ==
+                        param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               (param.compute_mode == param::ConvBias::ComputeMode::FLOAT32 ||
+                param.compute_mode == param::ConvBias::ComputeMode::DEFAULT) &&
+               param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+               param.bias_type.enumv() == DTypeEnum::QuantizedS32 &&
+               param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    }
+    MIDOUT_END();
+    return false;
+}
+
+size_t ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::get_workspace(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::get_workspace"_hash)) {
+        winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        return megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_s8_f32_nchw44,
+                                          param::MatrixMul::Format::MK4>(
+                       strategy, m_tile_size, param.nr_threads, param.osz[0],
+                       param.osz[1], param.filter_meta.ocpg)
+                .get_workspace_size(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return 0;
+}
+
+SmallVector<ConvBiasImpl::NCBKern>
+ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::dispatch_kerns(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8CF32WinogradF23_4x4::dispatch_kerns"_hash)) {
+        winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        auto winograd_impl =
+                megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_s8_f32_nchw44,
+                                           param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg);
+        return winograd_impl.get_kerns(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return {};
+}
 
+/* ======================= AlgoS8WinogradF23_8x8_NCHW44 ======================== */
+bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable(
+        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8WinogradF23_8x8_NCHW44::usable"_hash)) {
+    if (param.filter_meta.icpg % 8 != 0 || param.filter_meta.ocpg % 8 != 0)
+        return false;
+    using Strategy = winograd::winograd_2x3_8x8_s8_nchw44;
+    Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+    auto&& matmul_param =
+            megdnn::winograd::ConvBias<Strategy,
+                                       param::MatrixMul::Format::MK8>(
+                    strategy, m_tile_size, param.nr_threads, param.osz[0],
+                    param.osz[1], param.filter_meta.ocpg)
+                    .get_matmul_kern_param(param);
+    bool is_matmul_usable = m_matmul_algo->usable(matmul_param);
+    return is_matmul_usable &&
+           ((opr->param().format == param::ConvBias::Format::NCHW44 &&
+             param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
+            (opr->param().format == param::ConvBias::Format::NCHW44_WINOGRAD &&
+             opr->param().output_block_size == 2 &&
+             param.winograd_matmul_format == param::MatrixMul::Format::MK8 &&
+             param.filter_type.enumv() == DTypeEnum::QuantizedS16)) &&
+           opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
+           (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+            param.filter_meta.spatial[0] == 3) &&
+           (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+            param.filter_meta.stride[0] == 1) &&
+           (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] &&
+            param.filter_meta.dilation[0] == 1) &&
+           param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
+           param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+           param.bias_type.enumv() == DTypeEnum::QuantizedS32 &&
+           param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    }
+    MIDOUT_END();
+    return false;
+}
+
+size_t ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::get_workspace(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8WinogradF23_8x8_NCHW44::get_workspace"_hash)) {
+        winograd::winograd_2x3_8x8_s8_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        return megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_s8_nchw44,
+                                          param::MatrixMul::Format::MK8>(
+                       strategy, m_tile_size, param.nr_threads, param.osz[0],
+                       param.osz[1], param.filter_meta.ocpg)
+                .get_workspace_size(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return 0;
+}
+
+SmallVector<ConvBiasImpl::NCBKern>
+ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns"_hash)) {
+        winograd::winograd_2x3_8x8_s8_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        auto winograd_impl =
+                megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_s8_nchw44,
+                                           param::MatrixMul::Format::MK8>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg);
+        return winograd_impl.get_kerns(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return {};
+}
 // vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/algos.h b/dnn/src/arm_common/conv_bias/int8/algos.h
index 5b2629dd5e8cce3c492a47a010267b38f21a5dad..fc5dcad5c27d5f31c2c5739fce8906453fa45678 100644
--- a/dnn/src/arm_common/conv_bias/int8/algos.h
+++ b/dnn/src/arm_common/conv_bias/int8/algos.h
@@ -220,6 +220,68 @@ private:
     uint32_t m_tile_size;
 };
 
+//=======================input int8 compute fp32 output int8============
+class ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44 final : public AlgoBase {
+public:
+    AlgoS8CF32WinogradF23_4x4_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo,
+                          uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {4, 2, m_tile_size},
+                    param::ConvBias::Format::NCHW44);
+        }
+        return m_name.c_str();
+    }
+    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(fallback::ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl* opr,
+            const NCBKernSizeParam& param) const override;
+    static std::vector<fallback::MatrixMulImpl::Algorithm*>
+    get_avaiable_matmul_algos(const NCBKernSizeParam& param);
+
+private:
+    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    uint32_t m_tile_size;
+};
+
+//=======================input int8 compute int16 output int8============
+class ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44 final : public AlgoBase {
+public:
+    AlgoS8WinogradF23_8x8_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo,
+                          uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {8, 2, m_tile_size},
+                    param::ConvBias::Format::NCHW44);
+        }
+        return m_name.c_str();
+    }
+    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(fallback::ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl* opr,
+            const NCBKernSizeParam& param) const override;
+    static std::vector<fallback::MatrixMulImpl::Algorithm*>
+    get_avaiable_matmul_algos(const NCBKernSizeParam& param);
+
+private:
+    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    uint32_t m_tile_size;
+};
+
 }  // namespace arm_common
 }  // namespace megdnn
 
diff --git a/dnn/src/arm_common/conv_bias/int8/strategy.h b/dnn/src/arm_common/conv_bias/int8/strategy.h
index 717f99da6fba8844c472b74cf50d11d5ceed8ecb..f4bc4a334dac79606f2658b41f40b21cf0a25d97 100644
--- a/dnn/src/arm_common/conv_bias/int8/strategy.h
+++ b/dnn/src/arm_common/conv_bias/int8/strategy.h
@@ -20,6 +20,10 @@ namespace winograd {
 
 MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
                              winograd_2x3_8x8_s8)
+MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
+                             winograd_2x3_8x8_s8_nchw44)
+MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, float, float, 2, 3, 4, 4,
+                             winograd_2x3_4x4_s8_f32_nchw44)
 }
 }  // namespace arm_common
 }  // namespace megdnn
diff --git a/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3177fd1dd3a4ef3b99450e09c462ab5ff00923f8
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_4x4.cpp
@@ -0,0 +1,372 @@
+/**
+ * \file dnn/src/arm_common/conv_bias/fp32/strategy_nchw44_2x3_4x4.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "src/arm_common/conv_bias/int8/strategy.h"
+#include "src/arm_common/simd_macro/marm_neon.h"
+#include "src/arm_common/utils.h"
+#include "src/common/unroll_macro.h"
+#include "src/common/utils.h"
+#include "src/fallback/conv_bias/winograd/winograd.h"
+
+#include "src/arm_common/conv_bias/winograd_common/winograd_common.h"
+#include "src/naive/matrix_mul/matrix_mul_helper.h"
+#include "src/arm_common/elemwise_helper/op_unary.h"
+#include "src/arm_common/conv_bias/fp32/helper.h"
+
+#include "midout.h"
+
+MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23)
+
+using namespace megdnn;
+using namespace arm_common;
+namespace {
+struct InputTransform2X3 {
+    template <bool inner>
+    static void prepare(const int8_t* input, float* patch, float* patchT,
+                        int ih_start, int iw_start, size_t IH, size_t IW,
+                        size_t ic, size_t IC, size_t PH, size_t PW) {
+        megdnn_assert(
+                ic % 4 == 0 && IC % 4 == 0,
+                "Winograd input prepare param is not times of 4!");
+        constexpr size_t alpha = 2 + 3 - 1;
+        MEGDNN_MARK_USED_VAR(patch);
+        if (inner) {
+            const int8_t* input_ptr =
+                    input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4;
+            for (size_t ico = 0; ico < 4; ++ico) {
+                int8x16_t v_input = vld1q_s8(input_ptr);
+                int16x8_t v_low = vmovl_s8(vget_low_s8(v_input));
+                int16x8_t v_high = vmovl_s8(vget_high_s8(v_input));
+                int32x4_t v_0 = vmovl_s16(vget_low_s16(v_low));
+                int32x4_t v_1 = vmovl_s16(vget_high_s16(v_low));
+                int32x4_t v_2 = vmovl_s16(vget_low_s16(v_high));
+                int32x4_t v_3 = vmovl_s16(vget_high_s16(v_high));
+
+                vst1q_f32(patchT + ico * 4 * alpha + 0 * 4,
+                          vcvtq_f32_s32(v_0));
+                vst1q_f32(patchT + ico * 4 * alpha + 1 * 4,
+                          vcvtq_f32_s32(v_1));
+                vst1q_f32(patchT + ico * 4 * alpha + 2 * 4,
+                          vcvtq_f32_s32(v_2));
+                vst1q_f32(patchT + ico * 4 * alpha + 3 * 4,
+                          vcvtq_f32_s32(v_3));
+                input_ptr += IW * 4;
+            }
+        } else {
+            if (PH > 0 || PW > 0) {
+                memset(patchT, 0, sizeof(float) * 4 * alpha * alpha);
+            }
+            InputGetter<const int8_t*, float32x4_t> getter;
+            const int8_t* input_ptr = input + ic * IH * IW;
+            int ih0_act = std::max<int>(ih_start, 0),
+                ih1_act = std::min<int>(ih_start + alpha, IH),
+                iw0_act = std::max<int>(iw_start, 0),
+                iw1_act = std::min<int>(iw_start + alpha, IW);
+            // partial copy
+            for (int ih = ih0_act; ih < ih1_act; ++ih) {
+                for (int iw = iw0_act; iw < iw1_act; ++iw) {
+                    size_t iho = ih - ih_start, iwo = iw - iw_start;
+                    vst1q_f32(patchT + iho * alpha * 4 + iwo * 4,
+                              getter(input_ptr + ih * IW * 4 + iw * 4));
+                }
+            }
+        }
+    }
+
+    static void transform(const float* patchT, float* input_transform_buf,
+                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
+                          size_t IC) {
+        constexpr size_t alpha = 2 + 3 - 1;
+        // BT * d * B
+#define cb(m, n)               \
+    Vector<float, 4> d##m##n = \
+            Vector<float, 4>::load(patchT + m * 4 * 4 + n * 4);
+
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+
+        //! 1   0 -1 0    d00 d01 d02 d03     1 0  0  0
+        //! 0   1  1 0    d10 d11 d12 d13     0 1 -1 -1
+        //! 0  -1  1 0    d20 d21 d22 d23    -1 1  1  0
+        //! 0  -1  0 1    d30 d31 d32 d33     0 0  0  1
+#define cb(m)                   \
+    auto t0##m = d0##m - d2##m; \
+    auto t1##m = d1##m + d2##m; \
+    auto t2##m = d2##m - d1##m; \
+    auto t3##m = d3##m - d1##m;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+#define cb(m)                    \
+    d##m##0 = t##m##0 - t##m##2; \
+    d##m##1 = t##m##1 + t##m##2; \
+    d##m##2 = t##m##2 - t##m##1; \
+    d##m##3 = t##m##3 - t##m##1;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+        size_t ICB = IC / 4;
+        size_t icb = ic / 4;
+#define cb(m, n)                                                \
+    d##m##n.save(input_transform_buf +                          \
+                 (m * alpha + n) * ICB * nr_units_in_tile * 4 + \
+                 icb * nr_units_in_tile * 4 + unit_idx * 4);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+    }
+};
+
+template <BiasMode bmode, typename Op>
+struct OutputTransform2X3 {
+    static void transform(const float* output_transform_buf, const float* bias,
+                          int8_t* output, float* transform_mid_buf,
+                          size_t oh_start, size_t ow_start, size_t OH,
+                          size_t OW, size_t oc_start, size_t oc_end,
+                          size_t oc_index, size_t unit_idx,
+                          size_t nr_units_in_tile, const DType& src_dtype,
+                          const DType& filter_dtype, const DType& dst_dtype) {
+        float scale_filter = 0.f;
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) {
+            scale_filter = filter_dtype.param<dtype::QuantizedS8>().scale;
+        } else if (filter_dtype.enumv() == DTypeEnum::QuantizedS32) {
+            megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS32);
+            scale_filter = filter_dtype.param<dtype::QuantizedS32>().scale;
+        }
+        float input_filter_scale =
+                src_dtype.param<dtype::QuantizedS8>().scale * scale_filter;
+        DType buffer_dtype = dtype::QuantizedS32(input_filter_scale);
+        Op op(buffer_dtype, dst_dtype);
+
+        //! AT * m * A
+        constexpr size_t alpha = 2 + 3 - 1;
+
+        size_t oc = oc_start + oc_index;
+        size_t OCB = (oc_end - oc_start) / 4;
+        size_t ocb = oc_index / 4;
+
+#define cb(m, n)                                           \
+    auto v##m##n = Vector<float, 4>::load(                 \
+            output_transform_buf +                         \
+            (m * alpha + n) * OCB * nr_units_in_tile * 4 + \
+            ocb * nr_units_in_tile * 4 + unit_idx * 4);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //! 1  1  1 0  v00 v01 v02 v03    1  0
+        //! 0  1 -1 1  v10 v11 v12 v13    1  1
+        //!            v20 v21 v22 v23    1 -1
+        //!            v30 v31 v32 v33    0  1
+#define cb(m)                           \
+    auto t0##m = v0##m + v1##m + v2##m; \
+    auto t1##m = v1##m - v2##m + v3##m;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+        Vector<float, 4> result[2][2];
+
+        result[0][0] = t00 + t01 + t02;
+        result[1][0] = t10 + t11 + t12;
+        result[0][1] = t01 - t02 + t03;
+        result[1][1] = t11 - t12 + t13;
+
+        const int32_t* tmp_bias =
+                static_cast<const int32_t*>(static_cast<const void*>(bias));
+        Vector<float, 4> vbias;
+        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+            const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32(tmp_bias + oc));
+            vbias = Vector<float, 4>(vvbias);
+
+          result[0][0] += vbias;
+          result[0][1] += vbias;
+          result[1][0] += vbias;
+          result[1][1] += vbias;
+        }
+
+#undef cb
+
+#if MEGDNN_AARCH64
+        int32_t* tmp_ouput = static_cast<int32_t*>(static_cast<void*>(output));
+#endif
+        for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) {
+            for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) {
+                size_t oh = oh_start + oho;
+                size_t ow = ow_start + owo;
+
+                Vector<float, 4> res;
+                res = result[oho][owo];
+                if (bmode == BiasMode::BIAS) {
+                    const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32(
+                            tmp_bias + oc * OH * OW + oh * OW * 4 + ow * 4));
+                    res += Vector<float, 4>(vvbias);
+                }
+#if MEGDNN_AARCH64
+                int8x8_t v_res = op(res.value);
+                tmp_ouput[oc * OH * OW / 4 + oh * OW + ow] =
+                        vget_lane_s32(vreinterpret_s32_s8(v_res), 0);
+#else
+                //! armv7 using neon there is some error ,so using scalar
+                //! compute
+                dt_qint8 res_int8 = dt_qint8(0);
+#define cb(i)                                               \
+    res_int8 = op(dt_qint32(vgetq_lane_f32(res.value, i))); \
+    output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8();
+                UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#endif
+            }
+        }
+    }
+};
+}  // namespace
+
+namespace megdnn {
+namespace arm_common {
+namespace winograd {
+
+MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_4x4_s8_f32_nchw44)
+void winograd_2x3_4x4_s8_f32_nchw44::filter(const int8_t* filter,
+                                 float* filter_transform_buf,
+                                 float* transform_mid_buf, size_t OC, size_t IC,
+                                 size_t oc_start, size_t oc_end) {
+    constexpr int alpha = 2 + 3 - 1;
+
+    /**
+     * origin: (4x3) * (3 x 3) * (3 x 4)
+     */
+    //! 1      0    0    v00 v01 v02   1 0.5  0.5 0
+    //! 0.5  0.5  0.5    v10 v11 v12   0 0.5 -0.5 0
+    //! 0.5 -0.5  0.5    v20 v21 v22   0 0.5  0.5 1
+    //! 0      0    1
+
+    InputGetter<const int8_t*, float32x4_t> getter;
+    MEGDNN_MARK_USED_VAR(transform_mid_buf);
+    megdnn_assert((oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 &&
+                          oc_end % 4 == 0 && IC % 4 == 0 && OC % 4 == 0,
+                  "Winograd filter transform input param is not times of 4!");
+    size_t OCB = OC / 4;
+    size_t ICB = IC / 4;
+
+    for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) {
+        for (size_t icb = 0; icb < ICB; icb++) {
+            for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) {
+                const int8_t* fptr = filter + (ocb * ICB + icb) * 3 * 3 * 4 * 4 +
+                                    ic_inner * 4;
+
+#define cb(m, n)               \
+    Vector<float, 4> g##m##n = \
+            Vector<float, 4>(getter(fptr + (m * 3 + n) * 4 * 4));
+
+                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
+#undef cb
+
+#define FILTER_TRANSFORM(n, wd, g)    \
+    auto wd##n##0 = g##0##n;          \
+    tmp0 = (g##0##n + g##2##n) * 0.5; \
+    tmp1 = g##1##n * 0.5;             \
+    auto wd##n##1 = tmp0 + tmp1;      \
+    auto wd##n##2 = tmp0 - tmp1;      \
+    auto wd##n##3 = g##2##n;
+                Vector<float, 4> tmp0, tmp1;
+                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
+                UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
+#undef FILTER_TRANSFORM
+
+
+#define cb(m, n)                                                             \
+    ret##m##n.save(filter_transform_buf +                                    \
+                   (m * alpha + n) * OCB * ICB * 4 * 4 + ocb * ICB * 4 * 4 + \
+                   icb * 4 * 4 + ic_inner * 4);
+                UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+            }
+        }
+    }
+}
+
+void winograd_2x3_4x4_s8_f32_nchw44::input(const int8_t* input, float* input_transform_buf,
+                                float* transform_mid_buf, size_t IH, size_t IW,
+                                size_t IC, size_t PH, size_t PW,
+                                size_t unit_start_idx,
+                                size_t nr_units_in_tile) {
+    megdnn_assert(IC % 4 == 0);
+    constexpr int alpha = 3 + 2 - 1;
+
+    auto units_w =
+            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
+    float* patch = transform_mid_buf;
+    float* patchT = transform_mid_buf + 4 * alpha * alpha;
+
+    for (size_t ic = 0; ic < IC; ic += 4) {
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            size_t nh = index / units_w;
+            size_t nw = index % units_w;
+            int ih_start = nh * OUTPUT_BLOCK_SIZE - PH;
+            int iw_start = nw * OUTPUT_BLOCK_SIZE - PW;
+            if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) &&
+                iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) {
+                InputTransform2X3::prepare<true>(input, patch, patchT, ih_start,
+                                                 iw_start, IH, IW, ic, IC,PH,PW);
+                InputTransform2X3::transform(patchT, input_transform_buf,
+                                             unit_idx, nr_units_in_tile, ic,
+                                             IC);
+
+            } else {
+                InputTransform2X3::prepare<false>(input, patch, patchT,
+                                                  ih_start, iw_start, IH, IW,
+                                                  ic, IC,PH,PW);
+                InputTransform2X3::transform(patchT, input_transform_buf,
+                                             unit_idx, nr_units_in_tile, ic,
+                                             IC);
+            }
+        }
+    }
+}
+
+void winograd_2x3_4x4_s8_f32_nchw44::output(const float* output_transform_buf,
+                                 const float* bias, int8_t* output,
+                                 float* transform_mid_buf, BiasMode bmode,
+                                 NonlineMode nonline_mode, size_t OH, size_t OW,
+                                 size_t oc_start, size_t oc_end,
+                                 size_t unit_start_idx,
+                                 size_t nr_units_in_tile) {
+#define cb(_bmode, _nonline_op, ...) \
+    OutputTransform2X3<_bmode MEGDNN_COMMA _nonline_op>::transform(__VA_ARGS__);
+
+    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
+    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
+        size_t oc_index = oc - oc_start;
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            auto nh = index / units_w;
+            auto nw = index % units_w;
+            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
+            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
+            DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED(
+                    megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23, cb,
+                    dt_qint32, dt_qint8, bmode, nonline_mode,
+                    output_transform_buf, bias, output, transform_mid_buf,
+                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index,
+                    unit_idx, nr_units_in_tile, src_dtype, filter_dtype,
+                    dst_dtype);
+        }
+    }
+#undef cb
+}
+
+}  // namespace winograd
+}  // namespace arm_common
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ca25e1d04d476f54bfb0dc5e259f3d9bd2a5965
--- /dev/null
+++ b/dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp
@@ -0,0 +1,404 @@
+/**
+ * \file dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "src/fallback/conv_bias/winograd/winograd.h"
+#include "src/naive/matrix_mul/matrix_mul_helper.h"
+
+#include "src/arm_common/conv_bias/winograd_common/winograd_common.h"
+#include "src/arm_common/elemwise_helper/op_unary.h"
+#include "src/arm_common/conv_bias/int8/strategy.h"
+#include "src/arm_common/conv_bias/int8/helper.h"
+#include "src/arm_common/simd_macro/marm_neon.h"
+#include "src/arm_common/utils.h"
+
+#include "src/common/winograd/winograd_generator.h"
+#include "src/common/unroll_macro.h"
+#include "src/common/utils.h"
+
+#include "midout.h"
+
+MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_int16_8x8)
+
+using namespace megdnn;
+using namespace arm_common;
+
+namespace {
+
+struct FilterTransform2X3_qs8 {
+    static void transform(const int8_t* filter_ptr, int16_t* filter_transform_buf,
+                          int16_t* transform_mid_buf, size_t OC, size_t IC,
+                          size_t oc_start, size_t oc_end) {
+        constexpr int alpha = 2 + 3 - 1;
+
+    /**
+     * origin: (4x3) * (3 x 3) * (3 x 4)
+     */
+    //! 1      0    0    v00 v01 v02   1 0.5  0.5 0
+    //! 0.5  0.5  0.5    v10 v11 v12   0 0.5 -0.5 0
+    //! 0.5 -0.5  0.5    v20 v21 v22   0 0.5  0.5 1
+    //! 0      0    1
+
+    //! 2   0  0    v00 v01 v02   2 1  1 0
+    //! 1   1  1    v10 v11 v12   0 1 -1 0
+    //! 1  -1  1    v20 v21 v22   0 1  1 2
+    //! 0   0  2
+        //! G * g * GT
+
+        InputGetter<const int8_t*, int16x4_t> getter;
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        megdnn_assert(
+                (oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 &&
+                        oc_end % 4 == 0 && IC % 8 == 0 && OC % 8 == 0,
+                "Winograd filter transform input param is not times of 8!");
+        size_t OCB = OC / 8;
+        size_t ICB = IC / 8;
+        size_t ICB4 = IC / 4;
+        for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) {
+            size_t tmp_ocb = ocb / 2;
+            size_t index = ((ocb & 1) == 0) ? 0 : 1;
+            for (size_t icb = 0; icb < ICB4; icb++) {
+                for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) {
+                    const int8_t* fptr = filter_ptr +
+                                         (ocb * ICB4 + icb) * 3 * 3 * 4 * 4 +
+                                         ic_inner * 4;
+#define cb(m, n)                 \
+    Vector<int16_t, 4> g##m##n = \
+            Vector<int16_t, 4>(getter(fptr + (m * 3 + n) * 4 * 4));
+
+                    UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
+#undef cb
+
+#define FILTER_TRANSFORM(n, wd, g)   \
+    auto wd##n##0 = g##0##n * 2;     \
+    v_tmp = g##0##n + g##2##n;       \
+    auto wd##n##1 = v_tmp + g##1##n; \
+    auto wd##n##2 = v_tmp - g##1##n; \
+    auto wd##n##3 = g##2##n * 2;
+                    Vector<int16_t, 4> v_tmp;
+                    UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
+                    UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
+#undef FILTER_TRANSFORM
+
+#define cb(m, n)                                                              \
+    ret##m##n.save(                                                           \
+            filter_transform_buf + (m * alpha + n) * OCB * ICB * 8 * 8 +      \
+            tmp_ocb * ICB * 8 * 8 + icb * 4 * 8 + ic_inner * 8 + index * 4);
+
+                    UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+                }
+            }
+        }
+    }
+};
+
+struct InputTransform2X3_qs8 {
+    template <bool inner>
+    static void prepare(const int8_t* input, int16_t* patch, int16_t* patchT,
+                        int ih_start, int iw_start, size_t IH, size_t IW,
+                        size_t ic, size_t IC, size_t PH, size_t PW) {
+        megdnn_assert(ic % 8 == 0 && IC % 8 == 0,
+                      "Winograd input prepare param is not times of 4!");
+        MEGDNN_MARK_USED_VAR(patch);
+        constexpr size_t alpha = 2 + 3 - 1;
+        if (inner) {
+            const int8_t* input_ptr =
+                    input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4;
+            for (size_t ico = 0; ico < alpha; ++ico) {
+                int8x16_t v_input0 = vld1q_s8(input_ptr);  // c0123
+                int8x16_t v_input1 =
+                        vld1q_s8(input_ptr + IH * IW * 4);  // c4567
+                int32x4_t v32_00 = vreinterpretq_s32_s8(v_input0);
+                int32x4_t v32_01 = vreinterpretq_s32_s8(v_input1);
+
+                int32x4x2_t v_trn = vtrnq_s32(v32_00, v32_01);  // c01234567
+
+                v_input0 = vreinterpretq_s8_s32(v_trn.val[0]);
+                v_input1 = vreinterpretq_s8_s32(v_trn.val[1]);
+
+                int16x8_t v0_low = vmovl_s8(vget_low_s8(v_input0));
+                int16x8_t v0_high = vmovl_s8(vget_high_s8(v_input0));
+                int16x8_t v1_low = vmovl_s8(vget_low_s8(v_input1));
+                int16x8_t v1_high = vmovl_s8(vget_high_s8(v_input1));
+
+                vst1q_s16(patchT + ico * 8 * alpha + 0 * 8, v0_low);
+                vst1q_s16(patchT + ico * 8 * alpha + 1 * 8, v1_low);
+                vst1q_s16(patchT + ico * 8 * alpha + 2 * 8, v0_high);
+                vst1q_s16(patchT + ico * 8 * alpha + 3 * 8, v1_high);
+                input_ptr += IW * 4;  // next row
+            }
+        } else {
+            if (PH || PW) {
+                memset(patchT, 0, sizeof(int16_t) * 8 * alpha * alpha);
+            }
+            InputGetter<const int8_t*, int16x4_t> getter;
+            const int8_t* input_ptr = input + ic * IH * IW;
+            int ih0_act = std::max<int>(ih_start, 0),
+                ih1_act = std::min<int>(ih_start + alpha, IH),
+                iw0_act = std::max<int>(iw_start, 0),
+                iw1_act = std::min<int>(iw_start + alpha, IW);
+            // partial copy
+            for (int ih = ih0_act; ih < ih1_act; ++ih) {
+                for (int iw = iw0_act; iw < iw1_act; ++iw) {
+                    size_t iho = ih - ih_start, iwo = iw - iw_start;
+                    vst1q_s16(patchT + iho * alpha * 8 + iwo * 8,
+                              vcombine_s16(
+                                      getter(input_ptr + ih * IW * 4 + iw * 4),
+                                      getter(input_ptr + IH * IW * 4 +
+                                             ih * IW * 4 + iw * 4)));
+                }
+            }
+        }
+    }
+
+    static void transform(const int16_t* patchT, int16_t* input_transform_buf,
+                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
+                          size_t IC) {
+        constexpr size_t alpha = 2 + 3 - 1;
+        // BT * d * B
+#define cb(m, n)                 \
+    Vector<int16_t, 8> d##m##n = \
+            Vector<int16_t, 8>::load(patchT + m * 4 * 8 + n * 8);
+
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+
+        //! 1   0 -1 0    d00 d01 d02 d03     1 0  0  0
+        //! 0   1  1 0    d10 d11 d12 d13     0 1 -1 -1
+        //! 0  -1  1 0    d20 d21 d22 d23    -1 1  1  0
+        //! 0  -1  0 1    d30 d31 d32 d33     0 0  0  1
+#define cb(m)                   \
+    auto t0##m = d0##m - d2##m; \
+    auto t1##m = d1##m + d2##m; \
+    auto t2##m = d2##m - d1##m; \
+    auto t3##m = d3##m - d1##m;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+#define cb(m)                    \
+    d##m##0 = t##m##0 - t##m##2; \
+    d##m##1 = t##m##1 + t##m##2; \
+    d##m##2 = t##m##2 - t##m##1; \
+    d##m##3 = t##m##3 - t##m##1;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+        size_t ICB = IC / 8;
+        size_t icb = ic / 8;
+#define cb(m, n)                                                \
+    d##m##n.save(input_transform_buf +                          \
+                 (m * alpha + n) * ICB * nr_units_in_tile * 8 + \
+                 icb * nr_units_in_tile * 8 + unit_idx * 8);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+    }
+};
+
+template <BiasMode bmode, typename Op>
+struct OutputTransform2X3_qs8 {
+    static void transform(const int32_t* output_transform_buf,
+                          const int32_t* bias, int8_t* output,
+                          int32_t* transform_mid_buf, size_t oh_start,
+                          size_t ow_start, size_t OH, size_t OW,
+                          size_t oc_start, size_t oc_end, size_t oc_index,
+                          size_t unit_idx, size_t nr_units_in_tile,
+                          const DType& src_dtype, const DType& filter_dtype,
+                          const DType& dst_dtype) {
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        float scale_filter = 0.f;
+        if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) {
+            scale_filter = filter_dtype.param<dtype::QuantizedS8>().scale;
+        } else {
+            megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS16);
+            scale_filter = filter_dtype.param<dtype::QuantizedS16>().scale;
+        }
+        float input_filter_scale =
+                src_dtype.param<dtype::QuantizedS8>().scale * scale_filter;
+        DType buffer_dtype = dtype::QuantizedS32(input_filter_scale * 0.5f *
+                                                 0.5f * 1.0f * 1.0f);
+        Op op(buffer_dtype, dst_dtype);
+        //! AT * m * A
+        constexpr size_t alpha = 2 + 3 - 1;
+
+        size_t oc = oc_start + oc_index;
+        size_t OCB = (oc_end - oc_start) / 8;
+        size_t ocb = oc_index / 8;
+
+#define cb(m, n)                                           \
+    auto v##m##n = Vector<int32_t, 8>::load(               \
+            output_transform_buf +                         \
+            (m * alpha + n) * OCB * nr_units_in_tile * 8 + \
+            ocb * nr_units_in_tile * 8 + unit_idx * 8);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //! 1  1  1 0  v00 v01 v02 v03    1  0
+        //! 0  1 -1 1  v10 v11 v12 v13    1  1
+        //!            v20 v21 v22 v23    1 -1
+        //!            v30 v31 v32 v33    0  1
+#define cb(m)                           \
+    auto t0##m = v0##m + v1##m + v2##m; \
+    auto t1##m = v1##m - v2##m + v3##m;
+
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+
+
+       Vector<int32_t, 8> result[2][2];
+
+       result[0][0] = t00 + t01 + t02;
+       result[1][0] = t10 + t11 + t12;
+       result[0][1] = t01 - t02 + t03;
+       result[1][1] = t11 - t12 + t13;
+
+       if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+           Vector<int32_t, 8> vbias;
+           vbias = Vector<int32_t, 8>::load(bias + oc) * 4;
+
+           result[0][0] += vbias;
+           result[0][1] += vbias;
+           result[1][0] += vbias;
+           result[1][1] += vbias;
+       }
+
+#if MEGDNN_AARCH64
+        int32_t* tmp_output = static_cast<int32_t*>(static_cast<void*>(output));
+#endif
+        for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) {
+            for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) {
+                size_t oh = oh_start + oho;
+                size_t ow = ow_start + owo;
+                Vector<int32_t, 8> res = result[oho][owo];
+                if (bmode == BiasMode::BIAS) {
+                    int32x4x2_t vbias;
+                    vbias.val[0] = vld1q_s32(bias + oc * OH * OW + oh * OW * 4 +
+                                             ow * 4);
+                    vbias.val[1] = vld1q_s32(bias + (oc + 4) * OH * OW +
+                                             oh * OW * 4 + ow * 4);
+                    res += Vector<int32_t, 8>(vbias) * 4;
+                }
+#if MEGDNN_AARCH64
+                int8x8_t res_int8 = op(res.value);
+                int32x2_t res32 = vreinterpret_s32_s8(res_int8);
+                tmp_output[oc / 4 * OH * OW + oh * OW + ow] =
+                        vget_lane_s32(res32, 0);
+                tmp_output[(oc / 4 + 1) * OH * OW + oh * OW + ow] =
+                        vget_lane_s32(res32, 1);
+#else
+
+                dt_qint8 res_int8 = dt_qint8(0);
+#define cb(i)                                                             \
+    res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[0], i)));        \
+    output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8(); \
+    res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[1], i)));        \
+    output[(oc + 4) * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8();
+                UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#endif
+            }
+        }
+#undef cb
+    }
+};
+}  // namespace
+
+namespace megdnn {
+namespace arm_common {
+namespace winograd {
+
+MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_8x8_s8_nchw44)
+
+void winograd_2x3_8x8_s8_nchw44::filter(const int8_t* filter,
+                                 int16_t* filter_transform_buf,
+                                 int16_t* transform_mid_buf, size_t OC,
+                                 size_t IC, size_t oc_start, size_t oc_end) {
+    FilterTransform2X3_qs8::transform(filter, filter_transform_buf,
+                                      transform_mid_buf, OC, IC, oc_start,
+                                      oc_end);
+}
+
+void winograd_2x3_8x8_s8_nchw44::input(const int8_t* input,
+                                int16_t* input_transform_buf,
+                                int16_t* transform_mid_buf, size_t IH,
+                                size_t IW, size_t IC, size_t PH, size_t PW,
+                                size_t unit_start_idx,
+                                size_t nr_units_in_tile) {
+    megdnn_assert(IC % 8 == 0);
+    constexpr int alpha = 3 + 2 - 1;
+    auto units_w = div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
+    int16_t* patch = transform_mid_buf;
+    int16_t* patchT = transform_mid_buf;// + 8 * alpha * alpha;
+
+    for (size_t ic = 0; ic < IC; ic += 8) {
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            size_t nh = index / units_w;
+            size_t nw = index % units_w;
+            int ih_start = nh * OUTPUT_BLOCK_SIZE - PH;
+            int iw_start = nw * OUTPUT_BLOCK_SIZE - PW;
+            if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) &&
+                iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) {
+                InputTransform2X3_qs8::prepare<true>(input, patch, patchT,
+                                                     ih_start, iw_start, IH, IW,
+                                                     ic, IC,PH,PW);
+                InputTransform2X3_qs8::transform(patchT, input_transform_buf,
+                                                 unit_idx, nr_units_in_tile, ic,
+                                                 IC);
+
+            } else {
+                InputTransform2X3_qs8::prepare<false>(input, patch, patchT,
+                                                      ih_start, iw_start, IH,
+                                                      IW, ic, IC,PH,PW);
+                InputTransform2X3_qs8::transform(patchT, input_transform_buf,
+                                                 unit_idx, nr_units_in_tile, ic,
+                                                 IC);
+            }
+        }
+    }
+}
+
+void winograd_2x3_8x8_s8_nchw44::output(const int* output_transform_buf,
+                                 const int* bias, int8_t* output,
+                                 int* transform_mid_buf, BiasMode bmode,
+                                 NonlineMode nonline_mode, size_t OH, size_t OW,
+                                 size_t oc_start, size_t oc_end,
+                                 size_t unit_start_idx, 
+                                 size_t nr_units_in_tile) {
+#define cb(_bmode, _nonline_op, ...)                                    \
+    OutputTransform2X3_qs8<_bmode MEGDNN_COMMA _nonline_op>::transform( \
+            __VA_ARGS__);
+    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
+    for (size_t oc = oc_start; oc < oc_end; oc += 8) {
+        size_t oc_index = oc - oc_start;
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            auto nh = index / units_w;
+            auto nw = index % units_w;
+            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
+            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
+            DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED(
+                    megdnn_arm_common_winograd_nchw44_s8_int16_8x8, cb,
+                    dt_qint32, dt_qint8, bmode, nonline_mode,
+                    output_transform_buf, bias, output, transform_mid_buf,
+                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index,
+                    unit_idx, nr_units_in_tile, src_dtype, filter_dtype,
+                    dst_dtype);
+        }
+    }
+#undef cb
+}
+
+}  // namespace winograd
+}  // namespace arm_common
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/arm_common/conv_bias/opr_impl.cpp b/dnn/src/arm_common/conv_bias/opr_impl.cpp
index a1bab1b2d954aa0ed242a4a875ce4c788248becc..f8fd4f204f4f85f5baad3468e7bef2acab51fcaf 100644
--- a/dnn/src/arm_common/conv_bias/opr_impl.cpp
+++ b/dnn/src/arm_common/conv_bias/opr_impl.cpp
@@ -201,6 +201,14 @@ public:
                         static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                         tile_size));
                 winograd_algos.emplace_back(refhold.back().get());
+                refhold.emplace_back(new AlgoS8CF32WinogradF23_4x4_NCHW44(
+                        static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                        tile_size));
+                winograd_algos.emplace_back(refhold.back().get());
+                refhold.emplace_back(new AlgoS8WinogradF23_8x8_NCHW44(
+                        static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                        tile_size));
+                winograd_algos.emplace_back(refhold.back().get());
             }
         }
     }
diff --git a/dnn/src/arm_common/conv_bias/opr_impl.h b/dnn/src/arm_common/conv_bias/opr_impl.h
index e1451126ac8879ffd3e19e8118b594b0d8c2317f..1e92a0fb0605d29d3c168c6c19abee6cd4f08f52 100644
--- a/dnn/src/arm_common/conv_bias/opr_impl.h
+++ b/dnn/src/arm_common/conv_bias/opr_impl.h
@@ -79,6 +79,8 @@ private:
     class AlgoI8x8x16Stride2;
     class AlgoI8x8x16Stride2Filter2;
     class AlgoS8WinogradF23_8x8;
+    class AlgoS8CF32WinogradF23_4x4_NCHW44;
+    class AlgoS8WinogradF23_8x8_NCHW44;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     class AlgoF16Direct;
     class AlgoF16DirectStride1;
diff --git a/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h b/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h
index d1e43466c4772d189ccc92d9f22a5365130fe226..345935a2d4712355108d72015f912e1d1d0601dc 100644
--- a/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h
+++ b/dnn/src/arm_common/conv_bias/winograd_common/winograd_common.h
@@ -37,6 +37,13 @@ struct InputGetter<const uint8_t*, uint16x4_t> {
         return vget_low_u16(vmovl_u8(vld1_u8(ptr))) - zp;
     }
 };
+
+template <>
+struct InputGetter<const int8_t*, float32x4_t> {
+    float32x4_t operator()(const int8_t* ptr) {
+        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vld1_s8(ptr)))));
+    }
+};
 }  // namespace
 }  // namespace arm_common
 }  // namespace megdnn
diff --git a/dnn/src/arm_common/elemwise_helper/kimpl/relu.h b/dnn/src/arm_common/elemwise_helper/kimpl/relu.h
index 76949ddbe44e12d50778a2659de29856bbe39f3f..cd52b5ba77e65cd9c70532e8ddd8b84f0b52d9e8 100644
--- a/dnn/src/arm_common/elemwise_helper/kimpl/relu.h
+++ b/dnn/src/arm_common/elemwise_helper/kimpl/relu.h
@@ -189,6 +189,11 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> {
         vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
         return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
+        return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+    }
 };
 #else
 template <>
@@ -215,12 +220,25 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>,
         return vqmovn_s16(vcombine_s16(vqmovn_s32(vrshlq_s32(vitem0, vshift)),
                                        vqmovn_s32(vrshlq_s32(vitem1, vshift))));
     }
+    int8x8_t operator()(const float32x4_t& vsrc) const {
+        int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(vsrc), vmultiplier);
+        vitem0 = vmaxq_s32(vitem0, QConverterBase::vzero());
+        vitem0 = vrshlq_s32(vitem0, vshift);
+        int16x4_t vitem = vqmovn_s32(vitem0);
+        return vqmovn_s16(vcombine_s16(vitem, vitem));
+    }
     void operator()(const int32x4_t& src, dt_qint8* dst) const {
         auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale);
         vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
         auto result = QConverter::convert<int8x8_t, float32x4_t>(vitem0);
         vst1_lane_s32(reinterpret_cast<int32_t*>(dst), (int32x2_t)result, 0);
     }
+    void operator()(const float32x4_t& src, dt_qint8* dst) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
+        auto result = QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+        vst1_lane_s32(reinterpret_cast<int32_t*>(dst), (int32x2_t)result, 0);
+    }
 };
 #endif
 
diff --git a/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h b/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h
index 3104b0eef29fece29383fd5aab1138a886e9a35c..c52a8c5db777bb1b77c1cd388e745dd777462712 100644
--- a/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h
+++ b/dnn/src/arm_common/elemwise_helper/kimpl/typecvt.h
@@ -50,6 +50,10 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
         auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale);
         return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+    }
 };
 #else
 template <>
@@ -95,6 +99,13 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8>,
         int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift));
         return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16));
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(src), vmultiplier);
+        auto fixup0 = vshrq_n_s32(vitem0, 31);
+        vitem0 = vqaddq_s32(vitem0, fixup0);
+        int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift));
+        return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16));
+    }
 };
 #endif
 
diff --git a/dnn/src/arm_common/utils.h b/dnn/src/arm_common/utils.h
index 3b790fbe65b1414be3db05ee0e99abee8e7b96d9..d5d368fccbbe42d89a7858b3f2d9af6612b323a1 100644
--- a/dnn/src/arm_common/utils.h
+++ b/dnn/src/arm_common/utils.h
@@ -369,6 +369,72 @@ struct Vector<int16_t, 8> {
     }
 };
 
+template <>
+struct Vector<int16_t, 4> {
+    int16x4_t value;
+    Vector() {}
+    Vector(const int16_t v) { value = vdup_n_s16(v); }
+    Vector(const Vector& lr) { value = lr.value; }
+    Vector(const Vector&& lr) { value = std::move(lr.value); }
+    Vector(const int16x4_t& v) { value = v; }
+    static Vector load(const int16_t* addr) {
+        Vector v;
+        v.value = vld1_s16(addr);
+        return v;
+    }
+    static void save(int16_t* addr, const Vector& v) {
+        vst1_s16(addr, v.value);
+    }
+    void save(int16_t* addr) { save(addr, *this); }
+    Vector operator+(const Vector& lr) {
+        Vector dst;
+        dst.value = vadd_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator+=(const Vector& lr) {
+        value = vadd_s16(value, lr.value);
+        return *this;
+    }
+    Vector operator-(const Vector& lr) {
+        Vector dst;
+        dst.value = vsub_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator-=(const Vector& lr) {
+        value = vsub_s16(value, lr.value);
+        return *this;
+    }
+    Vector operator*(int16_t lr) {
+        Vector dst;
+        dst.value = vmul_n_s16(value, lr);
+        return dst;
+    }
+    Vector operator*(const Vector& lr) {
+        Vector dst;
+        dst.value = vmul_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator*=(const Vector& lr) {
+        value = vmul_s16(value, lr.value);
+        return *this;
+    }
+    Vector& operator=(const Vector& lr) {
+        value = lr.value;
+        return *this;
+    }
+    Vector& operator=(const Vector&& lr) {
+        value = std::move(lr.value);
+        return *this;
+    }
+    Vector operator-() {
+        Vector dst;
+        dst.value = -value;
+        return dst;
+    }
+};
+
+
+
 template <>
 struct Vector<int32_t, 8> {
     int32x4x2_t value;
diff --git a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp
index 3842a65013bcb8c4af79fe51e25bb349a192478e..08f6d3d9bdd9afcb4bb7558d69a968fbbdd7dbaa 100644
--- a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp
+++ b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp
@@ -109,12 +109,36 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
     }
     if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
         const dt_int8* src_ptr = src.compatible_ptr<dt_int8>();
-        dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>();
-        dt_int16* workspace_ptr = workspace.ptr<dt_int16>();
-        if (FW == 3) {
-            if (m == 2) {
-                DISPATCH(winograd_2x3_8x8_s8, param::Winograd::Format::MK8, 1,
-                         0);
+        if (param().compute_mode == param::ConvBias::ComputeMode::DEFAULT) {
+            dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>();
+            dt_int16* workspace_ptr = workspace.ptr<dt_int16>();
+            if (FW == 3) {
+                if (m == 2) {
+                    if (pack_c_size == 1) {
+                        DISPATCH(winograd_2x3_8x8_s8,
+                                 param::Winograd::Format::MK8, 1, 0);
+                    } else if (pack_c_size == 4) {
+                        DISPATCH(winograd_2x3_8x8_s8_nchw44,
+                                 param::Winograd::Format::MK8, 1, 0);
+                    }else{
+                        megdnn_throw("only support pack_c_size = 1 or 4");
+                    }
+                }
+            }
+        } else {
+            dt_int32* dst_ptr_tmp = dst.compatible_ptr<dt_int32>();
+            dt_int32* workspace_ptr_tmp = workspace.ptr<dt_int32>();
+            float* dst_ptr = reinterpret_cast<float*>(dst_ptr_tmp);
+            float* workspace_ptr = reinterpret_cast<float*>(workspace_ptr_tmp);
+            if (pack_c_size == 4) {
+                if (FW == 3) {
+                    if (m == 2) {
+                        DISPATCH(winograd_2x3_4x4_s8_f32_nchw44,
+                                 param::Winograd::Format::MK4, 1, 1);
+                    }
+                }
+            } else {
+                megdnn_throw("only support pack_c_size == 4");
             }
         }
     }
diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp
index af5d6caa5f9d1dfeb9783010183024ddfeccf42e..c486a119320f45a4c1ec255f05e6e76b01d0cbd4 100644
--- a/dnn/src/common/conv_bias.cpp
+++ b/dnn/src/common/conv_bias.cpp
@@ -37,7 +37,9 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
          param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
          param().format == param::ConvBias::Format::NCHW44_WINOGRAD) &&
         src.dtype.category() == DTypeCategory::QUANTIZED) {
-        megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16);
+        megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16 ||
+                      //!int8 winogradf23_44 using float,QuantizedS32 take the scale
+                      filter.dtype.enumv() == DTypeEnum::QuantizedS32);
         megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
                       src.dtype.enumv() == DTypeEnum::Quantized8Asymm);
     } else {
@@ -49,7 +51,12 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
         if (param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
             param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
             param().format == param::ConvBias::Format::NCHW44_WINOGRAD) {
-            scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale;
+            if (filter.dtype.enumv() == DTypeEnum::QuantizedS32) {
+                //!int8 winogradf23_44 using float,QuantizedS32 take the scale
+                scale_filter = filter.dtype.param<dtype::QuantizedS32>().scale;
+            } else {
+                scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale;
+            }
         } else {
             scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale;
         }
diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp
index e346d4e6449e94cab792b0304472a631232c5509..914d23515a6e90bd7098aad7532483b5a82572ac 100644
--- a/dnn/src/common/convolution.cpp
+++ b/dnn/src/common/convolution.cpp
@@ -312,11 +312,14 @@ void make_canonized_filter_meta_nchwxx(
     size_t img_ndim = 2;
     size_t flt_start = 0;
     size_t flt_spatial_start = 2;
+    size_t pack_c_size = 0;
     if (param.sparse == Param::Sparse::DENSE) {
         if (filter.ndim == img_ndim + 4) {
             // oihw8i8o case
-            megdnn_assert(filter[filter.ndim - 2] == pack_size &&
-                                  filter[filter.ndim - 1] == pack_size,
+            megdnn_assert((filter[filter.ndim - 2] == pack_size &&
+                           filter[filter.ndim - 1] == pack_size) ||
+                                  (filter[filter.ndim - 2] == 2 * pack_size &&
+                                   filter[filter.ndim - 1] == 2 * pack_size),
                           "last 2 dim of filter must be %zu, but got %zu, %zu",
                           pack_size, filter[filter.ndim - 2],
                           filter[filter.ndim - 1]);
@@ -326,8 +329,14 @@ void make_canonized_filter_meta_nchwxx(
                 param.format == Param::Format::NCHW44_WINOGRAD) {
                 flt_start = 2;
             }
-            ret.ocpg = filter[flt_start] * pack_size;
-            ret.icpg = filter[flt_start + 1] * pack_size;
+            if (filter[filter.ndim - 2] == 2 * pack_size &&
+                filter[filter.ndim - 1] == 2 * pack_size) {
+                pack_c_size = 2 * pack_size;
+            } else {
+                pack_c_size = pack_size;
+            }
+            ret.ocpg = filter[flt_start] * pack_c_size;
+            ret.icpg = filter[flt_start + 1] * pack_c_size;
         } else if (filter.ndim == img_ndim + 3) {
             // ohwi8o
             megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD,
@@ -375,15 +384,23 @@ void make_canonized_filter_meta_nchwxx(
                           "bad filter ndim for group convolution: "
                           "spatial_ndim=%zu filter_ndim=%zu",
                           img_ndim, filter.ndim);
-            megdnn_assert(filter[filter.ndim - 1] == pack_size &&
-                                  filter[filter.ndim - 2] == pack_size,
+            megdnn_assert((filter[filter.ndim - 1] == pack_size &&
+                           filter[filter.ndim - 2] == pack_size) ||
+                           (filter[filter.ndim - 1] == 2 * pack_size &&
+                            filter[filter.ndim - 2] == 2 * pack_size),
                           "last 2 dim of filter must be %zu, but got %zu, %zu",
                           pack_size, filter[filter.ndim - 2],
                           filter[filter.ndim - 1]);
 
             ret.group = filter[0];
-            ret.ocpg = filter_oc * pack_size;
-            ret.icpg = filter_ic * pack_size;
+            if (filter[filter.ndim - 2] == 2 * pack_size &&
+                filter[filter.ndim - 1] == 2 * pack_size) {
+                ret.ocpg = filter_oc * 2 * pack_size;
+                ret.icpg = filter_ic * 2 * pack_size;
+            } else {
+                ret.ocpg = filter_oc * pack_size;
+                ret.icpg = filter_ic * pack_size;
+            }
         }
     }
     ret.spatial_ndim = 2;
@@ -596,8 +613,17 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
     } else if (src.enumv() == DTypeEnum::QuantizedS8 ||
                src.enumv() == DTypeEnum::Quantized8Asymm ||
                src.enumv() == DTypeEnum::Quantized4Asymm) {
-        supported_dst_dtype.push_back(
-                dtype::QuantizedS32(mul_scale(src, filter)));
+        //! Qint8 winograd compute with float, in order to bringing the filter
+        //! scale, here just use QuantizedS32 as filter type.
+        if (src.enumv() == DTypeEnum::QuantizedS8 &&
+            filter.enumv() == DTypeEnum::QuantizedS32) {
+            supported_dst_dtype.push_back(dtype::QuantizedS32(
+                    src.param<dtype::QuantizedS8>().scale *
+                    filter.param<dtype::QuantizedS32>().scale));
+        } else {
+            supported_dst_dtype.push_back(
+                    dtype::QuantizedS32(mul_scale(src, filter)));
+        }
         if (dst.valid() && dst.enumv() == src.enumv()) {
             supported_dst_dtype.push_back(dst);
         }
@@ -625,12 +651,13 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
         megdnn_assert(dst_supported, "unsupported Conv(%s, %s) -> %s",
                       src.name(), filter.name(), dst.name());
     }
-    megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32
+    megdnn_assert((param().compute_mode == Param::ComputeMode::FLOAT32 ||
+                   param().compute_mode == Param::ComputeMode::DEFAULT)
 #if !MEGDNN_DISABLE_FLOAT16
-                          || src.enumv() == DTypeEnum::Float16
-                          || src.enumv() == DTypeEnum::BFloat16
+                          || src.enumv() == DTypeEnum::Float16 ||
+                          src.enumv() == DTypeEnum::BFloat16
 #endif
-                          ,
+                  ,
                   "ComputeMode::FLOAT32 is only available for Float16/BFloat16 "
                   "input / output.");
 }
@@ -645,10 +672,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
     megdnn_assert_contiguous(src);
     megdnn_assert_contiguous(filter);
     megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str());
-    if (param().format == Param::Format::NCHW_WINOGRAD &&
+    if ((param().format == Param::Format::NCHW_WINOGRAD ||
+         param().format == Param::Format::NCHW44_WINOGRAD) &&
         src.dtype.category() == DTypeCategory::QUANTIZED) {
-        megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16, "%s",
-                      errmsg().c_str());
+        megdnn_assert((filter.dtype.enumv() == DTypeEnum::QuantizedS16 ||
+                       filter.dtype.enumv() == DTypeEnum::QuantizedS32),
+                      "%s", errmsg().c_str());
         megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
                               src.dtype.enumv() == DTypeEnum::Quantized8Asymm,
                       "%s", errmsg().c_str());
@@ -741,14 +770,18 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
         if (param().format == Param::Format::NCHW44 ||
             param().format == Param::Format::NCHW44_DOT ||
             param().format == Param::Format::NCHW44_WINOGRAD) {
+            //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul
             megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
                            filter[filter.ndim - 1] == 4) ||
                                   (src.ndim == 5 &&
                                    ((filter.ndim == 6 &&
-                                     filter[filter.ndim - 1] == 4) ||
+                                     (filter[filter.ndim - 1] == 4 ||
+                                      filter[filter.ndim - 1] == 8)) ||
                                     (filter.ndim == 7 &&
-                                     filter[filter.ndim - 1] == 4 &&
-                                     filter[filter.ndim - 2] == 4)) &&
+                                     (filter[filter.ndim - 1] == 4 ||
+                                      filter[filter.ndim - 1] == 8) &&
+                                     (filter[filter.ndim - 2] == 4 ||
+                                      filter[filter.ndim - 2] == 8))) &&
                                    src[src.ndim - 1] == 4),
                           "NCHW44 require src ndim is 5 and filter's ndim is 6 "
                           ", and last shape two is 4 but got src %s, filter %s",
diff --git a/dnn/src/common/winograd/winograd_helper.cpp b/dnn/src/common/winograd/winograd_helper.cpp
index 10ba9af1541a625b3dcf01b98d181861a1d949cd..0e3cfa5b3e26cc421d5ed58e3b7ffa1f090fbf02 100644
--- a/dnn/src/common/winograd/winograd_helper.cpp
+++ b/dnn/src/common/winograd/winograd_helper.cpp
@@ -67,8 +67,8 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) {
     switch (layout) {
         case param::ConvBias::Format::NHWCD4:
             return 4;
-        case param::ConvBias::Format::NCHW4:
         case param::ConvBias::Format::NCHW44:
+        case param::ConvBias::Format::NCHW4:
             return 4;
         case param::ConvBias::Format::NCHW32:
             return 32;
@@ -365,6 +365,7 @@ INST(uint8_t, uint8_t, int16_t, int)
             _output_compute_type, layout, param::MatrixMul::Format::MK4>;
 INST(float, float, float, float, param::ConvBias::Format::NCHW)
 INST(float, float, float, float, param::ConvBias::Format::NCHW44)
+INST(int8_t, int8_t, float, float, param::ConvBias::Format::NCHW44)
 #undef INST
 
 #define INST(_ctype, _dst_type, _input_filter_compute_type, \
@@ -373,6 +374,7 @@ INST(float, float, float, float, param::ConvBias::Format::NCHW44)
             _ctype, _dst_type, _input_filter_compute_type,  \
             _output_compute_type, layout, param::MatrixMul::Format::MK8>;
 INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW)
+INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW44)
 INST(float, float, float, float, param::ConvBias::Format::NCHW88)
 MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16,
                         param::ConvBias::Format::NCHW))
diff --git a/dnn/src/common/winograd_filter_preprocess.cpp b/dnn/src/common/winograd_filter_preprocess.cpp
index c3471ed99f144c00f9623c1313ff5abf123e3ae3..486aadb6694617d7f2a84f72955a2e14449a3cac 100644
--- a/dnn/src/common/winograd_filter_preprocess.cpp
+++ b/dnn/src/common/winograd_filter_preprocess.cpp
@@ -56,8 +56,16 @@ void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src,
     DType dst_type = src.dtype;
     if (src.dtype.category() == DTypeCategory::QUANTIZED) {
         megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8);
-        dst_type = dtype::QuantizedS16(
-                src.dtype.param<dtype::QuantizedS8>().scale);
+        if (param().compute_mode ==
+            param::ConvBias::ComputeMode::DEFAULT) {
+            //! input int8 compute short
+            dst_type = dtype::QuantizedS16(
+                    src.dtype.param<dtype::QuantizedS8>().scale);
+        } else {
+            //! input int8 compute float32
+            dst_type = dtype::QuantizedS32(
+                    src.dtype.param<dtype::QuantizedS8>().scale);
+        }
     }
 
     if (src.ndim == 4 || src.ndim == 6) {
@@ -123,8 +131,16 @@ size_t WinogradFilterPreprocess::get_workspace_in_bytes(
     if (src.dtype.category() == DTypeCategory::QUANTIZED) {
         megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 ||
                       src.dtype.enumv() == DTypeEnum::Quantized8Asymm);
-        output_compute_dtype = dtype::QuantizedS16(
-                src.dtype.param<dtype::QuantizedS8>().scale);
+        if (param().compute_mode ==
+            param::ConvBias::ComputeMode::DEFAULT) {
+            //! input int8 compute short
+            output_compute_dtype = dtype::QuantizedS16(
+                    src.dtype.param<dtype::QuantizedS8>().scale);
+        } else {
+            //! input int8 compute float32
+            output_compute_dtype = dtype::QuantizedS32(
+                    src.dtype.param<dtype::QuantizedS8>().scale);
+        }
     }
 
     size_t FW = src[3];
diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp
index ffa2190d37e0f7e7b3fdd892967daacb13eea589..87abe1d374df5be5738d9a0a1395d3601c590aa4 100644
--- a/dnn/src/naive/conv_bias/opr_impl.cpp
+++ b/dnn/src/naive/conv_bias/opr_impl.cpp
@@ -118,6 +118,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
         DISPATCH(QuantizedS8, QuantizedS32)
         DISPATCH(Quantized8Asymm, QuantizedS32)
         DISPATCH(Quantized4Asymm, QuantizedS32)
+        DISPATCH_RAW(QuantizedS8, QuantizedS32, QuantizedS32, FLOAT32,
+                     (convolution::forward_bias<dt_int8, dt_int8, dt_int32,
+                                                dt_int32>))
 #if !MEGDNN_DISABLE_FLOAT16
         DISPATCH(Float16, Float16)
         DISPATCH_RAW(Float16, Float16, Float16, FLOAT32,
diff --git a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
index 0ebf578d698d60a78a480a8e3b76787fa2297c48..26490bf153d7164240907a30ad131e660811a2ee 100644
--- a/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
+++ b/dnn/src/naive/winograd_filter_preprocess/opr_impl.cpp
@@ -171,7 +171,6 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
                 }
             }
 #undef cb
-#undef DISPATCH_FORMAT_MK8
 #undef DISPATCH_DTYPE
         }
         else if (pack_c_size == 4) {  //! NCHW44
@@ -195,6 +194,15 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
     if (src.layout.dtype.enumv() == DTypeEnum::Float32) {               \
         DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \
                         DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0);     \
+    }                                                                   \
+    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {           \
+        if (param().format == param::Winograd::Format::MK4) {           \
+            DISPATCH_KERNEL(dt_int8, dt_int8, dt_float32, dt_float32,   \
+                            DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \
+        } else if (param().format == param::Winograd::Format::MK8) {    \
+            DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32,       \
+                            DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 0); \
+        }                                                               \
     }
             if (FW == 3) {
                 if (m == 2) {
@@ -208,6 +216,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
             }
 #undef cb
 #undef DISPATCH_FORMAT_MK8
+#undef DISPATCH_FORMAT_MK4
 #undef DISPATCH_KERNEL
 #undef DISPATCH_DTYPE
         }
diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp
index 55c01ec6f4daeda4d64408d8cdf7fe6cd14d8b22..7651dd7ed94fb85a74dc576074121bc440d8f4ed 100644
--- a/dnn/test/arm_common/conv_bias.cpp
+++ b/dnn/test/arm_common/conv_bias.cpp
@@ -98,7 +98,8 @@ CB_TEST(H_SWISH);
 
 #if MEGDNN_WITH_BENCHMARK
 
-static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
+static void benchmark_convbias(Handle* handle, std::string int_name,
+                               std::string float_name, bool is_fp32 = false) {
     constexpr size_t RUNS = 30;
 
     Benchmarker<ConvBias> benchmarker_int(handle);
@@ -109,12 +110,12 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
             .set_dtype(4, dtype::QuantizedS8(60.25))
             .set_display(false);
     benchmarker_int.set_before_exec_callback(
-            conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+"));
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(int_name.c_str()));
 
     Benchmarker<ConvBias> benchmarker_float(handle);
     benchmarker_float.set_display(false).set_times(RUNS);
     benchmarker_float.set_before_exec_callback(
-            conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+"));
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(float_name.c_str()));
 
     Benchmarker<ConvBias> benchmarker_nchw44(handle);
     if (is_fp32) {
@@ -233,13 +234,24 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
         }
     }
 }
+
 TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) {
-    benchmark_convbias(handle(), true);
-    benchmark_convbias(handle(), false);
+#if MEGDNN_AARCH64
+    benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
+                       "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
+#else
+    benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384",
+                       "IM2COLMATMUL:ARMV7_F32:192", true);
+#endif
 }
 TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) {
-    benchmark_convbias(handle(), true);
-    benchmark_convbias(handle(), false);
+#if MEGDNN_AARCH64
+    benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
+                       "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true);
+#else
+    benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384",
+                       "IM2COLMATMUL:ARMV7_F32:192", true);
+#endif
 }
 
 #endif
@@ -506,7 +518,7 @@ void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
                computations / used_im2col, used / used_im2col);
     }
 }
-
+#if MEGDNN_AARCH64
 TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
     printf("=========================compare "
            "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, "
@@ -515,6 +527,7 @@ TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) {
                                     "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16",
                                     handle(), 3, 4);
 }
+#endif
 
 TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) {
     constexpr size_t RUNS = 50;
diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp
index 0f41a0e309f8283e2d83dd4e7c8b4465aab91910..2ca1c830934940f456a50863a5f24e955c0370b0 100644
--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -989,7 +989,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) {
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
             ssprintf("WINOGRAD:%s:8:2:32", matmul_name).c_str()));
 
-    std::vector<TestArg> args = get_winograd_mk_packed_args(8);
     std::vector<TestArg> quantized_args =
             get_quantized_winograd_mk_packed_args(8);
     UniformIntRNG int_rng{-50, 50};
@@ -999,6 +998,174 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) {
         dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
 }
 
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) {
+    using namespace conv_bias;
+
+    Checker<ConvBiasForward> checker(handle());
+    auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
+                          const std::vector<size_t>& out_size, DType A_dtype,
+                          DType B_dtype, DType C_dtype, DType D_dtype,
+                          param::MatrixMul::Format format, float eps) {
+        for (auto&& arg : args) {
+            for (uint32_t m : out_size) {
+              checker.set_extra_opr_impl(std::bind(
+                      winograd_algo_extra_impl, std::placeholders::_1, m,
+                      arg.param, handle, format));
+              checker.set_dtype(0, A_dtype)
+                      .set_dtype(1, B_dtype)
+                      .set_dtype(2, C_dtype)
+                      .set_dtype(4, D_dtype)
+                      .set_epsilon(eps)
+                      .set_param(arg.param)
+                      .execs({arg.src, arg.filter, arg.bias, {}, {}});
+           }
+        }
+    };
+
+#if MEGDNN_AARCH64
+    const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8";
+#else
+    const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8";
+#endif
+    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
+            ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str()));
+
+    std::vector<TestArg> quantized_args = get_int8_nchw44_args (3,4);
+    UniformIntRNG int_rng{-50, 50};
+    checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
+    run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
+        dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
+        dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
+}
+
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) {
+    using namespace conv_bias;
+
+    Checker<ConvBiasForward> checker(handle());
+    auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
+                          const std::vector<size_t>& out_size, DType A_dtype,
+                          DType B_dtype, DType C_dtype, DType D_dtype,
+                          param::MatrixMul::Format format, float eps) {
+        for (auto&& arg : args) {
+            for (uint32_t m : out_size) {
+              checker.set_extra_opr_impl(std::bind(
+                      winograd_algo_extra_impl, std::placeholders::_1, m,
+                      arg.param, handle, format));
+              checker.set_dtype(0, A_dtype)
+                      .set_dtype(1, B_dtype)
+                      .set_dtype(2, C_dtype)
+                      .set_dtype(4, D_dtype)
+                      .set_epsilon(eps)
+                      .set_param(arg.param)
+                      .execs({arg.src, arg.filter, arg.bias, {}, {}});
+           }
+        }
+    };
+
+#if MEGDNN_AARCH64
+    const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8";
+#else
+    const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8";
+#endif
+    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
+            ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str()));
+
+    std::vector<TestArg> quantized_args =
+            get_int8_nchw44_args(3, 4, false, true);
+    UniformIntRNG int_rng{-50, 50};
+    checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
+    run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f),
+        dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),
+        dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3);
+}
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) {
+    using namespace conv_bias;
+
+    Checker<ConvBiasForward> checker(handle());
+    auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
+                          const std::vector<size_t>& out_size, DType A_dtype,
+                          DType B_dtype, DType C_dtype, DType D_dtype,
+                          param::MatrixMul::Format format, float eps) {
+        for (auto&& arg : args) {
+            for (uint32_t m : out_size) {
+                checker.set_extra_opr_impl(std::bind(
+                        winograd_algo_extra_impl, std::placeholders::_1, m,
+                        arg.param, handle, format));
+                checker.set_dtype(0, A_dtype)
+                        .set_dtype(1, B_dtype)
+                        .set_dtype(2, C_dtype)
+                        .set_dtype(4, D_dtype)
+                        .set_epsilon(eps)
+                        .set_param(arg.param)
+                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
+            }
+        }
+    };
+
+    float epsilon = 0.001;
+#if MEGDNN_AARCH64
+    const char* matmul_name = "AARCH64_F32_MK4_4x16";
+#else
+    const char* matmul_name = "ARMV7_F32_MK4_4x8"; 
+#endif
+    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
+            ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str()));
+    std::vector<TestArg> quantized_args =
+            get_int8_nchw44_args(3, 4, true);
+    UniformIntRNG int_rng{-50, 50};
+    checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
+    run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
+        dtype::QuantizedS8(0.01887994f),
+        dtype::QuantizedS32(0.41113496f * 0.01887994f),
+        dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon);
+}
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) {
+    using namespace conv_bias;
+
+    Checker<ConvBiasForward> checker(handle());
+    auto run = [&checker](Handle* handle, const std::vector<TestArg>& args,
+                          const std::vector<size_t>& out_size, DType A_dtype,
+                          DType B_dtype, DType C_dtype, DType D_dtype,
+                          param::MatrixMul::Format format, float eps) {
+        for (auto&& arg : args) {
+            for (uint32_t m : out_size) {
+                checker.set_extra_opr_impl(std::bind(
+                        winograd_algo_extra_impl, std::placeholders::_1, m,
+                        arg.param, handle, format));
+                checker.set_dtype(0, A_dtype)
+                        .set_dtype(1, B_dtype)
+                        .set_dtype(2, C_dtype)
+                        .set_dtype(4, D_dtype)
+                        .set_epsilon(eps)
+                        .set_param(arg.param)
+                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
+            }
+        }
+    };
+
+    float epsilon = 0.001;
+#if MEGDNN_AARCH64
+    const char* matmul_name = "AARCH64_F32_MK4_4x16";
+#else
+    const char* matmul_name = "ARMV7_F32_MK4_4x8"; 
+#endif
+    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
+            ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str()));
+    std::vector<TestArg> quantized_args =
+            get_int8_nchw44_args(3, 4, true, true);
+    UniformIntRNG int_rng{-50, 50};
+    checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng);
+    run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f),
+        dtype::QuantizedS8(0.01887994f),
+        dtype::QuantizedS32(0.41113496f * 0.01887994f),
+        dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon);
+}
+
+
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) {
     using namespace conv_bias;
diff --git a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
index e8accc27b0c28c6ebff386744eb95c020c33ca8a..945f6cc9370c0c4981022ffa6ea68c7fadc3dc7d 100644
--- a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
@@ -1185,6 +1185,197 @@ TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) {
                    {1, {4}}, data_type);
 }
 
+TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.sparse = param::ConvBias::Sparse::GROUP;
+
+    std::vector<std::pair<SmallVector<TensorShape>, float>>
+            shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
+                          size_t FS, size_t group) {
+        SmallVector<TensorShape> shapes{{N, IC, H, W},
+                                        {group, OC / group, IC / group, FS, FS},
+                                        {1, OC, 1, 1},
+                                        {},
+                                        {N, OC, H, W}};
+        TensorShape dst{N, OC, H, W};
+        float computations =
+                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                 dst.total_nr_elems()) *
+                1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 4);
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 4);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 4);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 4);
+
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 128, 14, 14, 3, 1);
+    bench_case(1, 512, 64, 14, 14, 3, 1);
+
+    bench_case(1, 512, 512, 7, 7, 3, 1);
+    bench_case(1, 512, 256, 7, 7, 3, 1);
+    bench_case(1, 512, 128, 7, 7, 3, 1);
+    bench_case(1, 512, 64, 7, 7, 3, 1);
+
+    std::string algo_name;
+#if MEGDNN_AARCH64
+    algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
+#else
+    algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
+#endif
+
+
+    std::vector<DType> data_type = {dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
+                                   dtype::QuantizedS32(6.25f) ,dtype::QuantizedS8(60.25f) };
+    printf("Benchmark WINOGRAD_IN8_MK8 algo\n");
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
+                   {1, {4}}, data_type);
+}
+
+TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
+       BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.sparse = param::ConvBias::Sparse::DENSE;
+    param.format = param::ConvBias::Format::NCHW44;
+
+    std::vector<std::pair<SmallVector<TensorShape>, float>>
+            shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
+                          size_t FS, size_t group) {
+        SmallVector<TensorShape> shapes{{N, IC / 4, H, W, 4},
+                                        {OC / 4, IC / 4, FS, FS, 4, 4},
+                                        {1, OC / 4, 1, 1, 4},
+                                        {},
+                                        {N, OC / 4, H, W, 4}};
+        TensorShape dst{N, OC, H, W};
+        float computations =
+                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                 dst.total_nr_elems()) *
+                1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 128, 14, 14, 3, 1);
+    bench_case(1, 512, 64, 14, 14, 3, 1);
+
+    bench_case(1, 512, 512, 7, 7, 3, 1);
+    bench_case(1, 512, 256, 7, 7, 3, 1);
+    bench_case(1, 512, 128, 7, 7, 3, 1);
+    bench_case(1, 512, 64, 7, 7, 3, 1);
+
+    std::string algo_name;
+#if MEGDNN_AARCH64
+    algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32";
+#else
+    algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32";
+#endif
+
+    std::vector<DType> data_type = {
+            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
+            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
+    printf("Benchmark WINOGRAD_INT8_MK8 algo\n");
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
+                   {1, {4}}, data_type);
+}
+
+TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS,
+       BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.sparse = param::ConvBias::Sparse::DENSE;  // GROUP;
+    param.format = param::ConvBias::Format::NCHW44;
+
+    std::vector<std::pair<SmallVector<TensorShape>, float>>
+            shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
+                          size_t FS, size_t group) {
+        SmallVector<TensorShape> shapes{{N, IC / 4, H, W, 4},
+                                        {OC / 4, IC / 4, FS, FS, 4, 4},
+                                        {1, OC / 4, 1, 1, 4},
+                                        {},
+                                        {N, OC / 4, H, W, 4}};
+        TensorShape dst{N, OC, H, W};
+        float computations =
+                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                 dst.total_nr_elems()) *
+                1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 128, 14, 14, 3, 1);
+    bench_case(1, 512, 64, 14, 14, 3, 1);
+
+    bench_case(1, 512, 512, 7, 7, 3, 1);
+    bench_case(1, 512, 256, 7, 7, 3, 1);
+    bench_case(1, 512, 128, 7, 7, 3, 1);
+    bench_case(1, 512, 64, 7, 7, 3, 1);
+
+    std::string algo_name;
+#if MEGDNN_AARCH64
+    algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32";
+#else
+    algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32";
+#endif
+
+    std::vector<DType> data_type = {
+            dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
+            dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)};
+    printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n");
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
+                   {4, {4, 5, 6, 7}}, {1, {7}}, data_type);
+    benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
+                   {1, {4}}, data_type);
+}
+
 TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) {
     constexpr size_t RUNS = 50;
 
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 22fdf90af02d09215a2c8226d14273df4e982a4d..8bd3f5b01dfe1fcb4be273738f23d122bb464b56 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -9,6 +9,7 @@
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 #include "test/common/conv_bias.h"
+#include "megdnn/opr_param_defs.h"
 #include "src/common/utils.h"
 #include "test/common/benchmarker.h"
 namespace megdnn {
@@ -242,7 +243,8 @@ std::vector<TestArg> get_winograd_mk_packed_args(size_t pack_size) {
     return args;
 }
 
-std::vector<TestArg> get_quantized_winograd_mk_packed_args(size_t pack_size) {
+std::vector<TestArg> get_quantized_winograd_mk_packed_args(
+        size_t pack_size, bool compute_float32) {
     std::vector<TestArg> args;
 
     param::ConvBias cur_param;
@@ -260,13 +262,16 @@ std::vector<TestArg> get_quantized_winograd_mk_packed_args(size_t pack_size) {
         cur_param.sparse = param::ConvBias::Sparse::DENSE;
         cur_param.pad_h = cur_param.pad_w = 1;
 
+        if(compute_float32){
+            cur_param.compute_mode = param::ConvBias::ComputeMode::FLOAT32;
+        }
+
         args.emplace_back(cur_param, TensorShape{1, pack_size, 3, 3},
                           TensorShape{pack_size, pack_size, 3, 3},
                           TensorShape{1, pack_size, 1, 1});
         //! no bias
         args.emplace_back(cur_param, TensorShape{2, ic, i, i},
                           TensorShape{oc, ic, 3, 3}, TensorShape{});
-
         //! bias
         args.emplace_back(cur_param, TensorShape{2, ic, i, i},
                           TensorShape{oc, ic, 3, 3}, TensorShape{2, oc, i, i});
@@ -372,7 +377,7 @@ std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size) {
     for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) {
     for (size_t b : {64, 16}) {
     for (size_t ic : {16, 32}) {
-    for (size_t oc : {64, 32}) {
+    for (size_t oc : {16, 32}) {
     for (size_t h : {8}) {
     for (size_t w : {8, 11}) {
     for (int p : {0, static_cast<int>(kernel_size / 2)}) {
@@ -399,6 +404,95 @@ std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size) {
     return args;
 }
 
+std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
+                                          bool compute_float32,
+                                          bool group_mode) {
+    std::vector<TestArg> args;
+    param::ConvBias cur_param;
+    megdnn_assert(pack_size > 0, "not support pack_size");
+    megdnn_assert(kernel_size > 0, "not support kernel_size");
+    using NLMode = param::ConvBias::NonlineMode;
+
+    //// clang-format off
+    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU}) {
+    for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) {
+    for (size_t b : {1,2}) {
+    for (size_t ic : {8,16}) {
+    for (size_t oc : {8,16}) {
+    for (size_t h : {9,23}) {
+    for (size_t w : {9,23}) {
+    for (int p : {0, static_cast<int>(kernel_size / 2)}) {
+    for (size_t s : {1}) {
+        if (kernel_size == 7) {
+            b = std::min(b, 32_z);
+        }
+        size_t f = kernel_size;
+        cur_param.mode = mode;
+        cur_param.nonlineMode = nlmode;
+        if (pack_size == 4){
+            cur_param.format = param::ConvBias::Format::NCHW44;
+        } else if(pack_size == 8){
+            cur_param.format = param::ConvBias::Format::NCHW88;
+        }
+
+        if(compute_float32){
+            cur_param.compute_mode =
+                    param::ConvBias::ComputeMode::FLOAT32;
+        }
+
+        cur_param.sparse = param::ConvBias::Sparse::DENSE;
+        cur_param.pad_h = cur_param.pad_w = p;
+        cur_param.stride_h = cur_param.stride_w = s;
+        if (!group_mode) {
+            //! no bias
+            args.emplace_back(cur_param,
+                              TensorShape{b, ic / pack_size, h, w, pack_size},
+                              TensorShape{oc / pack_size, ic / pack_size, f, f,
+                                          pack_size, pack_size},
+                              TensorShape{});
+
+            //! bias channel
+            args.emplace_back(cur_param,
+                              TensorShape{b, ic / pack_size, h, w, pack_size},
+                              TensorShape{oc / pack_size, ic / pack_size, f, f,
+                                          pack_size, pack_size},
+                              TensorShape{1, oc / pack_size, 1, 1, pack_size});
+            //! bias
+            args.emplace_back(
+                    cur_param, TensorShape{b, ic / pack_size, h, w, pack_size},
+                    TensorShape{oc / pack_size, ic / pack_size, f, f, pack_size,
+                                pack_size},
+                    TensorShape{b, oc / pack_size, (h - f + 2 * p) / s + 1,
+                                (w - f + 2 * p) / s + 1, pack_size});
+        } else {
+            cur_param.sparse = param::ConvBias::Sparse::GROUP;
+            args.emplace_back(
+                    cur_param,
+                    TensorShape{2, 2 * ic / pack_size, h, w, pack_size},
+                    TensorShape{2, oc / pack_size, ic / pack_size, 3, 3,
+                                pack_size, pack_size},
+                    TensorShape{2, 2 * oc / pack_size, (h - f + 2 * p) / s + 1,
+                                (w - f + 2 * p) / s + 1, pack_size});
+
+            args.emplace_back(
+                    cur_param,
+                    TensorShape{2, 2 * ic / pack_size, h, w, pack_size},
+                    TensorShape{2, oc / pack_size, ic / pack_size, f, f,
+                                pack_size, pack_size},
+                    TensorShape{1, 2 * oc / pack_size, 1, 1, pack_size});
+            args.emplace_back(
+                    cur_param,
+                    TensorShape{2, 2 * ic / pack_size, h, w, pack_size},
+                    TensorShape{2, oc / pack_size, ic / pack_size, f, f,
+                                pack_size, pack_size},
+                    TensorShape{});
+        }
+    } } } } } } } } }
+    // clang-format on
+
+    return args;
+}
+
 std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) {
     std::vector<TestArg> args;
     param::ConvBias cur_param;
@@ -990,11 +1084,14 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args,
 void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
                               param::ConvBias param, Handle* handle,
                               param::MatrixMul::Format format) {
-    megdnn_assert(param.format == param::ConvBias::Format::NCHW);
+    megdnn_assert(param.format == param::ConvBias::Format::NCHW ||
+                  param.format == param::ConvBias::Format::NCHW44);
     auto winograd_preprocess_opr =
             handle->create_operator<WinogradFilterPreprocess>();
     winograd_preprocess_opr->param().output_block_size = m;
     winograd_preprocess_opr->param().format = format;
+    winograd_preprocess_opr->param().compute_mode =
+            param.compute_mode;
     TensorLayout filter_transform_layout;
     winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                            filter_transform_layout);
@@ -1004,7 +1101,12 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
 
     auto conv_bias_opr = handle->create_operator<ConvBias>();
     conv_bias_opr->param() = param;
-    conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD;
+    if (param.format == param::ConvBias::Format::NCHW) {
+        conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD;
+    } else {
+        conv_bias_opr->param().format =
+                param::ConvBias::Format::NCHW44_WINOGRAD;
+    }
     conv_bias_opr->param().output_block_size = m;
     size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes(
             tensors[0].layout, filter_transform_layout, tensors[2].layout,
@@ -1021,7 +1123,6 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
                                   wb.get_workspace(2));
     conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                         tensors[3], tensors[4], nullptr, wb.get_workspace(1));
-
     free(wb.ptr());
 };
 
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index 9ef78d6dd14a165d6c0dd3c3df61723e41544fdb..e52815b5494015b8943348e916f866aec87f4e9d 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -36,7 +36,7 @@ std::vector<TestArg> get_chanwise_args();
 std::vector<TestArg> get_winograd_args(size_t kernel_size);
 std::vector<TestArg> get_winograd_mk_packed_args(size_t pack_size = 4);
 std::vector<TestArg> get_quantized_winograd_mk_packed_args(
-        size_t pack_size = 4);
+        size_t pack_size = 4, bool compute_float32 = false);
 std::vector<TestArg> get_quantized_args_with_nlmode(
         param::ConvBias::NonlineMode nlmode);
 std::vector<TestArg> get_quantized_args();
@@ -55,6 +55,10 @@ std::vector<TestArg> get_int8_chwn4_args_small_batch(size_t kernel_size);
 std::vector<TestArg> get_int8_nchw4_tensorcore_args(size_t kernel_size);
 std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size);
 
+std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
+                                          bool compute_float32 = false,
+                                          bool group_mode = false);
+
 template <typename Opr>
 using ConvBiasAlgoChecker = AlgoChecker<Opr>;
 
diff --git a/src/gopt/impl/weights_preprocess.cpp b/src/gopt/impl/weights_preprocess.cpp
index 3f8c84f3f379f6f94fea6003061934918f6e231d..31606613dcfb6b6ef5980fbea4ff0d03e9fb8235 100644
--- a/src/gopt/impl/weights_preprocess.cpp
+++ b/src/gopt/impl/weights_preprocess.cpp
@@ -102,12 +102,28 @@ void WinogradTransformReplacePass::apply(OptState& opt) const {
                     opr::ConvBiasForward::get_matmul_format(winograd_param);
             winograd_preprocess_param.output_block_size =
                     winograd_param.output_block_size;
+
+            size_t pack_c_size = 1;
+            if (new_inp[0]->shape().ndim == 5) {
+                pack_c_size = new_inp[0]->layout().shape[4];
+            }
+
+            if (conv_bias_opr.input(0)->dtype().enumv() ==
+                        DTypeEnum::QuantizedS8 &&
+                pack_c_size == 4 &&
+                winograd_preprocess_param.format ==
+                        megdnn::param::MatrixMul::Format::MK4) {
+                winograd_preprocess_param.compute_mode =
+                        megdnn::param::ConvBias::ComputeMode::FLOAT32;
+            }
+
             auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make(
                     new_inp[1], winograd_preprocess_param);
             mgb_assert(inputs.size() == 2 || inputs.size() == 3,
                        "input size need to be 2/3, but got: %zu",
                        inputs.size());
             SymbolVar new_conv_bias_opr;
+
             auto conv_bias_param = conv_bias_opr.param();
             if (new_inp[0]->shape().ndim == 4) {
                 conv_bias_param.format =
@@ -126,6 +142,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const {
                                algo_name.c_str());
                 }
             }
+
             conv_bias_param.output_block_size =
                     winograd_param.output_block_size;
             if (inputs.size() == 2) {
diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp
index bf65ac8ee2291dd9f1ad293900a76ff3c6bcd12f..9f72c3f71fbd3cc29abe1dc2064b6926bd270301 100644
--- a/src/opr/impl/dnn/convolution.cpp
+++ b/src/opr/impl/dnn/convolution.cpp
@@ -1541,8 +1541,9 @@ void ConvBiasForward::check_winograd_param_valid(
                     dtype.enumv() == DTypeEnum::QuantizedS8 ||
                     dtype.enumv() == DTypeEnum::Quantized8Asymm) &&
                            (param.channel_block_size == 1 ||
+                            param.channel_block_size == 4 ||
                             param.channel_block_size == 8),
-                   "only support 1/8 for the channel_block_size of "
+                   "only support 1/4/8 for the channel_block_size of "
                    "winograd param, got %u",
                    param.channel_block_size);
     }