Commit 554ce352 · MegEngine 天元 / MegEngine

feat(mgb/gopt): add nchw44 optpass

GitOrigin-RevId: dc38724558b0c6635ea9a3137e1c0d0acc665a0f

Authored on Apr 17, 2020 by Megvii Engine Team; committed by Xinran Xu on May 06, 2020.
Parent: 7d1e1f9a

Showing 8 changed files with 364 additions and 20 deletions (+364 -20):
python_module/megengine/_internal/__init__.py   +6   -2
python_module/src/swig/misc.i                   +1   -0
sdk/load-and-run/dump_with_testcase_mge.py      +7   -0
src/gopt/impl/framework.cpp                     +3   -0
src/gopt/impl/tensor_reformat.cpp               +248 -15
src/gopt/include/megbrain/gopt/inference.h      +7   -2
src/gopt/test/inference.cpp                     +81  -0
src/plugin/impl/opr_footprint.cpp               +11  -1
python_module/megengine/_internal/__init__.py

@@ -541,7 +541,8 @@ def optimize_for_inference(
     fuse_conv_bias_nonlinearity=False,
     use_tensor_core=False,
     fuse_conv_bias_with_z=False,
-    use_nchw88=False
+    use_nchw88=False,
+    use_nchw44=False
 ):
     """optimize computing graph for inference
@@ -559,7 +560,9 @@ def optimize_for_inference(
         OpenCL devices
     :param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
         into one opr. This is supported only in NHWCD4 format.
-    :param use_nchw88: whether to use NCHW4 tensor format. This maybe faster some
+    :param use_nchw88: whether to use NCHW88 tensor format. This maybe faster some
         times.
+    :param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some
+        times.
@@ -577,6 +580,7 @@ def optimize_for_inference(
         "use_tensor_core",
         "fuse_conv_bias_with_z",
         "use_nchw88",
+        "use_nchw44",
     ]:
         if settings[i]:
             getattr(opt, "enable_{}".format(i))()
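The settings loop above maps every keyword that is set to a truthy value onto the matching enable_* setter of the underlying options object, so the new flag rides on the existing machinery. A minimal usage sketch follows (the graph and its `outputs` variable are illustrative and not part of this diff):

    # Hypothetical sketch: request the NCHW44 layout when optimizing an
    # already-built MegEngine graph for inference. `outputs` stands for the
    # graph's output SymbolVars.
    from megengine._internal import optimize_for_inference

    optimized_outputs = optimize_for_inference(
        outputs,
        use_nchw44=True,  # keyword added by this commit
    )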
python_module/src/swig/misc.i

@@ -79,6 +79,7 @@ struct _OptimizeForInferenceOptions {
     SET(use_tensor_core);
     SET(fuse_conv_bias_with_z);
     SET(use_nchw88);
+    SET(use_nchw44);
 #undef SET
 };
sdk/load-and-run/dump_with_testcase_mge.py

@@ -253,6 +253,7 @@ def optimize_for_inference(args, outputs):
         'enable_ioc16': 'f16_io_comp',
         'enable_hwcd4': 'use_nhwcd4',
         'enable_nchw88': 'use_nchw88',
+        'enable_nchw44': 'use_nchw44',
         'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity',
         'enable_tensorcore': 'use_tensor_core',
         'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z',
@@ -385,6 +386,12 @@ def main():
         help='transform the model format from NCHW to NCHW88 '
         'for inference')
+    parser.add_argument(
+        '--enable-nchw44',
+        action='store_true',
+        help='transform the model format from NCHW to NCHW44 '
+        'for inference')
     parser.add_argument(
         '--enable-tensorcore',
         action='store_true',
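The script maps each --enable-* switch onto the corresponding optimize_for_inference keyword through a name dictionary, as shown in the first hunk above. A self-contained sketch of that pattern, reproducing only the two layout-related entries from the diff:

    # Illustrative sketch of the flag-to-option mapping used in
    # dump_with_testcase_mge.py; only the layout-related entries are shown.
    import argparse

    FLAG_TO_OPTION = {
        'enable_nchw88': 'use_nchw88',
        'enable_nchw44': 'use_nchw44',  # entry added by this commit
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('--enable-nchw88', action='store_true',
                        help='transform the model format from NCHW to NCHW88 '
                             'for inference')
    parser.add_argument('--enable-nchw44', action='store_true',
                        help='transform the model format from NCHW to NCHW44 '
                             'for inference')

    args = parser.parse_args(['--enable-nchw44'])
    kwargs = {option: True
              for flag, option in FLAG_TO_OPTION.items() if getattr(args, flag)}
    print(kwargs)  # -> {'use_nchw44': True}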
src/gopt/impl/framework.cpp

@@ -700,6 +700,9 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
     if (inference_opt->use_nchw88) {
         add_pass(EnableNchwxxPass::make_nchwxx_converter(8));
     }
+    if (inference_opt->use_nchw44) {
+        add_pass(EnableNchwxxPass::make_nchwxx_converter(4));
+    }
     if (inference_opt->use_tensor_core) {
         mgb_assert(inference_opt->fuse_conv_bias_nonlinearity,
                    "enable tensor core should fuse conv bias activation "
src/gopt/impl/tensor_reformat.cpp

@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #include "megbrain/gopt/inference.h"
@@ -63,7 +64,10 @@ public:
         NCHW4_TO_CHWN4,    //!< from nchw4 layout to chwn4 layout
         CHWN4_TO_NCHW4,    //!< from chwn4 layout to nchw4 layout
         NCHW_TO_NCHW88,    //!< from nchw layout to nchw88 layout
+        NCHW_TO_NCHW44,    //!< from nchw layout to nchw44 layout
         NCHW88_TO_NCHW,    //!< from nchw88 layout to nchw layout
+        NCHW44_TO_NCHW,    //!< from nchw44 layout to nchw layout
         WEIGHT_NCHW_TO_NCHW88_DENSE,  //!< weight from nchw layout to nchw88
                                       //!< layout
         WEIGHT_NCHW_TO_NCHW88_GROUP,  //!< group weight from nchw layout to
@@ -73,6 +77,16 @@ public:
         //!< the weight layout of input is nchw output is nchw88, special for
         //!< shape weight in nchw like {64, 2, 3, 3} to {8, 3, 3, 2, 8}
         WEIGHT_HYBIRD_NCHW_NCHW88,
+        WEIGHT_NCHW_TO_NCHW44_DENSE,  //!< weight from nchw layout to nchw44
+                                      //!< layout
+        WEIGHT_NCHW_TO_NCHW44_GROUP,  //!< group weight from nchw layout to
+                                      //!< nchw44 layout
+        WEIGHT_NCHW_TO_NCHW44_CHAN,   //!< channel wise weight from nchw layout
+                                      //!< to nchw44 layout
+        //!< the weight layout of input is nchw output is nchw44, special for
+        //!< shape weight in nchw like {64, 2, 3, 3} to {16, 3, 3, 2, 4}
+        WEIGHT_HYBIRD_NCHW_NCHW44,
     };

     RelayoutPlaceholder(VarNode* src_var, LayoutType layout_type);
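The new enum entries only name the source and destination layouts; the actual reshapes are added further down in translate_pass. Both the plain NCHW to NCHW44 relayout and the hybrid weight shape quoted in the comment can be reproduced with NumPy (an illustrative sketch, not MegEngine code; the array names are arbitrary):

    import numpy as np

    # NCHW -> NCHW44: split the channel dim into (C//4, 4) and move the 4-pack last.
    x = np.random.rand(2, 8, 5, 5)                   # NCHW with C divisible by 4
    n, c, h, w = x.shape
    x_nchw44 = x.reshape(n, c // 4, 4, h, w).transpose(0, 1, 3, 4, 2)
    assert x_nchw44.shape == (2, 2, 5, 5, 4)

    # WEIGHT_HYBIRD_NCHW_NCHW44: {64, 2, 3, 3} -> {16, 3, 3, 2, 4} as in the comment.
    wgt = np.random.rand(64, 2, 3, 3)                # OIHW with O divisible by 4
    o, i, fh, fw = wgt.shape
    wgt_hybrid = wgt.reshape(o // 4, 4, i, fh, fw).transpose(0, 3, 4, 2, 1)
    assert wgt_hybrid.shape == (16, 3, 3, 2, 4)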
@@ -203,10 +217,8 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
             dst[3] = inp_shape[3];
             dst[4] = inp_shape[4];
             dst[5] = 8;
-        } else {
-            mgb_assert(layout_type() ==
-                       RelayoutPlaceholder::LayoutType::
-                               WEIGHT_HYBIRD_NCHW_NCHW88);
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::WEIGHT_HYBIRD_NCHW_NCHW88) {
             mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 8 == 0);
             dst.ndim = 5;
             dst[0] = inp_shape[0] / 8;
@@ -214,6 +226,68 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
             dst[2] = inp_shape[3];
             dst[3] = inp_shape[1];
             dst[4] = 8;
-        }
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW44) {
+            mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 4 == 0);
+            dst.ndim = 5;
+            dst[0] = inp_shape[0];
+            dst[1] = inp_shape[1] / 4;
+            dst[2] = inp_shape[2];
+            dst[3] = inp_shape[3];
+            dst[4] = 4;
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::NCHW44_TO_NCHW) {
+            mgb_assert(inp_shape.ndim == 5 && inp_shape[4] == 4);
+            dst.ndim = 4;
+            dst[0] = inp_shape[0];
+            dst[1] = inp_shape[1] * 4;
+            dst[2] = inp_shape[2];
+            dst[3] = inp_shape[3];
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::WEIGHT_NCHW_TO_NCHW44_DENSE) {
+            mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 4 == 0 &&
+                       inp_shape[1] % 4 == 0);
+            dst.ndim = 6;
+            dst[0] = inp_shape[0] / 4;
+            dst[1] = inp_shape[1] / 4;
+            dst[2] = inp_shape[2];
+            dst[3] = inp_shape[3];
+            dst[4] = 4;
+            dst[5] = 4;
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::WEIGHT_NCHW_TO_NCHW44_GROUP) {
+            mgb_assert(inp_shape.ndim == 5 && inp_shape[1] % 4 == 0 &&
+                       inp_shape[2] % 4 == 0);
+            dst.ndim = 7;
+            dst[0] = inp_shape[0];
+            dst[1] = inp_shape[1] / 4;
+            dst[2] = inp_shape[2] / 4;
+            dst[3] = inp_shape[3];
+            dst[4] = inp_shape[4];
+            dst[5] = 4;
+            dst[6] = 4;
+        } else if (layout_type() ==
+                   RelayoutPlaceholder::LayoutType::WEIGHT_NCHW_TO_NCHW44_CHAN) {
+            mgb_assert(inp_shape.ndim == 5 && inp_shape[1] == 1 &&
+                       inp_shape[2] == 1 && inp_shape[0] % 4 == 0);
+            dst.ndim = 6;
+            dst[0] = inp_shape[0] / 4;
+            dst[1] = inp_shape[1];
+            dst[2] = inp_shape[2];
+            dst[3] = inp_shape[3];
+            dst[4] = inp_shape[4];
+            dst[5] = 4;
+        } else {
+            mgb_assert(layout_type() ==
+                       RelayoutPlaceholder::LayoutType::WEIGHT_HYBIRD_NCHW_NCHW44);
+            mgb_assert(inp_shape.ndim == 4 && inp_shape[0] % 4 == 0);
+            dst.ndim = 5;
+            dst[0] = inp_shape[0] / 4;
+            dst[1] = inp_shape[2];
+            dst[2] = inp_shape[3];
+            dst[3] = inp_shape[1];
+            dst[4] = 4;
+        }
         return true;
     };
@@ -418,6 +492,104 @@ void TensorReformatPass::translate_pass(OptState& opt) const {
         auto y2 = opr::Reshape::make(y1, tshp1);
         return y2.node();
     };
+    reformat[LayoutType::NCHW_TO_NCHW44] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make(
+                     {sub(0), sub(1) / 4, cv(4), sub(2), sub(3)}, 0),
+             tshp1 = opr::Concat::make(
+                     {sub(0), sub(1) / 4, sub(2), sub(3), cv(4)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp0);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2});
+        auto y2 = opr::Reshape::make(y1, tshp1);
+        return y2.node();
+    };
+    reformat[LayoutType::NCHW44_TO_NCHW] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
+        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
+        auto y1 = opr::Reshape::make(y0, tshp0);
+        return y1.node();
+    };
+    reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_DENSE] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make(
+                     {sub(0) / 4, cv(4), sub(1) / 4, cv(4), sub(2), sub(3)}, 0),
+             tshp1 = opr::Concat::make(
+                     {sub(0) / 4, sub(1) / 4, sub(2), sub(3), cv(4), cv(4)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp0);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 2, 4, 5, 3, 1});
+        auto y2 = opr::Reshape::make(y1, tshp1);
+        return y2.node();
+    };
+    reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_GROUP] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make(
+                     {sub(0), sub(1) / 4, cv(4), sub(2) / 4, cv(4), sub(3), sub(4)}, 0),
+             tshp1 = opr::Concat::make(
+                     {sub(0), sub(1) / 4, sub(2) / 4, sub(3), sub(4), cv(4), cv(4)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp0);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 5, 6, 4, 2});
+        auto y2 = opr::Reshape::make(y1, tshp1);
+        return y2.node();
+    };
+    reformat[LayoutType::WEIGHT_NCHW_TO_NCHW44_CHAN] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make(
+                     {sub(0) / 4, cv(4), sub(1), sub(2), sub(3), sub(4)}, 0),
+             tshp1 = opr::Concat::make(
+                     {sub(0) / 4, sub(1), sub(2), sub(3), sub(4), cv(4)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp0);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 2, 3, 4, 5, 1});
+        auto y2 = opr::Reshape::make(y1, tshp1);
+        return y2.node();
+    };
+    reformat[LayoutType::WEIGHT_HYBIRD_NCHW_NCHW44] = [](VarNode* inp) -> VarNode* {
+        auto x = SymbolVar(inp);
+        auto xshp = opr::GetVarShape::make(x);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        auto sub = [&xshp, &cv](int idx) {
+            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
+        };
+        auto tshp0 = opr::Concat::make(
+                     {sub(0) / 4, cv(4), sub(1), sub(2), sub(3)}, 0),
+             tshp1 = opr::Concat::make(
+                     {sub(0) / 4, sub(2), sub(3), sub(1), cv(4)}, 0);
+        auto y0 = opr::Reshape::make(x, tshp0);
+        auto y1 = opr::Dimshuffle::make(y0, {0, 3, 4, 2, 1});
+        auto y2 = opr::Reshape::make(y1, tshp1);
+        return y2.node();
+    };

     auto rewriter = opt.graph().make_rewriter();
     auto on_opr = [&reformat, &rewriter](OperatorNodeBase* opr) {
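Each lambda above is the same Reshape -> Dimshuffle -> Reshape chain built from symbolic shapes. In the dense weight case, 4 output and 4 input channels end up packed into the two trailing dimensions; a NumPy analogue of WEIGHT_NCHW_TO_NCHW44_DENSE (illustrative only, not MegEngine code) is:

    import numpy as np

    # OIHW -> {O/4, I/4, H, W, 4(i), 4(o)}, mirroring the tshp0 / Dimshuffle /
    # tshp1 chain of the WEIGHT_NCHW_TO_NCHW44_DENSE lambda above.
    wgt = np.random.rand(8, 12, 3, 3)                     # O and I divisible by 4
    o, i, fh, fw = wgt.shape
    packed = (wgt.reshape(o // 4, 4, i // 4, 4, fh, fw)   # tshp0
                 .transpose(0, 2, 4, 5, 3, 1))            # Dimshuffle {0,2,4,5,3,1}
    assert packed.shape == (o // 4, i // 4, fh, fw, 4, 4) # tshp1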
@@ -1071,16 +1243,24 @@ std::unique_ptr<EnableCHWN4Pass> EnableCHWN4Pass::make_chwn4_converter() {
 VarNode* EnableNchwxxPass::on_graph_endpoint_var(VarNode* new_var,
                                                  VarNode* orig_var) const {
     if (!orig_var->shape().eq_shape(new_var->shape())) {
-        return RelayoutPlaceholder::make(
-                       new_var, RelayoutPlaceholder::LayoutType::NCHW88_TO_NCHW)
-                .node();
+        if (m_pack_c_size == 8) {
+            return RelayoutPlaceholder::make(
+                           new_var,
+                           RelayoutPlaceholder::LayoutType::NCHW88_TO_NCHW)
+                    .node();
+        } else if (m_pack_c_size == 4) {
+            return RelayoutPlaceholder::make(
+                           new_var,
+                           RelayoutPlaceholder::LayoutType::NCHW44_TO_NCHW)
+                    .node();
+        }
     }
     return new_var;
 }

 std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
         size_t pack_c_size) {
-    auto ret = std::make_unique<EnableNchwxxPass>();
+    auto ret = std::make_unique<EnableNchwxxPass>(pack_c_size);
     ret->set_var_replace_check_flag(VarReplaceCheckFlag::NOCHECK);
     //! First is whether the conv can trans to nchwxx, second is the filter
     //! trans mode
@@ -1102,8 +1282,18 @@ std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
     megdnn::param::Pooling::Format pooling_format =
             megdnn::param::Pooling::Format::NCHW88;
     std::string convter_pass_name = "conv_format_nchw88";
-    mgb_assert(pack_c_size == static_cast<size_t>(8),
-               "The ConvertFormatPass to nchwxx only support NCHW88 now !");
+    if (pack_c_size == 4) {
+        weight_to_nchwxx_mode_dense = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_DENSE;
+        weight_to_nchwxx_mode_group = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_GROUP;
+        weight_to_nchwxx_mode_chan = RelayoutMode::WEIGHT_NCHW_TO_NCHW44_CHAN;
+        hybrid_nchw_nchwxx = RelayoutMode::WEIGHT_HYBIRD_NCHW_NCHW44;
+        src_to_nchwxx_mode = RelayoutMode::NCHW_TO_NCHW44;
+        src_to_nchw_mode = RelayoutMode::NCHW44_TO_NCHW;
+        conv_bias_format = megdnn::param::ConvBias::Format::NCHW44;
+        conv_format = megdnn::param::ConvolutionV0::Format::NCHW44;
+        pooling_format = megdnn::param::Pooling::Format::NCHW44;
+        convter_pass_name = "conv_format_nchw44";
+    }
     auto test_trans_nchwxx = [pack_c_size, weight_to_nchwxx_mode_dense,
                               weight_to_nchwxx_mode_group,
                               weight_to_nchwxx_mode_chan,
@@ -1297,7 +1487,7 @@ std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
             auto new_param = conv_bias_opr.param();
             new_param.format = conv_bias_format;
             auto new_conv_bias_opr = opr::ConvBias::make(
-                    conv_bias_src, conv_bias_filter, new_param,
+                    conv_bias_src, conv_bias_filter, conv_bias_bias, new_param,
                     conv_bias_opr.execution_policy(), conv_bias_opr.config());
             OperatorNodeBase* new_opr = new_conv_bias_opr.node()->owner_opr();
             mgb_assert(new_conv_bias_opr.shape().ndim == 5,
@@ -1330,6 +1520,51 @@ std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
         }
     };
+    auto replace_concat_opr = [=](OperatorNodeBase* opr,
+                                  const VarNodeArray& new_inp) {
+        mgb_assert(opr->input().size() == new_inp.size());
+        bool has_inp_changed = false;
+        bool can_exec_ncwxx = true;
+        for (size_t i = 0; i < opr->input().size(); i++) {
+            if (new_inp[i]->shape().ndim == 5) {
+                has_inp_changed = true;
+                break;
+            } else if (new_inp[i]->shape().ndim == 4) {
+                if (new_inp[i]->shape()[1] % pack_c_size != 0) {
+                    can_exec_ncwxx = false;
+                }
+            }
+        }
+        if (has_inp_changed) {
+            auto temp_inp = new_inp;
+            if (can_exec_ncwxx) {
+                for (size_t i = 0; i < opr->input().size(); i++) {
+                    if (new_inp[i]->shape().ndim == 4) {
+                        auto new_var = RelayoutPlaceholder::make(
+                                new_inp[i], src_to_nchwxx_mode);
+                        temp_inp[i] = new_var.node();
+                    } else {
+                        mgb_assert((new_inp[i]->shape().ndim == 5) ||
+                                   new_inp[i]->shape().is_scalar());
+                    }
+                }
+            } else {
+                for (size_t i = 0; i < opr->input().size(); i++) {
+                    if (new_inp[i]->shape().ndim == 5) {
+                        auto new_var = RelayoutPlaceholder::make(
+                                new_inp[i], src_to_nchw_mode);
+                        temp_inp[i] = new_var.node();
+                    }
+                }
+            }
+            return serialization::copy_opr_shallow(*opr, temp_inp,
+                                                   opr->config());
+        } else {
+            return serialization::copy_opr_shallow(*opr, new_inp,
+                                                   opr->config());
+        }
+    };
+
     auto replace_elemwise_opr = [=](OperatorNodeBase* opr,
                                     const VarNodeArray& new_inp) {
         mgb_assert(opr->input().size() == new_inp.size());
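In words, replace_concat_opr keeps a Concat in the packed layout only when at least one input has already been converted (ndim == 5) and every remaining 4-dim NCHW input has a channel count divisible by the pack size; otherwise every converted input is relayouted back to NCHW. A small sketch of that decision (illustrative Python, not MegEngine code):

    # Shapes are plain tuples; 5-dim means already packed (NCHW44/NCHW88),
    # 4-dim means still NCHW with the channel count at index 1.
    def concat_stays_packed(input_shapes, pack_c_size=4):
        has_packed_input = any(len(s) == 5 for s in input_shapes)
        all_packable = all(s[1] % pack_c_size == 0
                           for s in input_shapes if len(s) == 4)
        return has_packed_input and all_packable

    assert concat_stays_packed([(2, 2, 8, 8, 4), (2, 8, 8, 8)])      # 8 % 4 == 0
    assert not concat_stays_packed([(2, 2, 8, 8, 4), (2, 6, 8, 8)])  # 6 % 4 != 0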
@@ -1382,6 +1617,7 @@ std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
     replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
     replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
     replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
+    replace_func[opr::Concat::typeinfo()] = replace_concat_opr;
     replace_func[opr::Elemwise::typeinfo()] = replace_elemwise_opr;
     replace_func[opr::TypeCvt::typeinfo()] = replace_elemwise_opr;
     replace_func[opr::ElemwiseMultiType::typeinfo()] = replace_elemwise_opr;
@@ -1390,13 +1626,10 @@ std::unique_ptr<EnableNchwxxPass> EnableNchwxxPass::make_nchwxx_converter(
     replace_func[opr::ConvolutionBackwardData::typeinfo()] =
             relayout_inp_to_nchw;
     replace_func[opr::Subtensor::typeinfo()] = relayout_inp_to_nchw;
-    replace_func[opr::Concat::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::Reshape::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::GetVarShape::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::Dimshuffle::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::Reduce::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::AssertEqual::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::Broadcast::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::IncrSubtensor::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::ResizeForward::typeinfo()] = relayout_inp_to_nchw;
     replace_func[opr::WarpPerspectiveForward::typeinfo()] =
src/gopt/include/megbrain/gopt/inference.h

@@ -234,16 +234,18 @@ namespace gopt {
      */
     class EnableNchwxxPass final : public TensorReformatPass {
         std::string m_name = "tensor_format_nchwxx";
+        size_t m_pack_c_size;
         VarNode* on_graph_endpoint_var(VarNode* new_var,
                                        VarNode* orig_var) const override;
         //! the flag for conv to transform to nchwxx
         enum class TransType {
-            TRANS_PURE_NCHWXX,    //!< weight and src all trans to nchw88
-            TRANS_HYBIRD_NCHWXX,  //!< input is nchw, output is nchw88
+            TRANS_PURE_NCHWXX,    //!< weight and src all trans to nchwxx
+            TRANS_HYBIRD_NCHWXX,  //!< input is nchw, output is nchwxx
             TRANS_NONE,           //!< no need trans
         };

     public:
+        EnableNchwxxPass(size_t pack_c_size) : m_pack_c_size(pack_c_size) {}
         const char* name() const override {
             return mgb_cstr_log(m_name.c_str());
         }
@@ -265,6 +267,8 @@ namespace gopt {
         bool use_nhwcd4 = false;
         //! whether to compute using NCHW88 tensor format
         bool use_nchw88 = false;
+        //! whether to compute using NCHW44 tensor format
+        bool use_nchw44 = false;
         //! whether to enable tensor core
         bool use_tensor_core = false;
         //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b)
@@ -283,6 +287,7 @@ namespace gopt {
             SET(use_tensor_core);
             SET(fuse_conv_bias_with_z);
             SET(use_nchw88);
+            SET(use_nchw44);
 #undef SET
         };
src/gopt/test/inference.cpp

@@ -2325,5 +2325,86 @@ TEST(TestGoptInference, ConvertFormatNCHW88) {
     MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
 }

+TEST(TestGoptInference, ConvertFormatNCHW44) {
+    HostTensorGenerator<> gen;
+    auto cn = CompNode::load("cpu0");
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkvar = [&](const char* name, const TensorShape& shp) {
+        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
+    };
+    auto mkcvar = [&](const char* name, const TensorShape& shp) {
+        return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
+                .rename(name);
+    };
+
+    auto host_x = gen({2, 3, 16, 16}, cn);
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    //!Hybrid nchw88 mode
+    opr::Convolution::Param param_conv;
+    param_conv.pad_h = param_conv.pad_w = 1;
+    auto w1 = mkcvar("w1", {8, 3, 3, 3}),
+         conv1 = opr::Convolution::make(x, w1, param_conv);
+    //!channel wise
+    opr::ConvBias::Param param_conv_bias;
+    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
+    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::GROUP;
+    auto w2 = mkcvar("w2", {8, 1, 1, 3, 3}), b2 = mkcvar("b2", {1, 8, 1, 1}),
+         conv2 = opr::ConvBias::make(conv1, w2, b2, param_conv_bias);
+    //! group
+    auto w3 = mkcvar("w3", {2, 4, 4, 3, 3}), b3 = mkcvar("b3", {1, 8, 1, 1}),
+         conv3 = opr::ConvBias::make(conv2, w3, b3, param_conv_bias);
+
+    auto shape_of = opr::GetVarShape::make(conv3);
+    auto subtensor = opr::Subtensor::make(
+            shape_of, {opr::Subtensor::AxisIndexer::make_interval(
+                              0, x.make_scalar(2), None, x.make_scalar(1))});
+
+    opr::Resize::Param param_resize;
+    param_resize.format = opr::Resize::Param::Format::NCHW;
+    auto resize = opr::ResizeForward::make(conv3, subtensor * 2, param_resize);
+    auto mat = mkcvar("mat", {2, 3, 3}),
+         warp = opr::WarpPerspectiveForward::make(
+                 resize, mat, nullptr, cg::var_from_tensor_shape(x, {4, 4}));
+
+    auto b = mkvar("b", {1, 8, 1, 1}),
+         elem = opr::Elemwise::make({warp + b},
+                                    opr::Elemwise::Param::Mode::RELU);
+    //! Dense
+    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
+    param_conv_bias.pad_h = param_conv_bias.pad_w = 1;
+    auto w4 = mkcvar("w4", {4, 8, 3, 3}), b4 = mkcvar("b4", {1, 4, 1, 1}),
+         conv4 = opr::ConvBias::make(elem, w4, b4, param_conv_bias);
+
+    auto w5 = mkcvar("w5", {6, 4, 3, 3}), b5 = mkcvar("b5", {1, 6, 1, 1}),
+         conv5 = opr::ConvBias::make(conv4, w5, b5, param_conv_bias);
+    auto w6 = mkcvar("w6", {4, 6, 3, 3}), b6 = mkcvar("b6", {1, 4, 1, 1}),
+         y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
+
+    SymbolVar y_opt;
+    unpack_vector(gopt::optimize_for_inference(
+                          {y}, gopt::OptimizeForInferenceOptions{}
+                                       .enable_use_nchw44()),
+                  y_opt);
+
+    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW44,
+              find_opr<opr::ConvBias>(y_opt).param().format);
+
+    graph->compile({{y_opt, {}}})
+            ->to_json()
+            ->writeto_fpath(output_file(
+                    "TestGoptInference.ConvertFormatNCHW44.json"));
+
+    HostTensorND host_y_opt, host_y;
+    auto func = graph->compile({make_callback_copy(y, host_y),
+                                make_callback_copy(y_opt, host_y_opt)});
+    func->execute();
+    //! meybe go to winograd in x86-32, so set error 1e-1
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
+
+    *host_x = *gen({2, 3, 32, 32}, cn);
+    func->execute();
+    //! meybe go to winograd in x86-32, so set error 1e-1
+    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1);
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/plugin/impl/opr_footprint.cpp

@@ -99,7 +99,7 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
         group = filter_shape[0];
     }
     if (param.format == Param::Format::NCHW88) {
-        //! if channel wise weight layout is {group/8, 1, 1, FH, FW, 8}
+        //! if channel wise weight layout is {group/8, FH, FW, 1, 1, 8}
         if (filter_shape[1] == 1 && filter_shape[2] == 1) {
             group *= 8;
         }
@@ -107,6 +107,15 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
                src_shape[1] / group * 2;
         return hybird_nchwx ? computation : computation * 8;
     }
+    if (param.format == Param::Format::NCHW44) {
+        //! if channel wise weight layout is {group/4, FH, FW, 1, 1, 4}
+        if (filter_shape[1] == 1 && filter_shape[2] == 1) {
+            group *= 4;
+        }
+        size_t computation = dst_shape.total_nr_elems() * fh * fw *
+                             src_shape[1] / group * 2;
+        return hybird_nchwx ? computation : computation * 4;
+    }
     if (param.format == Param::Format::NCHW32) {
         return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 /
                group * 2;
@@ -135,6 +144,7 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
     };
     if (param.format == Param::Format::NCHW4 ||
         param.format == Param::Format::NCHW88 ||
+        param.format == Param::Format::NCHW44 ||
         param.format == Param::Format::NCHW32) {
         return eval_conv_computation_nchwx();
     }
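In the packed layout src_shape[1] holds C/4 while dst_shape.total_nr_elems() already includes the trailing 4, so multiplying the non-hybrid result by 4 recovers the usual 2 * N * OC * OH * OW * IC * FH * FW convolution FLOP count. A quick numeric check with illustrative values:

    # Dense NCHW44 convolution: N=1, IC=16, OC=32, OH=OW=32, 3x3 filter, group=1.
    n, ic, oc, oh, ow, fh, fw, group = 1, 16, 32, 32, 32, 3, 3, 1

    dst_total_nr_elems = n * (oc // 4) * oh * ow * 4   # NCHW44 dst {1, 8, 32, 32, 4}
    src_channel_dim = ic // 4                          # src_shape[1] in NCHW44

    computation = dst_total_nr_elems * fh * fw * src_channel_dim // group * 2
    nchw44_flops = computation * 4                     # non-hybrid branch in the diff

    assert nchw44_flops == 2 * n * oc * oh * ow * ic * fh * fw // group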