Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
add3a1bc
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
add3a1bc
编写于
9月 24, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mgb/opr): add weight preprocess option
GitOrigin-RevId: 9d83a174fad2c4d6a1bca86c8597c8b6b4544376
上级
ee2e2b3c
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
62 additions
and
0 deletions
+62
-0
sdk/load-and-run/src/mgblar.cpp
sdk/load-and-run/src/mgblar.cpp
+25
-0
src/core/include/megbrain/graph/cg.h
src/core/include/megbrain/graph/cg.h
+4
-0
src/opr/impl/dnn/convolution.cpp
src/opr/impl/dnn/convolution.cpp
+3
-0
src/opr/test/dnn/convolution.cpp
src/opr/test/dnn/convolution.cpp
+30
-0
未找到文件。
sdk/load-and-run/src/mgblar.cpp
浏览文件 @
add3a1bc
...
...
@@ -194,6 +194,26 @@ R"__usage__(
Execute operators with kernels implemented in MegDNN with CHWN4 tensor format. Can only be used
on Nvidia GPUs, whose compute capability is above 6.1.
)__usage__"
R"__usage__(
--enable-nchw44
Execute operators with kernels implemented in MegDNN with NCHW44 tensor format. This can only
be used on arm of armv7 and arm64, support data tyep of float32, qint8 and int8x8x16.
)__usage__"
R"__usage__(
--enable-nhw88
Execute operators with kernels implemented in MegDNN with NCHW88 tensor format. This can only
be used on x86 with data type float.
)__usage__"
R"__usage__(
--enable-nhw44-dot
Execute operators with kernels implemented in MegDNN with NCHW44-DOT tensor format. This Can
only be used on arm32 and arm64 with dot-product supported, and only support qint8 model
)__usage__"
R"__usage__(
--weight-preprocess
Execute operators with weight preprocess, which can optimize the operator execution time with
algo of winograd, im2col ,etc., but it may consume more memory.
)__usage__"
;
...
...
@@ -1226,6 +1246,11 @@ Args Args::from_argv(int argc, char **argv) {
graph_opt
.
graph_opt
.
weight_winograd_transform
=
true
;
continue
;
}
if
(
!
strcmp
(
argv
[
i
],
"--weight-preprocess"
))
{
mgb_log_warn
(
"enable weight-preprocess optimization"
);
graph_opt
.
graph_opt
.
enable_weight_preprocess
();
continue
;
}
fprintf
(
stderr
,
"invalid arg: %s
\n
"
,
argv
[
i
]);
ret
.
args_parse_ret
=
-
1
;
...
...
src/core/include/megbrain/graph/cg.h
浏览文件 @
add3a1bc
...
...
@@ -97,6 +97,9 @@ struct GraphCommonOptimizeOptions {
bool
fuse_conv_bias_with_z
=
false
;
//! whether to enable fast-run profiled winograd opr replace
bool
weight_winograd_transform
=
false
;
//! whether to enable weight preprocess, if enabled it may use more
//! memory, default disable now
bool
weight_preprocess
=
false
;
enum
LayoutTransform
:
uint32_t
{
DEFAULT
,
NCHW4
,
///< compute using NCHW4 tensor format
...
...
@@ -127,6 +130,7 @@ struct GraphCommonOptimizeOptions {
SET
(
fuse_conv_bias_nonlinearity
);
SET
(
fuse_conv_bias_with_z
);
SET
(
weight_winograd_transform
);
SET
(
weight_preprocess
);
#undef SET
#define SET(_trans, _trans_capital) \
GraphCommonOptimizeOptions& enable_##_trans() { \
...
...
src/opr/impl/dnn/convolution.cpp
浏览文件 @
add3a1bc
...
...
@@ -963,6 +963,9 @@ void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
bool
mixin
::
WeightPreprocessExecutor
::
mixin_allow_weight_preprocess
(
const
cg
::
OperatorNodeBase
&
opr
)
const
{
if
(
!
opr
.
owner_graph
()
->
options
().
graph_opt
.
weight_preprocess
)
{
return
false
;
}
if
(
!
opr
.
input
(
1
)
->
contain_flag
(
VarNode
::
Flag
::
PERSISTENT_DEVICE_VALUE
))
return
false
;
if
(
cg
::
is_const_var_value
(
opr
.
input
(
1
)))
...
...
src/opr/test/dnn/convolution.cpp
浏览文件 @
add3a1bc
...
...
@@ -2225,6 +2225,7 @@ protected:
iw
=
ih
;
comp_node
=
CompNode
::
load
(
"cpux"
);
graph
=
ComputingGraph
::
make
();
graph
->
options
().
graph_opt
.
weight_preprocess
=
is_weight_preprocess
();
TensorShape
x_shape
{
1
,
ic
,
ih
,
iw
},
w_shape
{
oc
,
ic
,
fh
,
fh
};
x_host
=
std
::
make_shared
<
HostTensorND
>
(
comp_node
,
x_shape
);
auto
x
=
opr
::
Host2DeviceCopy
::
make
(
*
graph
,
x_host
);
...
...
@@ -2247,6 +2248,8 @@ protected:
void
run
()
{
func
->
execute
().
wait
();
}
virtual
bool
is_weight_preprocess
()
{
return
true
;
}
void
TearDown
()
override
{
func
.
reset
();
// Triggers mock check
...
...
@@ -2346,6 +2349,33 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
}
}
//! Fixture variant that disables the weight-preprocess optimization.
//! The base fixture (TestWeightPreprocess) reads is_weight_preprocess()
//! when building the graph and assigns it to
//! graph->options().graph_opt.weight_preprocess.
class TestNoWeightPreprocess : public TestWeightPreprocess {
    // Override the base-class hook to turn the option off for this fixture.
    bool is_weight_preprocess() override { return false; }
};
//! Verifies that when graph_opt.weight_preprocess is disabled, the
//! convolution runs through the plain exec() path and none of the
//! preprocess-related MegDNN entry points are ever invoked.
TEST_F(TestNoWeightPreprocess, NoPreprocess) {
    using ::testing::_;
    using ::testing::Return;
    auto& mock = mock_conv();
    MockAlgorithm algo;
    // Stub out algorithm selection and workspace queries so execution can
    // proceed; these are not the behavior under test.
    EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
            .WillRepeatedly(Return(&algo));
    EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
            .WillRepeatedly(Return(0));
    EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
            .WillRepeatedly(Return(0));
    {
        ::testing::InSequence seq;
        // With preprocessing disabled, neither layout deduction nor the
        // preprocess kernel may be called at all.
        EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
        EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
        // exec() must receive a null preprocessed_filter pointer.
        EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
        run();
    }
}
}
// anonymous namespace
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录