diff --git a/sdk/load-and-run/src/mgblar.cpp b/sdk/load-and-run/src/mgblar.cpp
index 7d2b29324f37c8e114089b08eca256b4b3254b68..7488a3bd45be00daa724909033d1ad5d0ca9948d 100644
--- a/sdk/load-and-run/src/mgblar.cpp
+++ b/sdk/load-and-run/src/mgblar.cpp
@@ -83,7 +83,7 @@ R"__usage__(
     hard to profile host time. Use --profile-host to focus on host time
     profiling.
   --input [ filepath | string]
-    Set up inputs for megbrain model. for example: --data image.ppm --data
+    Set up inputs for megbrain model. for example: --data image.ppm --data
     param.json --data bbox:bbox.npy@batchid:b.npy --data
     rect:[0,0,227,227]; batchid:0,1,2,3. --io-dump or --bin-io-dump
     should be enabled at the same time.
@@ -974,7 +974,7 @@ Args Args::from_argv(int argc, char **argv) {
 #endif
         if (!strcmp(argv[i], "--enable-chwn4")) {
             mgb_log_warn("enable chwn4 optimization");
-            graph_opt.graph_opt.enable_chwn4 = true;
+            graph_opt.graph_opt.enable_chwn4();
             continue;
         }
 #if MGB_ENABLE_JSON
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index 97aaaebbbd98698d420e3b240523d77e1e07ba12..53f31e6dc0d2ff2f08138d1f06f08abd5e476a94 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -17,6 +17,7 @@
 #include "megbrain/gopt/inference.h"
 #include "megbrain/gopt/basic_arith.h"
 #include "megbrain/gopt/misc.h"
+#include "megbrain/graph/cg.h"
 #include "megbrain/graph/event.h"
 #include "megbrain/graph/exc_extra_info.h"
 #include "megbrain/graph/helper.h"
@@ -457,14 +458,17 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     }
 #endif
 
-    if (options().graph_opt.enable_chwn4) {
-        options().graph_opt.enable_chwn4 = false;
-        gopt::reformat_to_chwn4_transform_dest_vars_inplace(dest_vars);
-    }
     if (options().graph_opt.winograd_transform) {
         options().graph_opt.winograd_transform = false;
         gopt::transform_vars_inplace_with_winograd(dest_vars);
     }
+    if (options().graph_opt.transform_chwn4()) {
+        gopt::GraphOptimizer optimizer;
+        optimizer.apply_optimize_options(options().graph_opt);
+        options().graph_opt.layout_transform =
+                cg::GraphCommonOptimizeOptions::LayoutTransform::DEFAULT;
+        optimizer.apply_inplace(dest_vars);
+    }
 #if MGB_JIT
     if (std::abs(options().graph_opt_level) == 0 && options().graph_opt.jit) {
diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h
index 5283be523d4a6bc57cb4447717f4320c91180259..d84cdc6c2ddde52e73982d84c48be3d0fbca7e81 100644
--- a/src/core/include/megbrain/graph/cg.h
+++ b/src/core/include/megbrain/graph/cg.h
@@ -81,6 +81,59 @@ public:
     virtual size_t static_alloc_version(ComputingGraph* graph) const;
 };
 
+/**
+ * \brief common optimize options; can be used both for inference
+ * optimization during graph dump and for graph optimization at runtime.
+ */
+struct GraphCommonOptimizeOptions {
+    //! whether to enable IO in float16 compute in float32
+    bool f16_io_f32_comp = false;
+    //! whether to enable transform to pure float16 model
+    bool f16_io_comp = false;
+    //! whether to enable conv bias nonlinearity fusion
+    bool fuse_conv_bias_nonlinearity = false;
+    enum LayoutTransform : uint32_t {
+        DEFAULT,
+        NHWCD4,  ///< compute using NHWCD4 tensor format
+        NCHW88,  ///< compute using NCHW88 tensor format
+        NCHW44,  ///< compute using NCHW44 tensor format
+        NCHW32,  ///< compute using NCHW32 tensor format, used for
+                 ///< tensorcore
+        CHWN4,   ///< compute using CHWN4 tensor format, mainly used
+                 ///< on CUDA
+    };
+    LayoutTransform layout_transform = LayoutTransform::DEFAULT;
+    //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b)
+    //! + z -> conv_bias(x, w, b, z)
+    bool fuse_conv_bias_with_z = false;
+
+#define SET(n)                                 \
+    GraphCommonOptimizeOptions& enable_##n() { \
+        n = true;                              \
+        return *this;                          \
+    }
+    SET(f16_io_f32_comp);
+    SET(f16_io_comp);
+    SET(fuse_conv_bias_nonlinearity);
+    SET(fuse_conv_bias_with_z);
+#undef SET
+#define SET(_trans, _trans_capital)                                 \
+    GraphCommonOptimizeOptions& enable_##_trans() {                 \
+        layout_transform = LayoutTransform::_trans_capital;         \
+        return *this;                                               \
+    }                                                               \
+    bool transform_##_trans() const {                               \
+        return layout_transform == LayoutTransform::_trans_capital; \
+    }
+
+    SET(nhwcd4, NHWCD4);
+    SET(nchw88, NCHW88);
+    SET(nchw44, NCHW44);
+    SET(nchw32, NCHW32);
+    SET(chwn4, CHWN4);
+#undef SET
+};
+
 /*!
  * \brief Computing graph.
  *
@@ -232,7 +285,7 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
         } seq_opt;
 
         //! graph optimization options
-        struct GraphOpt {
+        struct GraphOpt : GraphCommonOptimizeOptions {
            //! whether to enable JIT; JIT would also be enabled at O3
            //! this value indicates JIT level: 1 for basic elemwise opr; 2
            //! for including reduce oprs
@@ -241,8 +294,6 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
            bool tensorrt = false;
            //! whether to enable fast-run profiled winograd opr replace
            bool winograd_transform = false;
-           //! whether to enable nchw4->chwn4 opr replace
-           bool enable_chwn4 = false;
         } graph_opt;
 
         //! get attribute for an operator
diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp
index 0cca3cfc69a7c1573f19e86dbd3190be7918a5ba..eb1e64a0644ed389debbc508d004f736a5e75209 100644
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -14,6 +14,7 @@
 #include "megbrain/gopt/basic_arith.h"
 #include "megbrain/gopt/misc.h"
 #include "megbrain/gopt/gtrans.h"
+#include "megbrain/graph/cg.h"
 #include "megbrain/graph/event.h"
 #include "megbrain/graph/exc_extra_info.h"
 #include "megbrain/serialization/serializer.h"
@@ -672,7 +673,11 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
     }
 #endif
 
-    apply_optimize_options(inference_opt);
+    if (inference_opt) {
+        add_pass();
+        apply_optimize_options(*inference_opt);
+    }
+
     if (inference_opt) {
         // merge params to reduce loading time and graph overhead
@@ -699,32 +704,32 @@ VarNode* GraphOptimizer::var_replace_lookup(VarNode *var) {
     }
 }
 
-void GraphOptimizer::apply_optimize_options(
-        const OptimizeOptions* options) {
-    if (!options) return;
-    if (options->f16_io_comp) {
+const GraphOptimizer& GraphOptimizer::apply_optimize_options(
+        const cg::GraphCommonOptimizeOptions& options) {
+    if (options.f16_io_comp) {
         add_pass(ConvertF32ToF16Pass::make(false));
     }
-    if (options->f16_io_f32_comp) {
+    if (options.f16_io_f32_comp) {
         add_pass(ConvertF32ToF16Pass::make(true));
     }
-    if (options->transform_nhwcd4()) {
+    if (options.transform_nhwcd4()) {
         add_pass(ConvertFormatPass::make_nhwcd4_converter());
         add_pass();
     }
-    if (options->transform_nchw88()) {
+    if (options.transform_nchw88()) {
         add_pass(EnableNchwxxPass::make_nchwxx_converter(8));
     }
-    if (options->transform_nchw44()) {
+    if (options.transform_nchw44()) {
         add_pass(EnableNchwxxPass::make_nchwxx_converter(4));
     }
-    if (options->transform_nchw32()) {
+    if (options.transform_nchw32()) {
         add_pass();
+        add_pass();
         add_pass(EnableTensorCorePass::make_tensorcore_converter());
         add_pass();
         add_pass();
     }
-    if (options->transform_chwn4()) {
+    if (options.transform_chwn4()) {
         add_pass();
         add_pass();
         add_pass(EnableCHWN4Pass::make_chwn4_converter());
@@ -732,14 +737,15 @@ void GraphOptimizer::apply_optimize_options(
         add_pass();
     }
-    if (options->fuse_conv_bias_nonlinearity) {
+    if (options.fuse_conv_bias_nonlinearity) {
         add_pass();
     }
-    if (options->fuse_conv_bias_with_z) {
+    if (options.fuse_conv_bias_with_z) {
         add_pass();
         add_pass();
     }
     add_pass();
+    return *this;
 }
 
 /* ================ ConstVarPropogateBase ================ */
diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp
index c797be7949b4018e8e26a91934a43bb17553ec13..8f1507a5651b9d0c10b59c33ccdf2be80b685207 100644
--- a/src/gopt/impl/tensor_reformat.cpp
+++ b/src/gopt/impl/tensor_reformat.cpp
@@ -2215,16 +2215,4 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
     Impl{opt};
 }
 
-void gopt::reformat_to_chwn4_transform_dest_vars_inplace(
-        mgb::cg::VarNodeArray& dest_vars) {
-    gopt::GraphOptimizer optimizer;
-    optimizer.add_pass();
-    optimizer.add_pass();
-    optimizer.add_pass(EnableCHWN4Pass::make_chwn4_converter());
-    optimizer.add_pass();
-    optimizer.add_pass();
-    optimizer.add_pass();
-    optimizer.apply_inplace(dest_vars);
-}
-
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/gopt/include/megbrain/gopt/framework.h b/src/gopt/include/megbrain/gopt/framework.h
index 77a2f0f8f37084ebca8e46a41902667b08ec22d1..f5f41cdb31c2e3f8a2b789037e8e15518ee61045 100644
--- a/src/gopt/include/megbrain/gopt/framework.h
+++ b/src/gopt/include/megbrain/gopt/framework.h
@@ -13,6 +13,7 @@
 
 #include "megbrain/graph.h"
 #include "megbrain/gopt/gtrans.h"
+#include "megbrain/graph/cg.h"
 
 namespace mgb {
 namespace gopt {
@@ -377,60 +378,6 @@ namespace gopt {
         RecursiveSubGraphRewriteHelper(OptState &state);
     };
 
-    /**
-     * \brief common optimize options, it both can be used for optimize for
-     * inference in graph dump but also used in graph optimization in runtime.
-     */
-    struct OptimizeOptions {
-        //! whether to enable IO in float16 compute in float32
-        bool f16_io_f32_comp = false;
-        //! whether to enable tranform to pure float16 model
-        bool f16_io_comp = false;
-        //! whether to enable conv bias nonlinearity fusion
-        bool fuse_conv_bias_nonlinearity = false;
-        enum LayoutTransform : uint32_t {
-            DEFAULT,
-            NHWCD4,  ///< compute using NHWCD4 tensor format
-            NCHW88,  ///< compute using NCHW88 tensor format
-            NCHW44,  ///< compute using NCHW44 tensor format
-            NCHW32,  ///< compute using NCHW32 tensor format, used for
-                     ///< tensorcore
-            CHWN4,   ///< compute using CHWN4 tensor format, transformed mainly
-                     ///< used for cuda
-        };
-        LayoutTransform layout_transform = LayoutTransform::DEFAULT;
-        //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b)
-        //! + z -> conv_bias(x, w, b, z)
-        bool fuse_conv_bias_with_z = false;
-
-#define SET(n)                      \
-    OptimizeOptions& enable_##n() { \
-        n = true;                   \
-        return *this;               \
-    }
-        SET(f16_io_f32_comp);
-        SET(f16_io_comp);
-        SET(fuse_conv_bias_nonlinearity);
-        SET(fuse_conv_bias_with_z);
-#undef SET
-#define SET(_trans, _trans_capital)                                 \
-    OptimizeOptions& enable_##_trans() {                            \
-        layout_transform = LayoutTransform::_trans_capital;         \
-        return *this;                                               \
-    }                                                               \
-    bool transform_##_trans() const {                               \
-        return layout_transform == LayoutTransform::_trans_capital; \
-    }
-
-        SET(nhwcd4, NHWCD4);
-        SET(nchw88, NCHW88);
-        SET(nchw44, NCHW44);
-        SET(nchw32, NCHW32);
-        SET(chwn4, CHWN4);
-#undef SET
-    };
-
-
     /*!
      * \brief manage passes and their applying on graphs
      *
@@ -523,7 +470,8 @@ namespace gopt {
         /**
          * \brief apply optimize options
          */
-        void apply_optimize_options(const OptimizeOptions* options);
+        const GraphOptimizer& apply_optimize_options(
+                const cg::GraphCommonOptimizeOptions& options);
     };
 
     /*!
diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h
index 773d05ec4c8e3ac6a8084e026316c4dc85a4b4fd..9e653c58123f47922eab2fa197561e2670311fd4 100644
--- a/src/gopt/include/megbrain/gopt/inference.h
+++ b/src/gopt/include/megbrain/gopt/inference.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "megbrain/gopt/framework.h"
+#include "megbrain/graph/cg.h"
 
 namespace mgb {
 namespace gopt {
@@ -256,7 +257,7 @@ namespace gopt {
                 size_t pack_c_size);
     };
 
-    struct OptimizeForInferenceOptions : OptimizeOptions {};
+    struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions {};
 
     /*!
      * \brief optimize a computing graph for inference
@@ -325,13 +326,6 @@ namespace gopt {
         void apply(OptState& opt) const override;
     };
 
-    /*!
-     * \brief transform tensor format in a network to c/4hwn4 format, and
-     * accelerate the inference speed on Nvidia platform
-     */
-    void reformat_to_chwn4_transform_dest_vars_inplace(
-            mgb::cg::VarNodeArray& dest_vars);
-
 } // namespace gopt
 } // namespace mgb
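
Usage sketch (not part of the patch): a minimal illustration of how the consolidated options introduced above are intended to be driven after this change, assuming a `std::shared_ptr<mgb::cg::ComputingGraph> graph` and an endpoint array `outputs` that exist elsewhere; the pass template arguments elided as `add_pass()` in the hunks are filled in by apply_optimize_options() internally.

    #include "megbrain/gopt/framework.h"
    #include "megbrain/graph.h"

    void enable_chwn4_sketch(std::shared_ptr<mgb::cg::ComputingGraph> graph,
                             mgb::cg::VarNodeArray& outputs) {
        // GraphOpt now inherits GraphCommonOptimizeOptions, so the old
        // boolean `enable_chwn4` flag is replaced by the chained setter;
        // compile_prepare() picks this up via transform_chwn4().
        graph->options().graph_opt.enable_chwn4();

        // Equivalent explicit form: build an optimizer from the same option
        // struct and rewrite the endpoint vars in place, mirroring the new
        // branch in ComputingGraphImpl::compile_prepare().
        mgb::cg::GraphCommonOptimizeOptions opt;
        opt.enable_chwn4().enable_fuse_conv_bias_nonlinearity();
        mgb::gopt::GraphOptimizer optimizer;
        optimizer.apply_optimize_options(opt);
        optimizer.apply_inplace(outputs);
    }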