From e24fcd00c1df1c5f71c971fded9f6fd8e215bada Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 11 May 2020 02:18:32 +0800 Subject: [PATCH] refactor(gopt): use graphcommonoptimizeoptions for graphopt GitOrigin-RevId: dd8a93813ae7885bdc23e43f197a86c19e25ddc2 --- sdk/load-and-run/src/mgblar.cpp | 4 +- src/core/impl/graph/cg_impl.cpp | 12 +++-- src/core/include/megbrain/graph/cg.h | 57 +++++++++++++++++++-- src/gopt/impl/framework.cpp | 32 +++++++----- src/gopt/impl/tensor_reformat.cpp | 12 ----- src/gopt/include/megbrain/gopt/framework.h | 58 ++-------------------- src/gopt/include/megbrain/gopt/inference.h | 10 +--- 7 files changed, 88 insertions(+), 97 deletions(-) diff --git a/sdk/load-and-run/src/mgblar.cpp b/sdk/load-and-run/src/mgblar.cpp index 7d2b29324..7488a3bd4 100644 --- a/sdk/load-and-run/src/mgblar.cpp +++ b/sdk/load-and-run/src/mgblar.cpp @@ -83,7 +83,7 @@ R"__usage__( hard to profile host time. Use --profile-host to focus on host time profiling. --input [ filepath | string] - Set up inputs for megbrain model. for example: --data image.ppm --data + Set up inputs for megbrain model. for example: --data image.ppm --data param.json --data bbox:bbox.npy@batchid:b.npy --data rect:[0,0,227,227]; batchid:0,1,2,3. --io-dump or --bin-io-dump should be enabled at the same time. 
@@ -974,7 +974,7 @@ Args Args::from_argv(int argc, char **argv) { #endif if (!strcmp(argv[i], "--enable-chwn4")) { mgb_log_warn("enable chwn4 optimization"); - graph_opt.graph_opt.enable_chwn4 = true; + graph_opt.graph_opt.enable_chwn4(); continue; } #if MGB_ENABLE_JSON diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp index 97aaaebbb..53f31e6dc 100644 --- a/src/core/impl/graph/cg_impl.cpp +++ b/src/core/impl/graph/cg_impl.cpp @@ -17,6 +17,7 @@ #include "megbrain/gopt/inference.h" #include "megbrain/gopt/basic_arith.h" #include "megbrain/gopt/misc.h" +#include "megbrain/graph/cg.h" #include "megbrain/graph/event.h" #include "megbrain/graph/exc_extra_info.h" #include "megbrain/graph/helper.h" @@ -457,14 +458,17 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare( } #endif - if (options().graph_opt.enable_chwn4) { - options().graph_opt.enable_chwn4 = false; - gopt::reformat_to_chwn4_transform_dest_vars_inplace(dest_vars); - } if (options().graph_opt.winograd_transform) { options().graph_opt.winograd_transform = false; gopt::transform_vars_inplace_with_winograd(dest_vars); } + if (options().graph_opt.transform_chwn4()) { + gopt::GraphOptimizer optimizer; + optimizer.apply_optimize_options(options().graph_opt); + options().graph_opt.layout_transform = + cg::GraphCommonOptimizeOptions::LayoutTransform::DEFAULT; + optimizer.apply_inplace(dest_vars); + } #if MGB_JIT if (std::abs(options().graph_opt_level) == 0 && options().graph_opt.jit) { diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h index 5283be523..d84cdc6c2 100644 --- a/src/core/include/megbrain/graph/cg.h +++ b/src/core/include/megbrain/graph/cg.h @@ -81,6 +81,59 @@ public: virtual size_t static_alloc_version(ComputingGraph* graph) const; }; +/** + * \brief common optimize options; they can be used both to optimize for + * inference at graph dump time and for graph optimization at runtime. 
+ */ +struct GraphCommonOptimizeOptions { + //! whether to enable IO in float16 compute in float32 + bool f16_io_f32_comp = false; + //! whether to enable transform to pure float16 model + bool f16_io_comp = false; + //! whether to enable conv bias nonlinearity fusion + bool fuse_conv_bias_nonlinearity = false; + enum LayoutTransform : uint32_t { + DEFAULT, + NHWCD4, ///< compute using NHWCD4 tensor format + NCHW88, ///< compute using NCHW88 tensor format + NCHW44, ///< compute using NCHW44 tensor format + NCHW32, ///< compute using NCHW32 tensor format, used for + ///< tensorcore + CHWN4, ///< compute using CHWN4 tensor format, transformed mainly + ///< used for cuda + }; + LayoutTransform layout_transform = LayoutTransform::DEFAULT; + //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) + //! + z -> conv_bias(x, w, b, z) + bool fuse_conv_bias_with_z = false; + +#define SET(n) \ + GraphCommonOptimizeOptions& enable_##n() { \ + n = true; \ + return *this; \ + } + SET(f16_io_f32_comp); + SET(f16_io_comp); + SET(fuse_conv_bias_nonlinearity); + SET(fuse_conv_bias_with_z); +#undef SET +#define SET(_trans, _trans_capital) \ + GraphCommonOptimizeOptions& enable_##_trans() { \ + layout_transform = LayoutTransform::_trans_capital; \ + return *this; \ + } \ + bool transform_##_trans() const { \ + return layout_transform == LayoutTransform::_trans_capital; \ + } + + SET(nhwcd4, NHWCD4); + SET(nchw88, NCHW88); + SET(nchw44, NCHW44); + SET(nchw32, NCHW32); + SET(chwn4, CHWN4); +#undef SET +}; + /*! * \brief Computing graph. * @@ -232,7 +285,7 @@ class ComputingGraph : public std::enable_shared_from_this, } seq_opt; //! graph optimization options - struct GraphOpt { + struct GraphOpt : GraphCommonOptimizeOptions { //! whether to enable JIT; JIT would also be enabled at O3 //! this value indicates JIT level: 1 for basic elemwise opr; 2 //! 
for including reduce oprs @@ -241,8 +294,6 @@ class ComputingGraph : public std::enable_shared_from_this, bool tensorrt = false; //! whether to enable fast-run profiled winograd opr replace bool winograd_transform = false; - //! whether to enable nchw4->chwn4 opr replace - bool enable_chwn4 = false; } graph_opt; //! get attribute for an operator diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp index 0cca3cfc6..eb1e64a06 100644 --- a/src/gopt/impl/framework.cpp +++ b/src/gopt/impl/framework.cpp @@ -14,6 +14,7 @@ #include "megbrain/gopt/basic_arith.h" #include "megbrain/gopt/misc.h" #include "megbrain/gopt/gtrans.h" +#include "megbrain/graph/cg.h" #include "megbrain/graph/event.h" #include "megbrain/graph/exc_extra_info.h" #include "megbrain/serialization/serializer.h" @@ -672,7 +673,11 @@ GraphOptimizer& GraphOptimizer::add_preset_passes( } #endif - apply_optimize_options(inference_opt); + if (inference_opt) { + add_pass(); + apply_optimize_options(*inference_opt); + } + if (inference_opt) { // merge params to reduce loading time and graph overhead @@ -699,32 +704,32 @@ VarNode* GraphOptimizer::var_replace_lookup(VarNode *var) { } } -void GraphOptimizer::apply_optimize_options( - const OptimizeOptions* options) { - if (!options) return; - if (options->f16_io_comp) { +const GraphOptimizer& GraphOptimizer::apply_optimize_options( + const cg::GraphCommonOptimizeOptions& options) { + if (options.f16_io_comp) { add_pass(ConvertF32ToF16Pass::make(false)); } - if (options->f16_io_f32_comp) { + if (options.f16_io_f32_comp) { add_pass(ConvertF32ToF16Pass::make(true)); } - if (options->transform_nhwcd4()) { + if (options.transform_nhwcd4()) { add_pass(ConvertFormatPass::make_nhwcd4_converter()); add_pass(); } - if (options->transform_nchw88()) { + if (options.transform_nchw88()) { add_pass(EnableNchwxxPass::make_nchwxx_converter(8)); } - if (options->transform_nchw44()) { + if (options.transform_nchw44()) { 
add_pass(EnableNchwxxPass::make_nchwxx_converter(4)); } - if (options->transform_nchw32()) { + if (options.transform_nchw32()) { add_pass(); + add_pass(); add_pass(EnableTensorCorePass::make_tensorcore_converter()); add_pass(); add_pass(); } - if (options->transform_chwn4()) { + if (options.transform_chwn4()) { add_pass(); add_pass(); add_pass(EnableCHWN4Pass::make_chwn4_converter()); @@ -732,14 +737,15 @@ void GraphOptimizer::apply_optimize_options( add_pass(); } - if (options->fuse_conv_bias_nonlinearity) { + if (options.fuse_conv_bias_nonlinearity) { add_pass(); } - if (options->fuse_conv_bias_with_z) { + if (options.fuse_conv_bias_with_z) { add_pass(); add_pass(); } add_pass(); + return *this; } /* ================ ConstVarPropogateBase ================ */ diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index c797be794..8f1507a56 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -2215,16 +2215,4 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const { Impl{opt}; } -void gopt::reformat_to_chwn4_transform_dest_vars_inplace( - mgb::cg::VarNodeArray& dest_vars) { - gopt::GraphOptimizer optimizer; - optimizer.add_pass(); - optimizer.add_pass(); - optimizer.add_pass(EnableCHWN4Pass::make_chwn4_converter()); - optimizer.add_pass(); - optimizer.add_pass(); - optimizer.add_pass(); - optimizer.apply_inplace(dest_vars); -} - // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/include/megbrain/gopt/framework.h b/src/gopt/include/megbrain/gopt/framework.h index 77a2f0f8f..f5f41cdb3 100644 --- a/src/gopt/include/megbrain/gopt/framework.h +++ b/src/gopt/include/megbrain/gopt/framework.h @@ -13,6 +13,7 @@ #include "megbrain/graph.h" #include "megbrain/gopt/gtrans.h" +#include "megbrain/graph/cg.h" namespace mgb { namespace gopt { @@ -377,60 +378,6 @@ namespace gopt { RecursiveSubGraphRewriteHelper(OptState &state); }; - /** - * \brief common optimize options, it 
both can be used for optimize for - * inference in graph dump but also used in graph optimization in runtime. - */ - struct OptimizeOptions { - //! whether to enable IO in float16 compute in float32 - bool f16_io_f32_comp = false; - //! whether to enable tranform to pure float16 model - bool f16_io_comp = false; - //! whether to enable conv bias nonlinearity fusion - bool fuse_conv_bias_nonlinearity = false; - enum LayoutTransform : uint32_t { - DEFAULT, - NHWCD4, ///< compute using NHWCD4 tensor format - NCHW88, ///< compute using NCHW88 tensor format - NCHW44, ///< compute using NCHW44 tensor format - NCHW32, ///< compute using NCHW32 tensor format, used for - ///< tensorcore - CHWN4, ///< compute using CHWN4 tensor format, transformed mainly - ///< used for cuda - }; - LayoutTransform layout_transform = LayoutTransform::DEFAULT; - //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) - //! + z -> conv_bias(x, w, b, z) - bool fuse_conv_bias_with_z = false; - -#define SET(n) \ - OptimizeOptions& enable_##n() { \ - n = true; \ - return *this; \ - } - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(fuse_conv_bias_with_z); -#undef SET -#define SET(_trans, _trans_capital) \ - OptimizeOptions& enable_##_trans() { \ - layout_transform = LayoutTransform::_trans_capital; \ - return *this; \ - } \ - bool transform_##_trans() const { \ - return layout_transform == LayoutTransform::_trans_capital; \ - } - - SET(nhwcd4, NHWCD4); - SET(nchw88, NCHW88); - SET(nchw44, NCHW44); - SET(nchw32, NCHW32); - SET(chwn4, CHWN4); -#undef SET - }; - - /*! * \brief manage passes and their applying on graphs * @@ -523,7 +470,8 @@ namespace gopt { /** * \brief apply optimize options */ - void apply_optimize_options(const OptimizeOptions* options); + const GraphOptimizer& apply_optimize_options( + const cg::GraphCommonOptimizeOptions& options); }; /*! 
diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h index 773d05ec4..9e653c581 100644 --- a/src/gopt/include/megbrain/gopt/inference.h +++ b/src/gopt/include/megbrain/gopt/inference.h @@ -12,6 +12,7 @@ #pragma once #include "megbrain/gopt/framework.h" +#include "megbrain/graph/cg.h" namespace mgb { namespace gopt { @@ -256,7 +257,7 @@ namespace gopt { size_t pack_c_size); }; - struct OptimizeForInferenceOptions : OptimizeOptions {}; + struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions {}; /*! * \brief optimize a computing graph for inference @@ -325,13 +326,6 @@ namespace gopt { void apply(OptState& opt) const override; }; - /*! - * \brief transform tensor format in a network to c/4hwn4 format, and - * accelerate the inference speed on Nvidia platform - */ - void reformat_to_chwn4_transform_dest_vars_inplace( - mgb::cg::VarNodeArray& dest_vars); - } // namespace gopt } // namespace mgb -- GitLab