From a3cd3fc74fda32266269ab9a5f46928e32c73e66 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 9 Sep 2021 16:18:53 +0800 Subject: [PATCH] test(mgb/gopt): add testcase for global layout transform GitOrigin-RevId: f9669e1ba0d4c46ca8aab3161870d17d7762bf8b --- src/gopt/impl/dynamic_programming_solver.cpp | 5 +- src/gopt/impl/layout_transform_pass.cpp | 6 +- src/gopt/impl/opr_tensor_formats_config.cpp | 52 +- src/gopt/impl/profiler_impl.cpp | 58 +- src/gopt/impl/reformat_manager.cpp | 19 +- .../megbrain/gopt/global_layout_transform.h | 5 +- .../include/megbrain/gopt/reformat_manager.h | 12 - src/gopt/test/inference.cpp | 73 +++ src/gopt/test/layout_transform_pass.cpp | 610 ++++++++++++++---- src/gopt/test/network.cpp | 237 +++++++ src/gopt/test/network.h | 77 +++ 11 files changed, 994 insertions(+), 160 deletions(-) create mode 100644 src/gopt/test/network.cpp create mode 100644 src/gopt/test/network.h diff --git a/src/gopt/impl/dynamic_programming_solver.cpp b/src/gopt/impl/dynamic_programming_solver.cpp index 847ea63b..01a3627d 100644 --- a/src/gopt/impl/dynamic_programming_solver.cpp +++ b/src/gopt/impl/dynamic_programming_solver.cpp @@ -28,7 +28,10 @@ public: private: using TensorFormatsBitSet = uint32_t; using State = SmallVector; - static constexpr uint32_t MAX_TENSOR_FORMATS = sizeof(TensorFormatsBitSet); + /// 1bit represents one kind of tensor formats + static constexpr uint32_t BITS_PER_BYTE = 8; + static constexpr uint32_t MAX_TENSOR_FORMATS = + sizeof(TensorFormatsBitSet) * BITS_PER_BYTE; TensorFormatsBitSet add(TensorFormatsBitSet& set, TensorFormats fmt) { mgb_assert(static_cast(fmt) < MAX_TENSOR_FORMATS); set |= (1 << static_cast(fmt)); diff --git a/src/gopt/impl/layout_transform_pass.cpp b/src/gopt/impl/layout_transform_pass.cpp index 0c0399a1..b46670c2 100644 --- a/src/gopt/impl/layout_transform_pass.cpp +++ b/src/gopt/impl/layout_transform_pass.cpp @@ -111,8 +111,6 @@ void LayoutTransformPass::apply(OptState& opt) const { } new_var = 
reformat({new_var}); } - if (from != to && !new_var->shape().is_scalar()) - new_var = reformat({new_var}); new_inp[i] = new_var; } VarNode* new_out; @@ -164,7 +162,9 @@ void LayoutTransformPass::apply(OptState& opt) const { } } else { auto new_opr = rewriter.auto_replace_outputs(opr); - var2fmts[new_opr->output(0)] = base_fmt; + for (auto&& ov : new_opr->usable_output()) { + var2fmts[ov] = base_fmt; + } } }; opt.graph().iter(on_opr); diff --git a/src/gopt/impl/opr_tensor_formats_config.cpp b/src/gopt/impl/opr_tensor_formats_config.cpp index db74bcec..5a570544 100644 --- a/src/gopt/impl/opr_tensor_formats_config.cpp +++ b/src/gopt/impl/opr_tensor_formats_config.cpp @@ -245,19 +245,26 @@ struct ConvTensorFormatsDispatcherImpl { if (i == 2) available &= opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS32; - else - available &= opr->input(i)->dtype().enumv() == - DTypeEnum::Quantized4Asymm || - opr->input(i)->dtype().enumv() == - DTypeEnum::QuantizedS4; + else { + bool i4_config = opr->input(i)->dtype().enumv() == + DTypeEnum::Quantized4Asymm || + opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS4; + bool i8_config = opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS8; + available &= (i4_config || i8_config); + } config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); TensorType tensor_type = i == 1 ? 
TensorType::WEIGHT : TensorType::FEATURE; config.input_tensor_types.emplace_back(tensor_type); } - available &= + bool i4_config = opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; + bool i8_config = + opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + available &= (i4_config || i8_config); config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); available &= conv.param().sparse == Opr::Param::Sparse::DENSE; config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, @@ -496,6 +503,38 @@ struct ConvTensorFormatsDispatcherImpl +struct ConvTensorFormatsDispatcherImpl { + using Opr = opr::ConvolutionBackwardData; + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW4; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + available &= + opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 0 ? 
TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE; + config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, + TensorFormats::NHWC, + TensorFormats::NHWC}; + config.output_tensor_formats = {TensorFormats::NHWC}; + if (available) + return config; + return None; + } +}; + struct StaticData { struct KeyHash { size_t operator()(const std::pair& val) const { @@ -543,6 +582,7 @@ StaticData::StaticData() { OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4); OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NHWC); OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4); OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW); diff --git a/src/gopt/impl/profiler_impl.cpp b/src/gopt/impl/profiler_impl.cpp index 7ab62c3c..bc2a84ee 100644 --- a/src/gopt/impl/profiler_impl.cpp +++ b/src/gopt/impl/profiler_impl.cpp @@ -17,7 +17,6 @@ #include "megbrain/graph/event.h" #include "megbrain/opr/dnn/pooling.h" #include "megbrain/opr/imgproc.h" -#include "megbrain/opr/nn_int.h" #include "megbrain/opr/io.h" #include "megbrain/opr/nn_int.h" #include "megbrain/plugin/base.h" @@ -167,11 +166,12 @@ private: static constexpr float PROFILE_TIME_OUT = 1e7; using ReformatAttribute = ReformatKey::Attribute; /*! - * \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) + * \brief profile opr format agnostic operators (like elemwise, elemwise + * multi type, typecvt etc.) * * \param opr pointer to the operator node to be profiled * \param base_format the original tensor format of the operator node. 
- * \param available_tensor_formats the available tensor formats + * \param available_tensor_formats the available tensor formats * \return the operator node record */ OperatorNodeRecord profile_operator( @@ -220,7 +220,7 @@ private: ReformatAttribute::DEFAULT) const; float profile_var_node(const VarNode* var, TensorFormats base_format, const ReformatKey& key) const; - int m_runs; /// sample times of the profiler + int m_runs; /// sample times of the profiler }; ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( @@ -281,10 +281,6 @@ ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator( record.opr = opr; auto& costs = record.costs; for (auto&& i : available_configs) { - /// XXXX remove later - if (i.opr_format == OprFormat::NCHW && - opr->input(0)->dtype().enumv() != DTypeEnum::Float32) - continue; costs[i.opr_format] = profile_operator(opr, base_config, i, extra_attribute); } @@ -403,8 +399,8 @@ float ProfilerImpl::profile_var_node(const VarNode* var, auto builder = ReformatManager::instance().auto_aligned_reformat_featrue( var, base_format, key); auto y = builder({aligned_var.node()}); - if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), - TensorFormat{})) + + if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), key)) return PROFILE_TIME_OUT; ThinHashSet set; DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); }); @@ -533,6 +529,17 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold) m_var_node_threshold{var_node_threshold} { m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) { + /// \note: for the considerations of performance, we skip nchw(naive) + /// kernels for conv bias on CUDA platform. 
to remove this later + if (auto conv = try_cast_as_op(new_opr)) { + if (conv->output(0)->comp_node().device_type() == + CompNode::DeviceType::CUDA && + conv->input(0)->dtype().category() == + DTypeCategory::QUANTIZED && + conv->param().format == OprFormat::NCHW) { + return false; + } + } float comp1 = m_opr_footprint.get_computation( const_cast(opr)); float comp2 = m_opr_footprint.get_computation(new_opr); @@ -541,18 +548,27 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold) return true; }; m_var_node_filter = [this](const VarNode* var, TensorShape from, - TensorShape to, TensorFormat format) { - TensorFormat default_; - TensorLayout orig_ly, from_ly, to_ly; - if (format == default_) { - orig_ly = {var->shape(), var->dtype()}; - from_ly = {from, var->dtype()}; - to_ly = {to, var->dtype()}; - } else { - orig_ly = {var->shape(), var->dtype(), format}; - from_ly = {from, var->dtype(), format}; - to_ly = {to, var->dtype(), format}; + TensorShape to, ReformatKey key) { + /// \note: due to the alignment requirement of low-bit tensor, we skip + /// some layout transform for low-bit tensors. The skipped layout + /// transforms do not have corresponding dnn kernel and cannot be + /// implemented by tensor manip operators (like reshape, dimshuffle, + /// subtensor, etc.). 
+ if (var->dtype().enumv() == DTypeEnum::QuantizedS4 || + var->dtype().enumv() == DTypeEnum::Quantized4Asymm) { + if (key.input_format == TensorFormats::NCHW && + key.output_format != TensorFormats::NHWC && + key.output_format != TensorFormats::NCHWc64) { + return false; + } + if (key.output_format == TensorFormats::NCHW && + key.input_format != TensorFormats::NHWC && + key.input_format != TensorFormats::NCHWc64) { + return false; + } } + TensorLayout orig_ly = {var->shape(), var->dtype()}, + from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()}; float orig_memory = orig_ly.span().dist_byte() * 2.f; float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte(); diff --git a/src/gopt/impl/reformat_manager.cpp b/src/gopt/impl/reformat_manager.cpp index d3786838..f83333ea 100644 --- a/src/gopt/impl/reformat_manager.cpp +++ b/src/gopt/impl/reformat_manager.cpp @@ -329,10 +329,21 @@ ReformatManager::ReformatImpl ReformatManager::get( const ReformatKey& key) const { using Attribute = ReformatKey::Attribute; MGB_TRY { - auto find = m_cache.find(key); - if (find != m_cache.end()) { - auto rst = find->second; - return rst; + { + auto find = m_cache.find(key); + if (find != m_cache.end()) { + auto rst = find->second; + return rst; + } + } + if (key.attribute == Attribute::AUTO_PADDING_NHWC) { + auto key_ = key; + key_.attribute = Attribute::DEFAULT; + auto find = m_cache.find(key_); + if (find != m_cache.end()) { + auto rst = find->second; + return rst; + } } mgb_assert(!(key.attribute & Attribute::IMAGE2D) && !(key.attribute & Attribute::IC_SMALL)); diff --git a/src/gopt/include/megbrain/gopt/global_layout_transform.h b/src/gopt/include/megbrain/gopt/global_layout_transform.h index 50a9b615..4d690ab2 100644 --- a/src/gopt/include/megbrain/gopt/global_layout_transform.h +++ b/src/gopt/include/megbrain/gopt/global_layout_transform.h @@ -222,8 +222,9 @@ public: }; using OprFilter = thin_function; - using VarNodeFilter = thin_function; + using VarNodeFilter 
= + thin_function; ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f); ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {}) diff --git a/src/gopt/include/megbrain/gopt/reformat_manager.h b/src/gopt/include/megbrain/gopt/reformat_manager.h index cef608e4..e4f34ff2 100644 --- a/src/gopt/include/megbrain/gopt/reformat_manager.h +++ b/src/gopt/include/megbrain/gopt/reformat_manager.h @@ -146,18 +146,6 @@ private: }; MGB_DEF_ENUM_CLASS_BIT_OPR(ReformatManager::ReformatKey::Attribute); -// -//TensorShape make_aligned_tensor_shape( -// const VarNode* var, TensorFormats orig_formats, -// TensorFormats target_formats, -// ReformatManager::ReformatKey::Attribute extra_attribute = -// ReformatManager::ReformatKey::Attribute::DEFAULT); -// -//TensorShape make_aligned_weight_shape( -// const VarNode* var, TensorFormats orig_formats, -// TensorFormats target_formats, TensorFormats extra_formats, -// ReformatManager::ReformatKey::Attribute extra_attribute = -// ReformatManager::ReformatKey::Attribute::DEFAULT); } // namespace gopt } // namespace mgb diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 817f6384..59d700fe 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4104,6 +4104,79 @@ TEST(TestGoptInference, PreProcessCaseAutopadNCHW64) { opr::RelayoutFormat::Param::Mode::NCHW_NCHW4); } +TEST(TestGoptInference, PreProcessCaseAutopadNHWC) { + REQUIRE_GPU(1); + HostTensorGenerator gen(0, 255); + auto cn = CompNode::load("gpu0"); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + 
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + size_t n = 2; + size_t c = 3; + size_t h = 32; + size_t w = 32; + auto host_x1 = gen({n, c, h, w}, cn); + + auto x = opr::Host2DeviceCopy::make(*graph, host_x1); + auto x_u8_fp32 = opr::TypeCvt::make(x, dtype::Float32(), cn); + auto x_s8_fp32 = x_u8_fp32 - 128; + auto x_s8 = opr::TypeCvt::make(x_s8_fp32, dtype::QuantizedS8(2.5f), cn); + auto host_val = + std::make_shared(cn, dtype::QuantizedS8(2.5f)); + TensorShape scalar{1, 1, 1, 1}; + host_val->resize(scalar); + auto ptr = host_val->raw_ptr(); + size_t size_bytes = + TensorLayout{scalar, dtype::QuantizedS8(2.5f)}.span().dist_byte(); + std::memset(ptr, 0, size_bytes); + auto padding = opr::ImmutableTensor::make(*graph, *host_val); + padding = opr::Broadcast::make(padding, {n, 1, h, w}); + auto padded_x = opr::Concat::make({x_s8, padding}, 1); + auto nhwc_x = opr::Dimshuffle::make(padded_x, {0, 2, 3, 1}); + auto weight = mkcvar("weight", {16, 3, 3, 4}, dtype::QuantizedS8(2.5f)), + bias = mkcvar("bias", {1, 1, 1, 16}, dtype::QuantizedS32(6.25f)); + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NHWC; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 1; + auto result = + opr::ConvBias::make(nhwc_x, weight, bias, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto y = opr::TypeCvt::make(result, dtype::Float32()); + SymbolVar y_opt; + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_preprocess(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + + graph->compile({{y_opt, {}}}) + ->to_json() + ->writeto_fpath(output_file( + "TestGoptInference.PreProcessCaseAutopadNHWC.json")); + + HostTensorND host_y_opt, host_y; + auto func = graph->compile({make_callback_copy(y, host_y), + make_callback_copy(y_opt, host_y_opt)}); + func->execute(); + MGB_ASSERT_TENSOR_NEAR(host_y, 
host_y_opt, 1e-5); + ASSERT_TRUE(find_opr(y_opt).param().mode == + opr::RelayoutFormat::Param::Mode::NCHW_NCHW4); +} + TEST(TestGoptInference, WarpAndPreProcessCase1) { REQUIRE_GPU(1); HostTensorGenerator gen(0, 255); diff --git a/src/gopt/test/layout_transform_pass.cpp b/src/gopt/test/layout_transform_pass.cpp index 575761b8..2faf5598 100644 --- a/src/gopt/test/layout_transform_pass.cpp +++ b/src/gopt/test/layout_transform_pass.cpp @@ -10,7 +10,8 @@ * implied. */ -#include "./helper.h" +#include "./network.h" +#include "megbrain/comp_node_env.h" #include "megbrain/gopt/global_layout_transform.h" #include "megbrain/gopt/inference.h" #include "megbrain/opr/dnn/pooling.h" @@ -24,23 +25,145 @@ using namespace gopt; using namespace serialization; #if MGB_CUDA -TEST(TestLayoutTransform, Feature) { - auto inp_file = InputFile::make_fs("./feat.mdl"); +namespace { +//! find first the operator of specific type; raise exception if not found +template +T& find_opr(SymbolVar endpoint) { + T* found = nullptr; + auto cb = [&found](cg::OperatorNodeBase* opr) { + if (!found && opr->same_type()) { + found = &opr->cast_final_safe(); + } + }; + cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); + mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str()); + return *found; +} - auto format = GraphLoader::identify_graph_dump_format(*inp_file); - ASSERT_TRUE(format.valid()); - auto loader = GraphLoader::make(std::move(inp_file), format.val()); +template +size_t find_opr_num(SymbolVar endpoint) { + size_t opr_num = 0; + auto cb = [&opr_num](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + printf("%s, %s\n", opr->cname(), opr->dyn_typeinfo()->name); + opr_num++; + } + }; + cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); + return opr_num; +} +} // namespace + +TEST(TestLayoutTransform, Resnet18_QS8) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 
+ prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + Network network(cn); + /// batch size = 1 reduce test time + auto output = make_resnet18(network, 16, dtype::QuantizedS8{1.f}); + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); - GraphLoader::LoadConfig load_config; - load_config.comp_graph = ComputingGraph::make(); - auto&& graph_opt = load_config.comp_graph->options(); - graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity(); - graph_opt.graph_opt.enable_fuse_conv_bias_with_z(); - auto ret = loader->load(load_config, false); + HostTensorND t1; + auto func1 = network.graph->compile({make_callback_copy(output, t1)}); + func1->execute(); + using OprFormat = LayoutTransformContext::OprFormat; + using OprList = LayoutTransformContext::OprList; + using ReformatAttribute = LayoutTransformContext::ReformatAttribute; + using Attribute = LayoutTransformContext::Attribute; + OprList opr_list = { + opr::ConvBiasForward::typeinfo(), + opr::ElemwiseMultiType::typeinfo(), + opr::Elemwise::typeinfo(), + opr::TypeCvt::typeinfo(), + opr::PoolingForward::typeinfo(), + opr::WarpPerspectiveForward::typeinfo(), + }; + SmallVector available_tensor_formats = { + TensorFormats::NCHW, TensorFormats::NHWC, TensorFormats::NCHWc4, + TensorFormats::NCHWc32, TensorFormats::CHWNc4}; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, + ReformatAttribute::AUTO_PADDING_NHWC}; + auto ctx = std::make_unique( + std::move(opr_list), std::move(available_tensor_formats), + attribute); + ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, + OprFormat::NHWC}) + .add_opr_config(opr::PoolingForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, + OprFormat::NHWC, OprFormat::CHWN4}); + auto profiler = 
ProfilerBase::make_profiler(); + std::unique_ptr solver{ + new DynamicProgrammingSolver(std::move(profiler))}; + auto new_output = gopt::GraphOptimizer{} + .add_pass() + .add_pass() + .add_pass(std::move(ctx), + std::move(solver)) + .add_pass() + .add_pass(FuseNCHW4Int8Preprocess::make()) + .add_pass() + .add_pass() + .add_pass() + .apply({{output}}) + .endpoint_vars(); + auto new_out_var = new_output[0]; + /// check global layout transform pass + auto nr_dimshuffle = find_opr_num(new_out_var); + ASSERT_EQ(nr_dimshuffle, 3u); + /// check pass fuse conv bias with z + auto nr_elemwise_mult_type = + find_opr_num(new_out_var); + ASSERT_EQ(nr_elemwise_mult_type, 4u); + /// 21 convolutions, 21 weights and 21 bias, total 42 parameters + const auto& param_merge = + find_opr(new_out_var); + ASSERT_EQ(param_merge.output().size(), 42u); + /// check first conv format + const auto& first_conv = find_opr(new_out_var); + const auto& cast = first_conv.cast_final_safe(); + ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4); + + GraphProfiler gprof{network.graph.get()}; + HostTensorND t2; + auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); + func2->execute(); + gprof.to_json_full(func2.get()) + ->writeto_fpath(output_file("resnet18_qs8.json")); + /// check correct + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + +TEST(TestLayoutTransform, Resnet18_QS4) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + Network network(cn); + auto output = make_resnet18(network, 16, dtype::QuantizedS4{1.f}); using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; S strategy = S::PROFILE; - gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy); + 
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); + + HostTensorND t1; + auto func1 = network.graph->compile({make_callback_copy(output, t1)}); + func1->execute(); using OprFormat = LayoutTransformContext::OprFormat; using OprList = LayoutTransformContext::OprList; @@ -55,74 +178,113 @@ TEST(TestLayoutTransform, Feature) { opr::WarpPerspectiveForward::typeinfo(), }; SmallVector available_tensor_formats = { - TensorFormats::NCHWc4, TensorFormats::NCHWc32, - TensorFormats::CHWNc4}; - Attribute attribute = {OprFormat::NCHW4, TensorFormats::NCHWc4, - ReformatAttribute::DEFAULT}; + TensorFormats::NCHW, TensorFormats::NHWC, + TensorFormats::NCHWc4, TensorFormats::NCHWc32, + TensorFormats::NCHWc64, TensorFormats::CHWNc4}; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, + ReformatAttribute::AUTO_PADDING_NHWC}; auto ctx = std::make_unique( std::move(opr_list), std::move(available_tensor_formats), attribute); ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), - {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4}) + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, + OprFormat::NHWC, OprFormat::NCHW64}) .add_opr_config( opr::PoolingForward::typeinfo(), - {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4}) - .add_opr_config(opr::WarpPerspectiveForward::typeinfo(), - OprFormat::NCHW4); + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, + OprFormat::NHWC, OprFormat::CHWN4}); auto profiler = ProfilerBase::make_profiler(); - auto filter = [](const GraphPartition& partition) { - auto has_nchw4_conv = false; - for (auto&& opr : partition.all_oprs()) { - if (opr->dyn_typeinfo() == opr::ConvBiasForward::typeinfo()) { - auto& conv = opr->cast_final_safe(); - if (conv.param().format == - LayoutTransformContext::OprFormat::NCHW4) { - has_nchw4_conv = true; - break; - } - } - } - return has_nchw4_conv; - }; - std::unique_ptr solver{new DynamicProgrammingSolver( - std::move(profiler), std::move(filter))}; - auto new_out_vars = 
gopt::GraphOptimizer{} - .add_pass() - .add_pass() - .add_pass( - std::move(ctx), std::move(solver)) - .add_pass() - .add_pass(FuseNCHW4Int8Preprocess::make()) - .add_pass() - .add_pass() - .add_pass() - .apply(ret.output_var_list) - .endpoint_vars(); - auto dumper = GraphDumper::make(OutputFile::make_fs("model_opt.mgb")); - dumper->dump({new_out_vars}); + std::unique_ptr solver{ + new DynamicProgrammingSolver(std::move(profiler))}; + auto new_output = gopt::GraphOptimizer{} + .add_pass() + .add_pass() + .add_pass(std::move(ctx), + std::move(solver)) + .add_pass() + .add_pass(FuseNCHW4Int8Preprocess::make()) + .add_pass() + .add_pass() + .add_pass() + .apply({{output}}) + .endpoint_vars(); + auto new_out_var = new_output[0]; + /// check global layout transform pass + auto nr_dimshuffle = find_opr_num(new_out_var); + ASSERT_EQ(nr_dimshuffle, 3u); + /// check pass fuse conv bias with z + auto nr_elemwise_mult_type = + find_opr_num(new_out_var); + ASSERT_EQ(nr_elemwise_mult_type, 4u); + /// 21 convolutions, 21 weights and 21 bias, total 42 parameters + const auto& param_merge = + find_opr(new_out_var); + ASSERT_EQ(param_merge.output().size(), 42u); + /// check first conv format + const auto& first_conv = find_opr(new_out_var); + const auto& cast = first_conv.cast_final_safe(); + ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NHWC); + + GraphProfiler gprof{network.graph.get()}; + HostTensorND t2; + auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); + func2->execute(); + gprof.to_json_full(func2.get()) + ->writeto_fpath(output_file("resnet18_qs4.json")); + MGB_ASSERT_TENSOR_EQ(t1, t2); } -TEST(TestLayoutTransform, Detection) { - auto inp_file = InputFile::make_fs("./det.mdl"); - static const char* magic = "mgbteset0"; - size_t skip_size = sizeof(magic) + sizeof(uint32_t); - char skip[skip_size]; - inp_file->read(skip, skip_size); +TEST(TestLayoutTransform, Resnet18_NCHW64) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + 
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + Network network(cn); + auto output = make_resnet18(network, 64, dtype::QuantizedS4{1.f}); + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); - auto format = GraphLoader::identify_graph_dump_format(*inp_file); - ASSERT_TRUE(format.valid()); - auto loader = GraphLoader::make(std::move(inp_file), format.val()); + HostTensorND t1; + auto func1 = network.graph->compile({make_callback_copy(output, t1)}); + func1->execute(); - GraphLoader::LoadConfig load_config; - load_config.comp_graph = ComputingGraph::make(); - auto&& graph_opt = load_config.comp_graph->options(); - graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity(); - graph_opt.graph_opt.enable_fuse_conv_bias_with_z(); - auto ret = loader->load(load_config, false); + SymbolVar new_out_var; + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw64(); + unpack_vector(gopt::optimize_for_inference({output}, options), new_out_var); + GraphProfiler gprof{network.graph.get()}; + HostTensorND t2; + auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); + func2->execute(); + gprof.to_json_full(func2.get()) + ->writeto_fpath(output_file("resnet18_nchw64.json")); + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + +TEST(TestLayoutTransform, Detection_QS8) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + Network network(cn); + auto outputs = make_det(network, 16, 
dtype::QuantizedS8{1.f}); using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; S strategy = S::PROFILE; - gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy); + gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); using OprFormat = LayoutTransformContext::OprFormat; using OprList = LayoutTransformContext::OprList; @@ -130,8 +292,6 @@ TEST(TestLayoutTransform, Detection) { using Attribute = LayoutTransformContext::Attribute; OprList opr_list = { opr::ConvBiasForward::typeinfo(), - opr::ConvolutionForward::typeinfo(), - opr::ConvolutionBackwardData::typeinfo(), opr::ElemwiseMultiType::typeinfo(), opr::Elemwise::typeinfo(), opr::TypeCvt::typeinfo(), @@ -143,51 +303,228 @@ TEST(TestLayoutTransform, Detection) { TensorFormats::NCHWc4, TensorFormats::NCHWc32, TensorFormats::NCHWc64, TensorFormats::CHWNc4}; Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, - ReformatAttribute::DEFAULT}; + ReformatAttribute::AUTO_PADDING_NHWC}; auto ctx = std::make_unique( std::move(opr_list), std::move(available_tensor_formats), attribute); - ctx->add_opr_config( - opr::ConvBiasForward::typeinfo(), - {OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4, - OprFormat::NCHW32, OprFormat::NCHW64, OprFormat::CHWN4}) - .add_opr_config(opr::ConvolutionForward::typeinfo(), - {OprFormat::NCHW, OprFormat::NCHW4}) - .add_opr_config(opr::ConvolutionBackwardData::typeinfo(), - {OprFormat::NCHW, OprFormat::NCHW4}) + ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, + OprFormat::NHWC, OprFormat::NCHW64}) .add_opr_config( opr::PoolingForward::typeinfo(), - {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC, - OprFormat::NCHW64, OprFormat::CHWN4}) - .add_opr_config( - opr::WarpPerspectiveForward::typeinfo(), - {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64}); + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, + OprFormat::NHWC, OprFormat::CHWN4}); + auto profiler = 
ProfilerBase::make_profiler(); + std::unique_ptr solver{ + new DynamicProgrammingSolver(std::move(profiler))}; + auto new_outputs = gopt::GraphOptimizer{} + .add_pass() + .add_pass() + .add_pass(std::move(ctx), + std::move(solver)) + .add_pass() + .add_pass(FuseNCHW4Int8Preprocess::make()) + .add_pass() + .add_pass() + .add_pass() + .apply({{outputs}}) + .endpoint_vars(); + + GraphProfiler gprof{network.graph.get()}; + using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; + std::vector output_spec; + for (const auto& i : new_outputs) { + output_spec.emplace_back(OutputSpecItem{i, {}}); + } + auto func = network.graph->compile(output_spec); + func->execute(); + gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs8.json")); +} + +TEST(TestLayoutTransform, Detection_QS4) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + Network network(cn); + auto outputs = make_det(network, 16, dtype::QuantizedS4{1.f}); + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); + using OprFormat = LayoutTransformContext::OprFormat; + using OprList = LayoutTransformContext::OprList; + using ReformatAttribute = LayoutTransformContext::ReformatAttribute; + using Attribute = LayoutTransformContext::Attribute; + OprList opr_list = { + opr::ConvBiasForward::typeinfo(), + opr::ElemwiseMultiType::typeinfo(), + opr::Elemwise::typeinfo(), + opr::TypeCvt::typeinfo(), + opr::PoolingForward::typeinfo(), + opr::WarpPerspectiveForward::typeinfo(), + }; + SmallVector available_tensor_formats = { + TensorFormats::NCHW, TensorFormats::NHWC, + TensorFormats::NCHWc4, TensorFormats::NCHWc32, + TensorFormats::NCHWc64, 
TensorFormats::CHWNc4}; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, + ReformatAttribute::AUTO_PADDING_NHWC}; + auto ctx = std::make_unique( + std::move(opr_list), std::move(available_tensor_formats), + attribute); + ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, + OprFormat::NHWC, OprFormat::NCHW64}) + .add_opr_config( + opr::PoolingForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, + OprFormat::NHWC, OprFormat::CHWN4}); auto profiler = ProfilerBase::make_profiler(); std::unique_ptr solver{ new DynamicProgrammingSolver(std::move(profiler))}; - auto new_out_vars = gopt::GraphOptimizer{} - .add_pass( - std::move(ctx), std::move(solver)) - .add_pass() - .add_pass(FuseNCHW4Int8Preprocess::make()) - .add_pass() - .add_pass() - .add_pass() - .apply(ret.output_var_list) - .endpoint_vars(); + auto new_outputs = gopt::GraphOptimizer{} + .add_pass() + .add_pass() + .add_pass(std::move(ctx), + std::move(solver)) + .add_pass() + .add_pass(FuseNCHW4Int8Preprocess::make()) + .add_pass() + .add_pass() + .add_pass() + .apply({{outputs}}) + .endpoint_vars(); + + GraphProfiler gprof{network.graph.get()}; using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; - std::vector outs(new_out_vars.size()); - for (size_t i = 0; i < new_out_vars.size(); ++i) { - auto cb = [](DeviceTensorND& /* d */) {}; - outs[i] = std::make_pair(new_out_vars[i], cb); + std::vector output_spec; + for (const auto& i : new_outputs) { + output_spec.emplace_back(OutputSpecItem{i, {}}); } - GraphProfiler gprof{load_config.comp_graph.get()}; - auto func = load_config.comp_graph->compile(outs); - for (size_t i = 0; i < 10; ++i) - func->execute(); - func->wait(); - gprof.to_json_full(func.get())->writeto_fpath(output_file("det.json")); + auto func = network.graph->compile(output_spec); + func->execute(); + gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs4.json")); +} + +/*! 
+ * test the performance of the solver when network is wide. + */ +TEST(TestLayoutTransform, Wide) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + Network network(cn); + auto data = network.add_var("data", {16, 3, 64, 64}); + auto f = network.add_conv(data, 16, {3, 3}, dtype::Float32(), true, {2, 2}, + {1, 1}); + f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); + f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); + SymbolVarArray stages; + for (size_t i = 0; i < 8; ++i) { + f = f * f + f; + stages.push_back(f); + } + auto y = stages[0]; + for (size_t i = 1; i < stages.size(); ++i) { + y = y + stages[i]; + } + + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({y}, strategy); + + using OprFormat = LayoutTransformContext::OprFormat; + using OprList = LayoutTransformContext::OprList; + using ReformatAttribute = LayoutTransformContext::ReformatAttribute; + using Attribute = LayoutTransformContext::Attribute; + OprList opr_list = { + opr::ConvBiasForward::typeinfo(), + opr::Elemwise::typeinfo(), + }; + SmallVector available_tensor_formats = {TensorFormats::NCHW, + TensorFormats::NHWC}; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, + ReformatAttribute::DEFAULT}; + auto ctx = std::make_unique( + std::move(opr_list), std::move(available_tensor_formats), + attribute); + ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), + {OprFormat::NCHW, OprFormat::NHWC}); + auto profiler = ProfilerBase::make_profiler(); + std::unique_ptr solver{ + new DynamicProgrammingSolver(std::move(profiler))}; + auto v = gopt::GraphOptimizer{} + .add_pass() + .add_pass() + .add_pass(std::move(ctx), + std::move(solver)) + .add_pass() + .add_pass() + .add_pass() + .apply({{y}}) + .endpoint_vars(); + const auto& sym_o = v[0]; + GraphProfiler gprof{network.graph.get()}; + auto func = network.graph->compile({{sym_o, {}}}); + 
func->execute(); + gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json")); + /// check global layout transform pass, no dimshuffle + auto nr_dimshuffle = find_opr_num(sym_o); + ASSERT_EQ(nr_dimshuffle, 0u); + auto nr_param_merge = find_opr_num(sym_o); + ASSERT_EQ(nr_param_merge, 1u); + /// check first conv format + const auto& first_conv = find_opr(sym_o); + const auto& cast = first_conv.cast_final_safe(); + ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW); +} + +TEST(TestLayoutTransform, ElemwiseMultiType) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + Network network(cn); + auto x = network.add_var("x", {64, 64, 1, 2}); + auto y = network.add_var("y", {64, 64, 1, 2}); + x = network.add_type_cvt(x, dtype::QuantizedS4{1.f}); + y = network.add_type_cvt(y, dtype::QuantizedS4{1.f}); + auto x_ = network.add_type_cvt(x, dtype::Float32()); + auto y_ = network.add_type_cvt(y, dtype::Float32()); + auto z = network.add_elemwise({x_, y_}, dtype::Float32(), + opr::Elemwise::Mode::FUSE_ADD_RELU); + z = network.add_type_cvt(z, dtype::QuantizedS4{1.f}); + z = network.add_type_cvt(z, dtype::Float32()); + auto z2 = network.add_elemwise({x, y}, dtype::QuantizedS4{1.f}, + opr::Elemwise::Mode::FUSE_ADD_RELU); + z2 = network.add_type_cvt(z2, dtype::Float32()); + HostTensorND t1; + auto func1 = network.graph->compile({make_callback_copy(z, t1)}); + func1->execute(); + + HostTensorND t3; + auto func3 = network.graph->compile({make_callback_copy(z2, t3)}); + func3->execute(); + + auto alter_x = opr::RelayoutFormat::make( + x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); + auto alter_y = opr::RelayoutFormat::make( + y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); + auto alter_z = + network.add_elemwise({alter_x, alter_y}, dtype::QuantizedS4{1.f}, + opr::Elemwise::Mode::FUSE_ADD_RELU); + alter_z = opr::RelayoutFormat::make( + alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW); + alter_z = network.add_type_cvt(alter_z, 
dtype::Float32()); + HostTensorND t2; + auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)}); + func2->execute(); + // MGB_ASSERT_TENSOR_EQ(t1, t3); + MGB_ASSERT_TENSOR_EQ(t2, t3); } TEST(TestLayoutTransform, DetectionHead) { @@ -196,7 +533,7 @@ TEST(TestLayoutTransform, DetectionHead) { cn.activate(); REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); - constexpr size_t N = 16, C = 3, H = 768, W = 1280; + constexpr size_t N = 16, C = 3, H = 736, W = 1280; HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -284,20 +621,71 @@ TEST(TestLayoutTransform, DetectionHead) { .add_pass() .apply(SymbolVarArray{y}) .endpoint_vars(); + const auto& v = new_out_vars[0]; using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; - std::vector outs(new_out_vars.size()); - for (size_t i = 0; i < new_out_vars.size(); ++i) { - auto cb = [](DeviceTensorND& /* d */) {}; - outs[i] = std::make_pair(new_out_vars[i], cb); + std::vector outs; + for (const auto& i : new_out_vars) { + outs.emplace_back(OutputSpecItem{i, {}}); } GraphProfiler gprof{graph.get()}; auto func = graph->compile(outs); - for (size_t i = 0; i < 10; ++i) - func->execute(); - func->wait(); + func->execute(); gprof.to_json_full(func.get())->writeto_fpath(output_file("det_head.json")); + /// check reformat + auto nr_reformat = find_opr_num(v); + ASSERT_EQ(nr_reformat, 2u); + /// check dimshuffle + auto nr_dimshuffle = find_opr_num(v); + ASSERT_EQ(nr_dimshuffle, 0u); + /// check conv_bias + auto nr_conv = find_opr_num(v); + ASSERT_EQ(nr_conv, 2u); + /// check first conv format + const auto& first_conv = find_opr(v); + const auto& cast = first_conv.cast_final_safe(); + ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4_NHWC); } #endif +TEST(TestLayoutTransform, CanonicalizeLayoutTransform) { + constexpr size_t N = 64, C = 64, H = 1, W = 1; + auto cn = CompNode::load("xpu0"); + Network network(cn); + auto x = network.add_var("x", {N, C / 4, H, W, 4}); + x = network.add_type_cvt(x, 
 dtype::QuantizedS4{1.f}); + using NamedTensorShape = megdnn::NamedTensorShape; + auto src = NamedTensorShape::make_named_tensor_shape( + NamedTensorShape::Format::NCHW4); + auto dst = NamedTensorShape::make_named_tensor_shape( + NamedTensorShape::Format::NHWC); + auto [builder, _] = gopt::ReformatEmitter(src, dst).emit(); + MGB_MARK_USED_VAR(_); + x = SymbolVar(builder({x.node()})); + x = opr::Reshape::make(x, {N, H, W, C}); + x = network.add_type_cvt(x, dtype::Float32()); + + SymbolVar another_x; + unpack_vector(gopt::GraphOptimizer{} + .add_pass() + .apply({{x}}) + .endpoint_vars(), + another_x); + const auto& astype = find_opr(x); + EXPECT_TRUE(astype.input(0)->owner_opr()->dyn_typeinfo() == + opr::Host2DeviceCopy::typeinfo()); + const auto& another_astype = find_opr(another_x); + EXPECT_TRUE(another_astype.input(0)->owner_opr()->dyn_typeinfo() == + opr::Reshape::typeinfo()); + + HostTensorND t1; + auto func1 = network.graph->compile({make_callback_copy(x, t1)}); + func1->execute(); + + HostTensorND t2; + auto func2 = network.graph->compile({make_callback_copy(another_x, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); +} + // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/test/network.cpp b/src/gopt/test/network.cpp new file mode 100644 index 00000000..c070f77b --- /dev/null +++ b/src/gopt/test/network.cpp @@ -0,0 +1,237 @@ +/** + * \file src/gopt/test/network.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "./network.h" + +using namespace mgb; + +SymbolVar Network::add_conv(SymbolVar f, size_t output_channels, + KernSize kern_size, DType out_dtype, bool has_relu, + Stride stride, Padding padding) { + static int weight_idx = 0; + static int bias_idx = 0; + + size_t input_channels = f.node()->shape()[1]; + auto weight = add_cvar( + ssprintf("w%d", weight_idx).c_str(), + {output_channels, input_channels, kern_size[0], kern_size[1]}); + auto bias = add_cvar(ssprintf("b%d", bias_idx).c_str(), + {1, output_channels, 1, 1}); + if (out_dtype.category() == DTypeCategory::QUANTIZED) { + weight = add_type_cvt(weight, out_dtype); + bias = add_type_cvt(bias, dtype::QuantizedS32{1.f}); + } + opr::ConvBias::Param param; + param.stride_h = stride[0], param.stride_w = stride[1]; + param.pad_h = padding[0], param.pad_w = padding[1]; + if (has_relu) { + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + } else { + param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; + } + + auto conv = opr::ConvBias::make(f, weight, bias, param, {}, + OperatorNodeConfig{out_dtype}); + weight_idx++; + bias_idx++; + return conv; +} + +SymbolVar Network::add_deconv(SymbolVar f, size_t ratio, size_t output_channels, + DType out_dtype) { + static int weight_idx = 0; + size_t kernel = ratio * 2 - ratio % 2; + size_t pad = ratio / 2; + + size_t input_channels = f.node()->shape()[1]; + auto weight = add_cvar(ssprintf("w%d", weight_idx).c_str(), + {input_channels, output_channels, kernel, kernel}); + + if (out_dtype.category() == DTypeCategory::QUANTIZED) { + weight = add_type_cvt(weight, out_dtype); + } + opr::ConvolutionBackwardData::Param param; + param.stride_h = param.stride_w = ratio; + param.pad_h = param.pad_w = pad; + + auto deconv = opr::ConvolutionBackwardData::make( + weight, f, param, {}, OperatorNodeConfig{out_dtype}); + weight_idx++; + return deconv; +} + +SymbolVar Network::add_elemwise(const SymbolVarArray inps, DType out_dtype, + 
opr::Elemwise::Param::Mode mode) { + using ElemMode = opr::Elemwise::Param::Mode; + using MultiMode = opr::ElemwiseMultiType::Param::Mode; + static const ThinHashMap map = { + {ElemMode::ADD, MultiMode::QADD}, + {ElemMode::FUSE_ADD_RELU, MultiMode::QFUSE_ADD_RELU}}; + if (out_dtype.category() == DTypeCategory::QUANTIZED) { + MultiMode alter_mode = map.at(mode); + return opr::ElemwiseMultiType::make(inps, {alter_mode}, + OperatorNodeConfig{out_dtype}); + } else { + return opr::Elemwise::make(inps, mode); + } +} + +SymbolVar Network::add_pooling(SymbolVar f, Window window, Stride stride, + Padding padding, + opr::Pooling::Param::Mode mode) { + opr::Pooling::Param param; + param.window_h = window[0], param.window_w = window[1]; + param.stride_h = stride[0], param.stride_w = stride[1]; + param.pad_h = padding[0], param.pad_w = padding[1]; + param.mode = mode; + return opr::Pooling::make(f, param); +} + +SymbolVar Network::add_type_cvt(SymbolVar f, DType out_dtype) { + return opr::TypeCvt::make(f, out_dtype); +} + +SymbolVar mgb::create_block(Network& network, SymbolVar f_in, size_t stride, + size_t num_outputs1, bool has_proj, + DType out_dtype) { + auto proj = f_in; + if (has_proj) { + proj = network.add_conv(f_in, num_outputs1, {1, 1}, out_dtype, false, + {stride, stride}); + } + + auto f = network.add_conv(f_in, num_outputs1, {3, 3}, out_dtype, true, + {stride, stride}, {1, 1}); + + f = network.add_conv(f, num_outputs1, {3, 3}, out_dtype, true, {1, 1}, + {1, 1}); + + f = network.add_elemwise({f, proj}, out_dtype, + opr::Elemwise::Mode::FUSE_ADD_RELU); + return f; +} + +SymbolVar mgb::make_resnet18(Network& network, size_t batch, DType out_dtype) { + auto data = network.add_var("data", {batch, 4, 224, 224}); + if (out_dtype.category() == DTypeCategory::QUANTIZED) + data = network.add_type_cvt(data, dtype::QuantizedS8{1.f}); + auto first = out_dtype; + if (out_dtype.category() == DTypeCategory::QUANTIZED) + first = dtype::QuantizedS8{1.f}; + auto f = 
network.add_conv(data, 64, {7, 7}, first, true, {2, 2}, {3, 3}); + if (out_dtype.enumv() == DTypeEnum::QuantizedS4 || + out_dtype.enumv() == DTypeEnum::Quantized4Asymm) { + f = network.add_type_cvt(f, out_dtype); + } + f = network.add_pooling(f, {3, 3}, {2, 2}, {1, 1}); + + using Vector = SmallVector; + Vector stages = {2, 2, 2, 2}; + Vector mid_outputs = {64, 128, 256, 512}; + Vector enable_stride = {0, 1, 1, 1}; + for (size_t i = 0; i < 4; ++i) { + auto s = stages[i]; + auto o = mid_outputs[i]; + auto es = enable_stride[i]; + for (size_t j = 0; j < s; ++j) { + size_t stride = !es || j > 0 ? 1 : 2; + bool has_proj = j > 0 ? false : true; + f = create_block(network, f, stride, o, has_proj, out_dtype); + } + } + f = network.add_pooling(f, {7, 7}, {7, 7}, {0, 0}, + opr::Pooling::Param::Mode::AVERAGE); + + f = network.add_type_cvt(f, dtype::Float32()); + return f; +} + +namespace { +SymbolVarArray make_pyramids(Network& network, size_t batch, DType out_dtype) { + SymbolVarArray pyramids; + auto data = network.add_var("data", {batch, 3, 256, 256}); + data = data + (-128.f); + if (out_dtype.category() == DTypeCategory::QUANTIZED) + data = network.add_type_cvt(data, dtype::QuantizedS8{1.f}); + auto first = out_dtype; + if (out_dtype.category() == DTypeCategory::QUANTIZED) + first = dtype::QuantizedS8{1.f}; + auto f = network.add_conv(data, 16, {3, 3}, first, true, {2, 2}, {1, 1}); + f = network.add_conv(f, 16, {3, 3}, first, true, {1, 1}, {1, 1}); + f = network.add_conv(f, 32, {3, 3}, first, true, {2, 2}, {1, 1}); + if (out_dtype.enumv() == DTypeEnum::QuantizedS4 || + out_dtype.enumv() == DTypeEnum::Quantized4Asymm) { + f = network.add_type_cvt(f, out_dtype); + } + + using Vector = SmallVector; + Vector stages = {3, 6, 6, 3}; + Vector mid_outputs = {32, 64, 128, 256}; + Vector enable_stride = {0, 1, 1, 1}; + for (size_t i = 0; i < 4; ++i) { + auto s = stages[i]; + auto o = mid_outputs[i]; + auto es = enable_stride[i]; + for (size_t j = 0; j < s; ++j) { + size_t stride = 
!es || j > 0 ? 1 : 2; + bool has_proj = j > 0 ? false : true; + f = create_block(network, f, stride, o, has_proj, out_dtype); + } + pyramids.push_back(f); + } + + for (size_t i = 0; i < pyramids.size(); ++i) { + pyramids[i] = network.add_type_cvt(pyramids[i], first); + } + return pyramids; +} + +SymbolVarArray fusion_pyramids_feature(Network& network, + SymbolVarArray pyramids, + size_t fpn_conv_channels) { + bool touch = false; + SymbolVar x; + SymbolVarArray fpn; + for (int i = 5; i >= 3; --i) { + auto f = network.add_conv(pyramids[i - 2], fpn_conv_channels, {1, 1}, + dtype::QuantizedS8{1.f}, false, {1, 1}, + {0, 0}); + if (!touch) { + x = f; + } else { + x = network.add_deconv(x, 2, 16, dtype::QuantizedS8{1.f}); + x = network.add_elemwise({x, f}, dtype::QuantizedS8{1.f}, + opr::Elemwise::Mode::ADD); + } + fpn.push_back(x); + } + + x = fpn[0]; + for (int i = 6; i < 8; ++i) { + x = network.add_conv(x, fpn_conv_channels, {3, 3}, + dtype::QuantizedS8{1.f}, true, {2, 2}, {1, 1}); + } + return fpn; +} +} // namespace + +SymbolVarArray mgb::make_det(Network& network, size_t batch, DType out_dtype) { + SymbolVarArray outputs; + auto pyramids = make_pyramids(network, batch, out_dtype); + auto fpn_hv = fusion_pyramids_feature(network, pyramids, 16); + auto fpn_plate = fusion_pyramids_feature(network, pyramids, 16); + outputs.insert(outputs.end(), fpn_hv.begin(), fpn_hv.end()); + outputs.insert(outputs.end(), fpn_plate.begin(), fpn_plate.end()); + return outputs; +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/test/network.h b/src/gopt/test/network.h new file mode 100644 index 00000000..ee4364c5 --- /dev/null +++ b/src/gopt/test/network.h @@ -0,0 +1,77 @@ +/** + * \file src/gopt/test/network.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once + +#include "megbrain/test/helper.h" + +#include "megbrain/gopt/framework.h" +#include "megbrain/opr/basic_arith_wrapper.h" +#include "megbrain/opr/blas.h" +#include "megbrain/opr/dnn/convolution.h" +#include "megbrain/opr/dnn/pooling.h" +#include "megbrain/opr/imgproc.h" +#include "megbrain/opr/nn_int.h" +#include "megbrain/opr/tensor_gen.h" +#include "megbrain/opr/tensor_manip.h" +#include "megbrain/opr/utility.h" + +namespace mgb { +class Network { +private: + HostTensorGenerator<> gen; + CompNode cn; + +public: + std::shared_ptr graph = ComputingGraph::make(); + Network(CompNode cn_) : cn{cn_} {} + ~Network() noexcept = default; + using KernSize = SmallVector; + using Stride = SmallVector; + using Padding = SmallVector; + SymbolVar add_var(const char* name, const TensorShape& shp = {1}) { + return opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name); + } + SymbolVar add_cvar(const char* name, const TensorShape& shp = {1}) { + return opr::SharedDeviceTensor::make(*graph, *gen(shp), cn) + .rename(name); + } + + SymbolVar add_conv(SymbolVar f, size_t output_channels, KernSize kern_size, + DType out_dtype = dtype::Float32(), bool has_relu = true, + Stride stride = {1, 1}, Padding padding = {0, 0}); + SymbolVar add_deconv(SymbolVar f, size_t ratio, size_t output_channels, + DType out_dtype); + SymbolVar add_elemwise( + const SymbolVarArray inps, DType out_dtype = dtype::Float32(), + opr::Elemwise::Param::Mode mode = opr::Elemwise::Param::Mode::ADD); + using Window = SmallVector; + SymbolVar add_pooling( + SymbolVar f, Window window, Stride stride = {1, 1}, + Padding padding = {0, 0}, + opr::Pooling::Param::Mode mode = opr::Pooling::Param::Mode::MAX); + SymbolVar add_type_cvt(SymbolVar f, DType out_dtype = 
dtype::Float32()); +}; + +SymbolVar create_block(Network& network, SymbolVar f, size_t stride, + size_t num_outputs1, bool has_proj = false, + DType out_dtype = dtype::Float32()); + +SymbolVar make_resnet18(Network& network, size_t batch = 16, + DType out_dtype = dtype::Float32()); + +SymbolVarArray make_det(Network& network, size_t batch = 16, + DType out_dtype = dtype::Float32()); + +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} -- GitLab