From 3eb29f5e2a381325b934a4a9da3603a010af57a7 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Mon, 29 Jun 2020 12:19:18 +0800
Subject: [PATCH] feat(mgb/opt): add nchw->nchw4 in tensorcore pass

GitOrigin-RevId: 755f8dfefe28bb14dba5d86a54a9bb725af285ed
---
 src/gopt/impl/framework.cpp       |   2 +
 src/gopt/impl/tensor_reformat.cpp |  41 +++---
 src/gopt/test/inference.cpp       | 208 +++++++++++++++++++++++-------
 3 files changed, 191 insertions(+), 60 deletions(-)

diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp
index 4eb74791b..ce0ada788 100644
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -756,6 +756,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
     cb(nchw32, {
         add_pass<FuseConvBiasNonlinPass>();
         add_pass<FuseConvBiasZPass>();
+        add_pass(EnableNCHW4Pass::make_nchw4_converter());
         add_pass(EnableTensorCorePass::make_tensorcore_converter());
         add_pass<ShuffleShuffleRemovePass>();
         add_pass<RemoveRedundantTypeCvtPass>();
@@ -763,6 +764,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
     cb(chwn4, {
         add_pass<FuseConvBiasNonlinPass>();
         add_pass<FuseConvBiasZPass>();
+        add_pass(EnableNCHW4Pass::make_nchw4_converter());
        add_pass(EnableCHWN4Pass::make_chwn4_converter());
         add_pass<ShuffleShuffleRemovePass>();
         add_pass<RemoveRedundantTypeCvtPass>();
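
Both pipelines now run the NCHW4 converter before their format-specific converters, so a graph built in plain NCHW is lowered step by step (NCHW -> NCHW4 -> NCHW32 or CHWN4). A minimal usage sketch, distilled from the tests later in this patch (it assumes `y` is a quantized endpoint SymbolVar built as in those tests):

    // Enabling nchw32 now implies the NCHW -> NCHW4 conversion, so the
    // network no longer has to be constructed in NCHW4 beforehand.
    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
    SymbolVar y_opt;
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
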
NCHW4"); // what should be converted: src, weight @@ -1494,9 +1503,12 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter(){ const VarNodeArray& new_inp) { mgb_assert(opr->input().size() == new_inp.size()); auto& conv_bias_opr = opr->cast_final_safe(); - mgb_assert(conv_bias_opr.param().format == - megdnn::param::ConvBias::Format::NCHW, - "ConvertFormat Pass only support converting NCHW to NCHW4"); + if (conv_bias_opr.param().format != + megdnn::param::Convolution::Format::NCHW) { + return serialization::copy_opr_shallow(*opr, new_inp, + opr->config()); + } + // what should be converted: src, weight VarNode *conv_bias_src = new_inp[0], *conv_bias_filter = new_inp[1]; // src: NCHW --> NCHW4 @@ -1604,8 +1616,9 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter(){ using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); auto& pooling = opr->cast_final_safe(); - mgb_assert(pooling.param().format == Format::NCHW, - "ConvertFormat Pass only support converting NCHW to NCHW4."); + if (pooling.param().format != Format::NCHW) { + return opr; + } if (new_inp[0]->shape().ndim == 5) { mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8); auto new_param = pooling.param(); @@ -1628,8 +1641,6 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter(){ using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); auto& resize = opr->cast_final_safe(); - mgb_assert(resize.param().format == Format::NCHW, - "ConvertFormat Pass only support converting NCHW to NCHW4."); if (new_inp[0]->shape().ndim == 5) { mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8); auto new_param = resize.param(); @@ -1652,8 +1663,6 @@ std::unique_ptr EnableNCHW4Pass::make_nchw4_converter(){ using Format = Param::Format; mgb_assert(opr->input().size() == new_inp.size()); auto& warp = opr->cast_final_safe(); - mgb_assert(warp.param().format == Format::NCHW, - "ConvertFormat Pass only support converting NCHW to NCHW4."); if (new_inp[0]->shape().ndim == 5) { mgb_assert(new_inp[0]->dtype().enumv() == DTypeEnum::QuantizedS8); auto new_param = warp.param(); diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 8f06f07a0..5d7159850 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -1512,6 +1512,7 @@ TEST_PASS(FuseConvBiasNonlinPass, Basic) { #if MGB_CUDA + TEST(TestEnableTensorCore, SmallInputShape) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); @@ -1579,6 +1580,104 @@ TEST(TestEnableTensorCore, SmallInputShape) { MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); } + +TEST(TestEnableTensorCore, Nchw4Nchw) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver < 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + + HostTensorGenerator gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + + auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C, + size_t 
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index 8f06f07a0..5d7159850 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1512,6 +1512,7 @@ TEST_PASS(FuseConvBiasNonlinPass, Basic) {
 
 #if MGB_CUDA
+
 TEST(TestEnableTensorCore, SmallInputShape) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1579,6 +1580,104 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
+
+TEST(TestEnableTensorCore, Nchw4Nchw) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    cn.activate();
+    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+    auto sm_ver = prop.major * 10 + prop.minor;
+    if (sm_ver < 75) {
+        printf("This testcase ignored due to insufficient cuda cap (got: %d, "
+               "expected: %d)\n",
+               sm_ver, 75);
+        return;
+    }
+
+    HostTensorGenerator<dtype::Int8> gen;
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    auto mkvar = [&](const char* name, const TensorShape& shp,
+                     const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
+                dtype);
+    };
+    auto mkcvar = [&](const char* name, const TensorShape& shp,
+                      const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
+                        .rename(name),
+                dtype);
+    };
+
+    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
+                      size_t H, size_t W) -> TensorShape {
+        mgb_assert(C % 4 == 0);
+        if (format == opr::ConvBias::Param::Format::NCHW4) {
+            return {N, C / 4, H, W, 4};
+        } else {
+            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
+            return {N, C, H, W};
+        }
+    };
+
+    for (auto format : {opr::ConvBias::Param::Format::NCHW,
+                        opr::ConvBias::Param::Format::NCHW4}) {
+        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
+                       dtype::QuantizedS8(2.5f)),
+             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
+                        dtype::QuantizedS8(2.5f)),
+             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
+                        dtype::QuantizedS32(6.25f)),
+             z = mkcvar("b1", mkshape(format, 32, 64, 8, 8),
+                        dtype::QuantizedS8(2.5f));
+        opr::ConvBias::Param param;
+        param.format = format;
+        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
+        param.stride_h = param.stride_w = 2;
+        param.pad_h = param.pad_w = 1;
+
+        auto y = opr::ConvBias::make(
+                x, w, b, z, param, {},
+                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
+        y = opr::ConvBias::make(y, w, b, param, {},
+                                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
+        y = opr::TypeCvt::make(y, dtype::Float32());
+
+        SymbolVar y_opt;
+        SymbolVar y_no_tc;
+        {
+            auto options = gopt::OptimizeForInferenceOptions{};
+            options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
+            unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+        }
+        {
+            auto options = gopt::OptimizeForInferenceOptions{};
+            options.enable_fuse_conv_bias_nonlinearity();
+            unpack_vector(gopt::optimize_for_inference({y}, options),
+                          y_no_tc);
+        }
+        auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y_opt);
+        std::string json_name;
+        ASSERT_EQ(2u, nr_dimshuffle);
+        if (format == opr::ConvBias::Param::Format::NCHW4) {
+            json_name = "TestGoptInference.Nchw4Nchw.NCHW4.json";
+        } else {
+            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
+            json_name = "TestGoptInference.Nchw4Nchw.NCHW.json";
+        }
+
+        graph->compile({{y_opt, {}}})
+                ->to_json()
+                ->writeto_fpath(output_file(json_name.c_str()));
+        HostTensorND host_y, host_y_opt;
+        auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
+                                    make_callback_copy(y_opt, host_y_opt)});
+        func->execute();
+        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
+    }
+}
+
 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
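
A note on the quantized dtypes threaded through these tests: QuantizedS8(s) stores an int8 value q representing the real number s * q, and the bias/accumulator scale 6.25f is the product of the input and filter scales (2.5f * 2.5f), the usual convention that lets a quantized convolution accumulate in plain int32. A minimal standalone illustration (plain C++, not the MegEngine dtype machinery):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const float x_scale = 2.5f, w_scale = 2.5f;
        const float acc_scale = x_scale * w_scale;  // 6.25f, as in QuantizedS32(6.25f)
        int8_t x = 3, w = 2;                        // quantized input / weight
        int32_t acc = int32_t(x) * int32_t(w);      // integer multiply-accumulate
        // both sides dequantize to the same real value: 7.5 * 5.0 == 6 * 6.25
        std::printf("%g == %g\n", (x * x_scale) * (w * w_scale), acc * acc_scale);
        return 0;
    }
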
@@ -2043,53 +2142,74 @@ TEST(TestGoptInference, EnableCHWN4) {
                         .rename(name),
                 dtype);
     };
+    auto mkshape = [](opr::ConvBias::Param::Format format, size_t N, size_t C,
+                      size_t H, size_t W) -> TensorShape {
+        mgb_assert(C % 4 == 0);
+        if (format == opr::ConvBias::Param::Format::NCHW4) {
+            return {N, C / 4, H, W, 4};
+        } else {
+            mgb_assert(format == opr::ConvBias::Param::Format::NCHW);
+            return {N, C, H, W};
+        }
+    };
 
-    auto x = mkvar("x", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f)),
-         w = mkcvar("w1", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
-         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
-         b1 = mkvar("b1", {32, 16, 16, 16, 4}, dtype::QuantizedS8(2.5f));
-    opr::ConvBias::Param param;
-    param.format = opr::ConvBias::Param::Format::NCHW4;
-    param.stride_h = param.stride_w = 1;
-    param.pad_h = param.pad_w = 1;
-    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
+    for (auto format : {opr::ConvBias::Param::Format::NCHW,
+                        opr::ConvBias::Param::Format::NCHW4}) {
+        auto x = mkvar("x", mkshape(format, 32, 64, 16, 16),
+                       dtype::QuantizedS8(2.5f)),
+             w = mkcvar("w1", mkshape(format, 64, 64, 3, 3),
+                        dtype::QuantizedS8(2.5f)),
+             b = mkcvar("b", mkshape(format, 1, 64, 1, 1),
+                        dtype::QuantizedS32(6.25f)),
+             b1 = mkvar("b1", mkshape(format, 32, 64, 16, 16),
+                        dtype::QuantizedS8(2.5f));
+        opr::ConvBias::Param param;
+        param.format = format;
+        param.stride_h = param.stride_w = 1;
+        param.pad_h = param.pad_w = 1;
+        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
 
-    auto y = opr::ConvBiasForward::make(
-            x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    auto y1 = opr::ElemwiseMultiType::make(
-            {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
-            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    auto y2 = opr::ConvBiasForward::make(
-            y, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    auto y3 = opr::ElemwiseMultiType::make(
-            {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
-            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    auto y4 = opr::ElemwiseMultiType::make(
-            {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
-            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    y4 = opr::ElemwiseMultiType::make(
-            {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
-            OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
-    y4 = opr::TypeCvt::make(y4, dtype::Float32());
-    SymbolVar y_opt;
-    SymbolVar y_cudnn;
-    {
-        auto options = gopt::OptimizeForInferenceOptions{};
-        options.enable_chwn4();
-        unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
+        auto y = opr::ConvBiasForward::make(
+                x, w, b, param, {},
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        auto y1 = opr::ElemwiseMultiType::make(
+                {y, b1}, opr::ElemwiseMultiType::Mode::QFUSE_ADD_RELU,
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        auto y2 = opr::ConvBiasForward::make(
+                y, w, b, param, {},
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        auto y3 = opr::ElemwiseMultiType::make(
+                {y, b1}, opr::ElemwiseMultiType::Param::Mode::QSUB,
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        auto y4 = opr::ElemwiseMultiType::make(
+                {y1, y2}, opr::ElemwiseMultiType::Param::Mode::QADD,
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        y4 = opr::ElemwiseMultiType::make(
+                {y3, y4}, opr::ElemwiseMultiType::Param::Mode::QADD,
+                OperatorNodeConfig{dtype::QuantizedS8{2.5f}});
+        y4 = opr::TypeCvt::make(y4, dtype::Float32());
+        SymbolVar y_opt;
+        SymbolVar y_cudnn;
+        {
+            auto options = gopt::OptimizeForInferenceOptions{};
+            options.enable_chwn4();
+            unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
+        }
+        unpack_vector(gopt::GraphOptimizer{}
+                              .add_pass<gopt::FuseConvBiasNonlinPass>()
+                              .add_pass<gopt::FuseConvBiasZPass>()
+                              .apply({{y4}})
+                              .endpoint_vars(),
+                      y_cudnn);
+
+        ASSERT_EQ(opr::ConvBias::Param::Format::CHWN4,
+                  find_opr<opr::ConvBias>(y_opt).param().format);
+        HostTensorND host_y, host_y_opt;
+        auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
+                                    make_callback_copy(y_opt, host_y_opt)});
+        func->execute();
+        MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
-    unpack_vector(gopt::GraphOptimizer{}
-                          .add_pass<gopt::FuseConvBiasNonlinPass>()
-                          .add_pass<gopt::FuseConvBiasZPass>()
-                          .apply({{y4}})
-                          .endpoint_vars(),
-                  y_cudnn);
-
-    HostTensorND host_y, host_y_opt;
-    auto func = graph->compile({make_callback_copy(y_cudnn, host_y),
-                                make_callback_copy(y_opt, host_y_opt)});
-    func->execute();
-    MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
 TEST(TestGoptInference, EnableCHWN4WarpPespective) {
-- 
GitLab