diff --git a/src/gopt/impl/global_layout_transform/layout_transform_context.cpp b/src/gopt/impl/global_layout_transform/layout_transform_context.cpp
index c603a804549d425e90969919cf4f191a5728dbb1..48b98268301c5577dcd7a05e1315fe0652dce673 100644
--- a/src/gopt/impl/global_layout_transform/layout_transform_context.cpp
+++ b/src/gopt/impl/global_layout_transform/layout_transform_context.cpp
@@ -78,13 +78,13 @@ std::unique_ptr<LayoutTransformContext> make_cuda_ctx(
                              OprFormatConfigID::NHWC})
             .add_opr_config(
                     opr::PoolingForward::typeinfo(),
-                    {OprFormatConfigID::NCHW4, OprFormatConfigID::NCHW32,
-                     OprFormatConfigID::NHWC, OprFormatConfigID::NCHW64,
-                     OprFormatConfigID::CHWN4})
+                    {OprFormatConfigID::NCHW, OprFormatConfigID::NCHW4,
+                     OprFormatConfigID::NCHW32, OprFormatConfigID::NHWC,
+                     OprFormatConfigID::NCHW64, OprFormatConfigID::CHWN4})
             .add_opr_config(
                     opr::WarpPerspectiveForward::typeinfo(),
-                    {OprFormatConfigID::NHWC, OprFormatConfigID::NCHW4,
-                     OprFormatConfigID::NCHW64});
+                    {OprFormatConfigID::NCHW, OprFormatConfigID::NHWC,
+                     OprFormatConfigID::NCHW4, OprFormatConfigID::NCHW64});
     return ctx;
 }
 
diff --git a/src/gopt/impl/global_layout_transform/opr_tensor_formats_config.cpp b/src/gopt/impl/global_layout_transform/opr_tensor_formats_config.cpp
index 980795d36a1944bf39af4a5407db1d465bbb7084..9a2809abb732ff67e74ccfa0619bf0aa6b1fa3bf 100644
--- a/src/gopt/impl/global_layout_transform/opr_tensor_formats_config.cpp
+++ b/src/gopt/impl/global_layout_transform/opr_tensor_formats_config.cpp
@@ -191,8 +191,11 @@ struct OprSingleInOutTensorFormatsDispatcherImpl {
         config.typeinfo = opr->dyn_typeinfo();
         config.opr_format = OprFormat::NHWC;
         config.config_id = OprFormatConfigID::NHWC;
-        bool available = opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
+        bool f16_config = DNN_FLOAT16_SELECT(
+                (opr->input(0)->dtype().enumv() == DTypeEnum::Float16), true);
+        bool i4_config = opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                          opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
+        bool available = f16_config || i4_config;
         config.input_dtypes = {opr->input(0)->dtype().enumv()};
         config.input_tensor_types = {TensorType::FEATURE};
         available &= opr->output(0)->dtype().enumv() == opr->input(0)->dtype().enumv();
@@ -275,16 +278,22 @@ struct ConvTensorFormatsDispatcherImpl {
         config.opr_format = OprFormat::NHWC;
         config.config_id = OprFormatConfigID::NHWC;
         auto check_dtype = [](const DType& dt) {
+            bool f16_config =
+                    DNN_FLOAT16_SELECT((dt.enumv() == DTypeEnum::Float16), true);
             bool i4_config = dt.enumv() == DTypeEnum::Quantized4Asymm ||
                              dt.enumv() == DTypeEnum::QuantizedS4;
             bool i8_config = dt.enumv() == DTypeEnum::QuantizedS8;
-            return i4_config || i8_config;
+            return f16_config || i4_config || i8_config;
         };
         bool available = true;
         for (size_t i = 0; i < opr->input().size(); ++i) {
-            if (i == 2)
-                available &= opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS32;
-            else {
+            if (i == 2) {
+                available &=
+                        opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS32 ||
+                        DNN_FLOAT16_SELECT(
+                                opr->input(i)->dtype().enumv() == DTypeEnum::Float16,
+                                true);
+            } else {
                 available &= check_dtype(opr->input(i)->dtype());
             }
             config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
@@ -866,12 +875,18 @@ struct ConvTensorFormatsDispatcherImpl<
         config.config_id = OprFormatConfigID::NHWC;
         bool available = true;
         for (size_t i = 0; i < opr->input().size(); ++i) {
-            available &= opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
+            available &=
+                    opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8 ||
+                    DNN_FLOAT16_SELECT(
+                            opr->input(i)->dtype().enumv() == DTypeEnum::Float16, true);
             config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
             TensorType tensor_type = i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
             config.input_tensor_types.emplace_back(tensor_type);
         }
-        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
+        available &=
+                opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8 ||
+                DNN_FLOAT16_SELECT(
+                        opr->output(0)->dtype().enumv() == DTypeEnum::Float16, true);
         config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
         available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE;
         config.input_tensor_formats = {
@@ -934,6 +949,7 @@ StaticData::StaticData() {
     OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW44_DOT_HYBRID);
 
     OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW);
+    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NHWC);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
     OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW44);
     OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW88);
diff --git a/src/gopt/test/layout_transform_pass.cpp b/src/gopt/test/layout_transform_pass.cpp
index 99eaf33c136a763c6813323a61284224055b853b..aa13177c36007016cee357fb77eec30f20151aab 100644
--- a/src/gopt/test/layout_transform_pass.cpp
+++ b/src/gopt/test/layout_transform_pass.cpp
@@ -29,6 +29,8 @@
 #include "./cache_data.h"
 #endif
 
+#include "megbrain/plugin/opr_io_dump.h"
+
 using namespace mgb;
 using namespace gopt;
 using namespace serialization;
@@ -748,6 +750,95 @@ TEST(TestLayoutTransform, CanonicalizeLayoutTransform) {
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
 
+#if MGB_CUDA
+TEST(TestLayoutTransform, Resnet18_F16) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+    auto sm_ver = prop.major * 10 + prop.minor;
+    if (sm_ver < 70) {
+        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
+               "expected: %d)\n",
+               sm_ver, 70);
+        return;
+    }
+
+    Network network(cn);
+    auto output = make_resnet18(network, 16);
+    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
+    S strategy = S::PROFILE;
+    gopt::modify_opr_algo_strategy_inplace({{output}}, strategy);
+
+    HostTensorND t1;
+    auto func1 = network.graph->compile({make_callback_copy(output, t1)});
+    func1->execute();
+
+    using OprFormatConfigID = LayoutTransformContext::OprFormatConfigID;
+    using OprList = LayoutTransformContext::OprList;
+    using Attribute = LayoutTransformContext::Attribute;
+    using Target = LayoutTransformContext::Target;
+    using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
+    OprList opr_list = {
+            opr::ConvBiasForward::typeinfo(), opr::ElemwiseMultiType::typeinfo(),
+            opr::Elemwise::typeinfo(),        opr::TypeCvt::typeinfo(),
+            opr::PoolingForward::typeinfo(),  opr::WarpPerspectiveForward::typeinfo(),
+    };
+    SmallVector<TensorFormats> available_tensor_formats = {
+            TensorFormats::NCHW, TensorFormats::NHWC};
+    Attribute attribute = {
+            OprFormatConfigID::NCHW, TensorFormats::NCHW, Target::UNSPEC,
+            ReformatAttribute::AUTO_PADDING_NHWC};
+    auto ctx = std::make_unique<LayoutTransformContext>(
+            std::move(opr_list), std::move(available_tensor_formats), attribute);
+    ctx->add_opr_config(
+               opr::ConvBiasForward::typeinfo(),
+               {OprFormatConfigID::NCHW, OprFormatConfigID::NHWC})
+            .add_opr_config(
+                    opr::PoolingForward::typeinfo(),
+                    {OprFormatConfigID::NCHW, OprFormatConfigID::NHWC});
+#if MGB_WITH_CACHED_TEST
+    auto profiler = std::make_unique<ProfilerMock>(
+            static_cast<const uint8_t*>(TestLayoutTransform_Resnet18_F16.data()),
+            TestLayoutTransform_Resnet18_F16.size());
+#else
+    auto profiler = ProfilerBase::make_cached_profiler(
+            "TestLayoutTransform.Resnet18_F16.cache");
+#endif
+    std::unique_ptr<Solver> solver{
+            new DynamicProgrammingSolver(std::move(profiler))};
+    auto new_output =
+            gopt::GraphOptimizer{}
+                    .add_pass(ConvertF32ToF16Pass::make(false))
+                    .add_pass<FuseConvBiasNonlinPass>()
+                    .add_pass<FuseConvBiasZPass>()
+                    .add_pass<LayoutTransformPass>(std::move(ctx), std::move(solver))
+                    .add_pass<ShuffleShuffleRemovePass>()
+                    .add_pass(FuseNCHW4Int8Preprocess::make())
+                    .add_pass<FoldingConvBiasDimshufflePass>()
+                    .add_pass<ParamFusePass>()
+                    .add_pass<ParamMergePass>()
+                    .apply({{output}})
+                    .endpoint_vars();
+    auto new_out_var = new_output[0];
+    /// check global layout transform pass
+    auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var);
+    ASSERT_EQ(nr_dimshuffle, 4u);
+    /// check pass fuse conv bias with z
+    auto nr_elemwise = find_opr_num<opr::Elemwise>(new_out_var);
+    ASSERT_EQ(nr_elemwise, 4u);
+    /// 21 convolutions, 21 weights and 21 bias, total 42 parameters
+    const auto& param_merge = find_opr<opr::MultipleDeviceTensorHolder>(new_out_var);
+    ASSERT_EQ(param_merge.output().size(), 42u);
+
+    GraphProfiler gprof{network.graph.get()};
+    HostTensorND t2;
+    auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)});
+    func2->execute();
+    gprof.to_json_full(func2.get())->writeto_fpath(output_file("resnet18_f16.json"));
+    MGB_ASSERT_TENSOR_NEAR(t1, t2, 1e-3);
+}
+#endif
+
 TEST(TestLayoutTransform, Resnet18_F32) {
     auto cn = CompNode::load("cpu0");
 
@@ -1115,4 +1206,5 @@ TEST(TestLayoutTransform, MobileNetV2_NCHW44_DOT) {
     /// check correct
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/gopt/test/network.cpp b/src/gopt/test/network.cpp
index aa992f709f9d67d63c54666f2b65f4f2c80017e8..8ed27bd038eeab561faecaadedcfdf9ee448d1b8 100644
--- a/src/gopt/test/network.cpp
+++ b/src/gopt/test/network.cpp
@@ -38,8 +38,13 @@ SymbolVar Network::add_conv(
         param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
     }
 
-    auto conv = opr::ConvBias::make(
-            f, weight, bias, param, {}, OperatorNodeConfig{out_dtype});
+    SymbolVar conv;
+    if (out_dtype.category() == DTypeCategory::QUANTIZED) {
+        conv = opr::ConvBias::make(
+                f, weight, bias, param, {}, OperatorNodeConfig{out_dtype});
+    } else {
+        conv = opr::ConvBias::make(f, weight, bias, param, {});
+    }
     weight_idx++;
     bias_idx++;
     return conv;
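
Reviewer note (not part of the patch): below is a minimal, self-contained sketch of the DNN_FLOAT16_SELECT gating pattern this diff introduces for the NHWC configs. The DTypeEnum stub, the MEGDNN_DISABLE_FLOAT16 guard, and the nhwc_dtype_available helper are simplified stand-ins, assuming the MegDNN macro expands to its first argument when float16 support is compiled in and to its second argument otherwise.

    // Illustration only: simplified stand-ins for the MegDNN definitions.
    #include <cstdio>

    enum class DTypeEnum { Float16, Quantized4Asymm, QuantizedS4, QuantizedS8, Float32 };

    #ifndef MEGDNN_DISABLE_FLOAT16
    #define DNN_FLOAT16_SELECT(_x, _y) _x  // fp16 compiled in: evaluate the real check
    #else
    #define DNN_FLOAT16_SELECT(_x, _y) _y  // fp16 compiled out: fall back to the default
    #endif

    // Mirrors the dtype gate added to the NHWC configs in this patch.
    bool nhwc_dtype_available(DTypeEnum dt) {
        bool f16_config = DNN_FLOAT16_SELECT(dt == DTypeEnum::Float16, true);
        bool i4_config =
                dt == DTypeEnum::Quantized4Asymm || dt == DTypeEnum::QuantizedS4;
        bool i8_config = dt == DTypeEnum::QuantizedS8;
        return f16_config || i4_config || i8_config;
    }

    int main() {
        std::printf("Float16     -> %d\n", nhwc_dtype_available(DTypeEnum::Float16));
        std::printf("QuantizedS8 -> %d\n", nhwc_dtype_available(DTypeEnum::QuantizedS8));
        std::printf("Float32     -> %d\n", nhwc_dtype_available(DTypeEnum::Float32));
        return 0;
    }

With float16 enabled, only float16 and the quantized dtypes pass the check; with MEGDN_DISABLE_FLOAT16 spelled correctly as MEGDNN_DISABLE_FLOAT16 defined, f16_config is unconditionally true and the check no longer restricts the dtype, which matches how the patch uses DNN_FLOAT16_SELECT(..., true) as the fallback branch.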