Commit 1fb7d34f authored by Megvii Engine Team

refactor(gopt): refactor layout transform for optimize for inference

GitOrigin-RevId: df4e8866d3a5032fea1712b830498c18bb6e57c3
Parent 50d285fc
@@ -539,7 +539,7 @@ def optimize_for_inference(
     f16_io_comp=False,
     use_nhwcd4=False,
     fuse_conv_bias_nonlinearity=False,
-    use_tensor_core=False,
+    use_nchw32=False,
     fuse_conv_bias_with_z=False,
     use_nchw88=False,
     use_nchw44=False
@@ -564,6 +564,8 @@ def optimize_for_inference(
         times.
     :param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some
         times.
+    :param use_nchw32: whether to use NCHW32 tensor format. Mainly used for
+        nvidia tensorcore.
     :return: list of transformed vars corresponding to given output vars
@@ -575,15 +577,28 @@ def optimize_for_inference(
     for i in [
         "f16_io_f32_comp",
         "f16_io_comp",
-        "use_nhwcd4",
         "fuse_conv_bias_nonlinearity",
-        "use_tensor_core",
         "fuse_conv_bias_with_z",
-        "use_nchw88",
-        "use_nchw44",
     ]:
         if settings[i]:
             getattr(opt, "enable_{}".format(i))()
+    layout_tranform = None
+    for k, v in {
+        "use_nhwcd4": "nchw2nhwcd4",
+        "use_nchw32": "nchw2nchw32",
+        "use_nchw88": "nchw2nchw88",
+        "use_nchw44": "nchw2nchw44",
+    }.items():
+        if settings[k]:
+            assert (
+                not layout_tranform
+            ), "Only one layout transform supported, both {} and {}".format(
+                layout_tranform, k
+            )
+            getattr(opt, "enable_{}".format(v))()
+            layout_tranform = k
     vec = _detail._VectorSymbolVar()
     for i in output_vars:
         assert isinstance(i, _detail.SymbolVar), "bad var: {}".format(i)
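On the C++ side (see the `OptimizeOptions` struct added later in this commit) the layout transform is a single enum field, so a second `enable_nchw2*` call would silently overwrite the first; the assert in this Python loop is what surfaces conflicting `use_*` flags to the caller. A minimal sketch of the overwrite behavior, assuming the `gopt::OptimizeOptions` interface declared below:

```cpp
gopt::OptimizeOptions options;
options.enable_nchw2nchw88();  // selects LayoutTransform::NCHW2NCHW88
options.enable_nchw2nchw32();  // silently replaces it with NCHW2NCHW32
// options.transform_nchw2nchw88() -> false
// options.transform_nchw2nchw32() -> true
```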
......
@@ -71,15 +71,19 @@ class _PersistentCache {
 };
 struct _OptimizeForInferenceOptions {
-#define SET(n) void enable_##n()
+#define SET(n) void enable_##n();
     SET(f16_io_f32_comp);
     SET(f16_io_comp);
     SET(fuse_conv_bias_nonlinearity);
-    SET(use_nhwcd4);
-    SET(use_tensor_core);
-    SET(fuse_conv_bias_with_z);
-    SET(use_nchw88);
-    SET(use_nchw44);
+    SET(fuse_conv_bias_with_z);
+#undef SET
+#define SET(_trans, _trans_capital) \
+    void enable_##_trans();
+
+    SET(nchw2nhwcd4, NCHW2NHWCD4);
+    SET(nchw2nchw88, NCHW2NCHW88);
+    SET(nchw2nchw44, NCHW2NCHW44);
+    SET(nchw2nchw32, NCHW2NCHW32);
 #undef SET
 };
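The two-argument macro in this wrapper header only declares the entry points (their definitions live in the corresponding .cpp); the four invocations expand mechanically to:

```cpp
void enable_nchw2nhwcd4();
void enable_nchw2nchw88();
void enable_nchw2nchw44();
void enable_nchw2nchw32();
```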
......
@@ -255,7 +255,7 @@ def optimize_for_inference(args, outputs):
         'enable_nchw88': 'use_nchw88',
         'enable_nchw44': 'use_nchw44',
         'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity',
-        'enable_tensorcore': 'use_tensor_core',
+        'enable_nchw32': 'use_nchw32',
         'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z',
     }
     kwargs = {}
@@ -393,7 +393,7 @@ def main():
         'for inference'
     )
     parser.add_argument(
-        '--enable-tensorcore',
+        '--enable-nchw32',
         action='store_true',
         help='transform the model format from NCHW4 to NCHW32 '
        'for inference on nvidia TensoCore'
......
@@ -642,21 +642,6 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
     add_pass<ArithMulDistributePass>();
     add_pass<ReorderArithChainPass>(cv_type);
-    if (inference_opt) {
-        if (inference_opt->use_nhwcd4) {
-            add_pass(ConvertFormatPass::make_nhwcd4_converter());
-        }
-        if (inference_opt->f16_io_f32_comp) {
-            add_pass(ConvertF32ToF16Pass::make(true));
-        }
-        if (inference_opt->f16_io_comp) {
-            add_pass(ConvertF32ToF16Pass::make(false));
-        }
-        // fuse again after reordering
-        add_pass<ParamFusePass>();
-    }
     add_pass<ArithFusePass>();
     // reorder again because shapes of fused oprs might change
     add_pass<ReorderArithChainPass>(cv_type);
@@ -687,32 +672,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
     }
 #endif
-    if (inference_opt) {
-        if (inference_opt->fuse_conv_bias_nonlinearity)
-            add_pass<FuseConvBiasNonlinPass>();
-        if (inference_opt->fuse_conv_bias_with_z) {
-            mgb_assert(inference_opt->fuse_conv_bias_nonlinearity,
-                       "fuse conv bias with z input should fuse conv bias "
-                       "activation "
-                       "first");
-            add_pass<FuseConvBiasZPass>();
-        }
-        if (inference_opt->use_nchw88) {
-            add_pass(EnableNchwxxPass::make_nchwxx_converter(8));
-        }
-        if (inference_opt->use_nchw44) {
-            add_pass(EnableNchwxxPass::make_nchwxx_converter(4));
-        }
-        if (inference_opt->use_tensor_core) {
-            mgb_assert(inference_opt->fuse_conv_bias_nonlinearity,
-                       "enable tensor core should fuse conv bias activation "
-                       "first");
-            add_pass(EnableTensorCorePass::make_tensorcore_converter());
-            add_pass<ShuffleShuffleRemovePass>();
-            add_pass<RemoveRedundantTypeCvtPass>();
-        }
-        add_pass<ParamFusePass>();
-    }
+    apply_optimize_options(inference_opt);
     if (inference_opt) {
         // merge params to reduce loading time and graph overhead
@@ -739,6 +699,42 @@ VarNode* GraphOptimizer::var_replace_lookup(VarNode *var) {
     }
 }
+void GraphOptimizer::apply_optimize_options(
+        const OptimizeOptions* options) {
+    if (!options) return;
+    if (options->f16_io_comp) {
+        add_pass(ConvertF32ToF16Pass::make(false));
+    }
+    if (options->f16_io_f32_comp) {
+        add_pass(ConvertF32ToF16Pass::make(true));
+    }
+    if (options->transform_nchw2nhwcd4()) {
+        add_pass(ConvertFormatPass::make_nhwcd4_converter());
+        add_pass<FuseConvBiasNonlinPass>();
+    }
+    if (options->transform_nchw2nchw88()) {
+        add_pass(EnableNchwxxPass::make_nchwxx_converter(8));
+    }
+    if (options->transform_nchw2nchw44()) {
+        add_pass(EnableNchwxxPass::make_nchwxx_converter(4));
+    }
+    if (options->transform_nchw2nchw32()) {
+        add_pass<FuseConvBiasNonlinPass>();
+        add_pass(EnableTensorCorePass::make_tensorcore_converter());
+        add_pass<ShuffleShuffleRemovePass>();
+        add_pass<RemoveRedundantTypeCvtPass>();
+    }
+    if (options->fuse_conv_bias_nonlinearity) {
+        add_pass<FuseConvBiasNonlinPass>();
+    }
+    if (options->fuse_conv_bias_with_z) {
+        add_pass<FuseConvBiasNonlinPass>();
+        add_pass<FuseConvBiasZPass>();
+    }
+    add_pass<ParamFusePass>();
+}
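With the consolidation, the pass order is fixed in one place: float16 conversion first, then the (at most one) layout transform together with the fusions it needs, then the explicitly requested fusions, and a final `ParamFusePass`. As an illustrative trace (derived by reading the branches above, not part of the commit), enabling both the conv-bias fusion and the NCHW32 transform queues:

```cpp
gopt::OptimizeOptions options;
options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
// apply_optimize_options(&options) adds, in order:
//   FuseConvBiasNonlinPass         (needed before the tensor-core pass)
//   EnableTensorCorePass           (NCHW4 -> NCHW32 layout conversion)
//   ShuffleShuffleRemovePass
//   RemoveRedundantTypeCvtPass
//   FuseConvBiasNonlinPass         (the explicitly requested fusion)
//   ParamFusePass                  (always appended last)
```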
 /* ================ ConstVarPropogateBase ================ */
 ConstVarPropogateBase::AddOprResult ConstVarPropogateBase::add_opr(
......
@@ -1770,7 +1770,7 @@ public:
         return reformat.node();
     };
     m_reformat[std::make_pair(TensorFormat::CHWN4, TensorFormat::NCHW4)] =
             [](VarNode* inp) -> VarNode* {
         megdnn::param::RelayoutFormat param;
......
@@ -377,6 +377,57 @@ namespace gopt {
         RecursiveSubGraphRewriteHelper(OptState &state);
     };
+    /**
+     * \brief common optimize options; used both for optimize-for-inference
+     * at graph dump time and for graph optimization at runtime.
+     */
+    struct OptimizeOptions {
+        //! whether to enable IO in float16 compute in float32
+        bool f16_io_f32_comp = false;
+        //! whether to enable transform to pure float16 model
+        bool f16_io_comp = false;
+        //! whether to enable conv bias nonlinearity fusion
+        bool fuse_conv_bias_nonlinearity = false;
+        enum LayoutTransform : uint32_t {
+            DEFAULT,
+            NCHW2NHWCD4,  ///< compute using NHWCD4 tensor format
+            NCHW2NCHW88,  ///< compute using NCHW88 tensor format
+            NCHW2NCHW44,  ///< compute using NCHW44 tensor format
+            NCHW2NCHW32,  ///< compute using NCHW32 tensor format, used for
+                          ///< tensorcore
+        };
+        LayoutTransform layout_transform = LayoutTransform::DEFAULT;
+        //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b)
+        //! + z -> conv_bias(x, w, b, z)
+        bool fuse_conv_bias_with_z = false;
+#define SET(n)                          \
+        OptimizeOptions& enable_##n() { \
+            n = true;                   \
+            return *this;               \
+        }
+        SET(f16_io_f32_comp);
+        SET(f16_io_comp);
+        SET(fuse_conv_bias_nonlinearity);
+        SET(fuse_conv_bias_with_z);
+#undef SET
+#define SET(_trans, _trans_capital)                                      \
+        OptimizeOptions& enable_##_trans() {                             \
+            layout_transform = LayoutTransform::_trans_capital;          \
+            return *this;                                                \
+        }                                                                \
+        bool transform_##_trans() const {                                \
+            return layout_transform == LayoutTransform::_trans_capital;  \
+        }
+        SET(nchw2nhwcd4, NCHW2NHWCD4);
+        SET(nchw2nchw88, NCHW2NCHW88);
+        SET(nchw2nchw44, NCHW2NCHW44);
+        SET(nchw2nchw32, NCHW2NCHW32);
+#undef SET
+    };
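Each `SET(_trans, _trans_capital)` invocation expands to a chainable setter plus a query predicate; for example, `SET(nchw2nchw32, NCHW2NCHW32)` yields:

```cpp
OptimizeOptions& enable_nchw2nchw32() {
    layout_transform = LayoutTransform::NCHW2NCHW32;
    return *this;
}
bool transform_nchw2nchw32() const {
    return layout_transform == LayoutTransform::NCHW2NCHW32;
}
```

Since all four setters assign the same `layout_transform` field, at most one layout transform can be in effect at a time, which is exactly the invariant the Python wrapper's assert reports on.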
     /*!
      * \brief manage passes and their applying on graphs
      *
@@ -465,6 +516,11 @@ namespace gopt {
             * var_replace_map(var->owner_graph()) corresponding to var
             */
            static VarNode* var_replace_lookup(VarNode *var);
+
+            /**
+             * \brief apply optimize options
+             */
+            void apply_optimize_options(const OptimizeOptions* options);
     };
     /*!
......
@@ -256,40 +256,7 @@ namespace gopt {
             size_t pack_c_size);
     };
-    struct OptimizeForInferenceOptions {
-        //! whether to enable IO in float16 compute in float32
-        bool f16_io_f32_comp = false;
-        //! whether to enable tranform to pure float16 model
-        bool f16_io_comp = false;
-        //! whether to enable conv bias nonlinearity fusion
-        bool fuse_conv_bias_nonlinearity = false;
-        //! whether to compute using NHWCD4 tensor format
-        bool use_nhwcd4 = false;
-        //! whether to compute using NCHW88 tensor format
-        bool use_nchw88 = false;
-        //! whether to compute using NCHW44 tensor format
-        bool use_nchw44 = false;
-        //! whether to enable tensor core
-        bool use_tensor_core = false;
-        //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b)
-        //! + z -> conv_bias(x, w, b, z)
-        bool fuse_conv_bias_with_z = false;
-#define SET(n)                                      \
-        OptimizeForInferenceOptions& enable_##n() { \
-            n = true;                               \
-            return *this;                           \
-        }
-        SET(f16_io_f32_comp);
-        SET(f16_io_comp);
-        SET(fuse_conv_bias_nonlinearity);
-        SET(use_nhwcd4);
-        SET(use_tensor_core);
-        SET(fuse_conv_bias_with_z);
-        SET(use_nchw88);
-        SET(use_nchw44);
-#undef SET
-    };
+    struct OptimizeForInferenceOptions : OptimizeOptions {};
     /*!
      * \brief optimize a computing graph for inference
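`OptimizeForInferenceOptions` thus becomes an empty subclass that re-exports the shared options; existing call sites keep working, with the chainable setters now inherited from the base. A minimal usage sketch matching the updated tests:

```cpp
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_f16_io_f32_comp()  // inherited from OptimizeOptions
        .enable_nchw2nhwcd4();    // replaces the old enable_use_nhwcd4()
```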
......
@@ -635,10 +635,9 @@ TEST(TestGoptInference, Float16IOFloat32Compute) {
     y = opr::Concat::make({y, -y}, 0);
     y = opr::Reduce::make(y, {}, y.make_scalar(1));
     SymbolVar y_opt;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_f16_io_f32_comp()),
-                  y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_f16_io_f32_comp();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(y_opt.dtype(), dtype::Float32());
     HostTensorND host_y, host_y_opt;
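The mechanical change repeated across the tests below follows from the refactor: the `enable_*` setters are now defined on the base `OptimizeOptions` and return `OptimizeOptions&`, so chaining on an `OptimizeForInferenceOptions` temporary inline in the call would (presumably) no longer produce the derived type that `optimize_for_inference` expects. Naming the options object first sidesteps that:

```cpp
// New idiom used throughout the updated tests:
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_f16_io_f32_comp();  // chainable; returns OptimizeOptions&
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
```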
@@ -683,10 +682,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) {
     TensorShape out_shp{20, 20};
     auto y = opr::WarpPerspective::make(a, mat, out_shp);
     SymbolVar y_opt;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_f16_io_f32_comp()),
-                  y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_f16_io_f32_comp();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(y_opt.dtype(), dtype::Float32());
     HostTensorND host_y, host_y_opt;
     auto func = graph->compile({make_callback_copy(y, host_y),
@@ -723,10 +721,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeRemap) {
     auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map");
     auto y = opr::Remap::make(a, map);
     SymbolVar y_opt;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_f16_io_f32_comp()),
-                  y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_f16_io_f32_comp();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(y_opt.dtype(), dtype::Float32());
     HostTensorND host_y, host_y_opt;
     auto func = graph->compile({make_callback_copy(y, host_y),
@@ -770,10 +767,9 @@ TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) {
     TensorShape out_shp{20, 20};
     auto y = opr::WarpPerspective::make(a, mat, out_shp);
     SymbolVar y_opt;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_f16_io_comp()),
-                  y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_f16_io_comp();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(y_opt.dtype(), dtype::Uint8());
     HostTensorND host_y, host_y_opt;
     auto func = graph->compile({make_callback_copy(y, host_y),
@@ -801,10 +797,9 @@ TEST(TestGoptInference, Float32TOFloat16) {
         y = opr::Reduce::make(y, {}, y.make_scalar(1));
         SymbolVar y_opt;
-        unpack_vector(gopt::optimize_for_inference(
-                              {y}, gopt::OptimizeForInferenceOptions{}
-                                           .enable_f16_io_comp()),
-                      y_opt);
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_f16_io_comp();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
         return y_opt;
     };
@@ -857,10 +852,9 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
         auto y = d0 + b;
         SymbolVar y_opt;
-        unpack_vector(gopt::optimize_for_inference(
-                              {y}, gopt::OptimizeForInferenceOptions{}
-                                           .enable_f16_io_comp()),
-                      y_opt);
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_f16_io_comp();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
         return y_opt;
     };
@@ -897,7 +891,7 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) {
 TEST(TestGoptInference, Float32TOFloat16Linspace) {
     CompNode cn = CompNode::load("cpu0");
     HostTensorGenerator<> gen(0, 1, 0);
     auto host_x = gen({3, 1}, cn);
     auto graph = ComputingGraph::make();
     auto make_f32_to_f16_graph = [&]() {
@@ -916,10 +910,9 @@ TEST(TestGoptInference, Float32TOFloat16Linspace) {
         auto mm = opr::MatrixMul::make(x, y);
         SymbolVar mm_opt;
-        unpack_vector(gopt::optimize_for_inference(
-                              {mm}, gopt::OptimizeForInferenceOptions{}
-                                            .enable_f16_io_comp()),
-                      mm_opt);
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_f16_io_comp();
+        unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt);
         return mm_opt;
     };
@@ -998,11 +991,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4) {
     y = opr::Convolution::make(elem, w2, param);
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
               find_opr<opr::Convolution>(y_opt).param().format);
@@ -1059,11 +1050,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) {
     y = opr::Convolution::make(group_local, w5, param);
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4,
               find_opr<opr::Convolution>(y_opt).param().format);
@@ -1112,11 +1101,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) {
     y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {});
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(opr::Convolution::Param::Format::NCHW,
               find_opr<opr::ConvolutionBackwardData>(y_opt).param().format);
@@ -1159,11 +1146,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) {
             OperatorNodeConfig{dtype::QuantizedS8(0.2f)});
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4,
               find_opr<opr::ConvBias>(y_opt).param().format);
@@ -1213,11 +1198,9 @@ TEST(TestGoptInference, ConvertFormatPadIC) {
     auto w1 = mkcvar("w1", {12, 12, 3, 3});
     auto y = opr::Convolution::make(concat, w1, param);
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     HostTensorND host_y_opt, host_y;
     auto func = graph->compile({make_callback_copy(y, host_y),
@@ -1301,11 +1284,9 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass) {
             opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU),
     y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp;
     SymbolVar y_opt;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_use_nhwcd4()
-                                       .enable_fuse_conv_bias_nonlinearity()),
-                  y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nhwcd4().enable_fuse_conv_bias_nonlinearity();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(3u, find_opr<opr::ConvBias>(y_opt).input().size());
     graph->compile({{y_opt, {}}})
             ->to_json()
@@ -1533,15 +1514,16 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     SymbolVar y_opt;
     SymbolVar y_no_tc;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()
-                                       .enable_use_tensor_core()),
-                  y_opt);
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()),
-                  y_no_tc);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_nchw2nchw32().enable_fuse_conv_bias_nonlinearity();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
+    }
     auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
     ASSERT_EQ(2u, nr_dimshuffle);
     HostTensorND host_y, host_y_opt;
@@ -1597,15 +1579,16 @@ TEST(TestEnableTensorCore, ConvBiasWithZ) {
     SymbolVar y_opt;
     SymbolVar y_no_tc;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()
-                                       .enable_use_tensor_core()),
-                  y_opt);
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()),
-                  y_no_tc);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
+    }
     HostTensorND host_y, host_y_opt;
     auto func = graph->compile({make_callback_copy(y_no_tc, host_y),
                                 make_callback_copy(y_opt, host_y_opt)});
@@ -1664,15 +1647,16 @@ TEST(TestGoptInference, EnableTensorCore) {
     y4 = opr::TypeCvt::make(y4, dtype::Float32());
     SymbolVar y_opt;
     SymbolVar y_no_tc;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y4}, gopt::OptimizeForInferenceOptions{}
-                                        .enable_fuse_conv_bias_nonlinearity()
-                                        .enable_use_tensor_core()),
-                  y_opt);
-    unpack_vector(gopt::optimize_for_inference(
-                          {y4}, gopt::OptimizeForInferenceOptions{}
-                                        .enable_fuse_conv_bias_nonlinearity()),
-                  y_no_tc);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
+        unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt);
+    }
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
+        unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc);
+    }
     auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
     ASSERT_EQ(3u, nr_dimshuffle);
     graph->compile({{y_opt, {}}})
@@ -1763,15 +1747,17 @@ TEST(FuseConvBiasZPass, BlockFuse) {
     SymbolVar z_fuse;
     SymbolVar z_nonfuse;
-    unpack_vector(gopt::optimize_for_inference(
-                          {z}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()
-                                       .enable_fuse_conv_bias_with_z()),
-                  z_fuse);
-    unpack_vector(gopt::optimize_for_inference(
-                          {z4}, gopt::OptimizeForInferenceOptions{}
-                                        .enable_fuse_conv_bias_nonlinearity()),
-                  z_nonfuse);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity()
+                .enable_fuse_conv_bias_with_z();
+        unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse);
+    }
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity();
+        unpack_vector(gopt::optimize_for_inference({z4}, options), z_nonfuse);
+    }
     auto nr_elem_multi_type = find_opr_num<mgb::opr::ElemwiseMultiType>(z_fuse);
     MGB_MARK_USED_VAR(nr_elem_multi_type);
     ASSERT_EQ(1u, nr_elem_multi_type);
@@ -1867,15 +1853,16 @@ TEST(TestEnableTensorCore, ShuffleMerge) {
     SymbolVar y_opt;
     SymbolVar y_no_tc;
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()
-                                       .enable_use_tensor_core()),
-                  y_opt);
-    unpack_vector(gopt::optimize_for_inference(
-                          {y}, gopt::OptimizeForInferenceOptions{}
-                                       .enable_fuse_conv_bias_nonlinearity()),
-                  y_no_tc);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_fuse_conv_bias_nonlinearity();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc);
+    }
     auto nr_dimshuffle = find_opr_num<mgb::opr::Dimshuffle>(y_opt);
     ASSERT_EQ(3u, nr_dimshuffle);
     HostTensorND host_y, host_y_opt;
@@ -1932,13 +1919,13 @@ TEST(FuseConvBiasZPass, Basic) {
          opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) {
        auto y1 = opr::ElemwiseMultiType::make(
                {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-        unpack_vector(
-                gopt::optimize_for_inference(
-                        {y1}, gopt::OptimizeForInferenceOptions{}
-                                      .enable_fuse_conv_bias_nonlinearity()
-                                      .enable_fuse_conv_bias_with_z()
-                                      .enable_use_tensor_core()),
-                y_opt);
+        {
+            auto options = gopt::OptimizeForInferenceOptions{};
+            options.enable_fuse_conv_bias_nonlinearity()
+                    .enable_fuse_conv_bias_with_z()
+                    .enable_nchw2nchw32();
+            unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt);
+        }
        auto nr_elemwisemultitype = find_opr_num<opr::ElemwiseMultiType>(y_opt);
        if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) {
            ASSERT_NE(0u, nr_elemwisemultitype);
@@ -1949,13 +1936,14 @@ TEST(FuseConvBiasZPass, Basic) {
            auto y2 = opr::ElemwiseMultiType::make(
                    {y1, b2}, {mode},
                    OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
-            unpack_vector(
-                    gopt::optimize_for_inference(
-                            {y2}, gopt::OptimizeForInferenceOptions{}
-                                          .enable_fuse_conv_bias_nonlinearity()
-                                          .enable_fuse_conv_bias_with_z()
-                                          .enable_use_tensor_core()),
-                    y_opt);
+            {
+                auto options = gopt::OptimizeForInferenceOptions{};
+                options.enable_fuse_conv_bias_nonlinearity()
+                        .enable_fuse_conv_bias_with_z()
+                        .enable_nchw2nchw32();
+                unpack_vector(gopt::optimize_for_inference({y2}, options),
+                              y_opt);
+            }
            auto nr_elemwisemultitype =
                    find_opr_num<opr::ElemwiseMultiType>(y_opt);
            ASSERT_NE(0u, nr_elemwisemultitype);
@@ -2401,11 +2389,11 @@ TEST(TestGoptInference, ConvertFormatNCHW88) {
     y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nchw88()),
-            y_opt);
+    {
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_nchw2nchw88();
+        unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
+    }
     ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88,
               find_opr<opr::ConvBias>(y_opt).param().format);
@@ -2483,11 +2471,9 @@ TEST(TestGoptInference, ConvertFormatNCHW44) {
     y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias);
     SymbolVar y_opt;
-    unpack_vector(
-            gopt::optimize_for_inference(
-                    {y},
-                    gopt::OptimizeForInferenceOptions{}.enable_use_nchw44()),
-            y_opt);
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_nchw2nchw44();
+    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
     ASSERT_EQ(opr::ConvBias::Param::Format::NCHW44,
               find_opr<opr::ConvBias>(y_opt).param().format);
......
@@ -495,7 +495,7 @@ TEST(TestOprDNN, ConvolutionBackwardFilter) {
                 Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
         dest[0] = *out;
     };
 #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
 #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
     { TensorShape{N, IC, IH, IW}, \
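As an aside, `get_shp` is the standard convolution output-size formula out = (N + 2P - F) / S + 1 under integer division; a quick sanity check:

```cpp
#define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
// a 28-wide input with padding 1, stride 2 and filter width 3:
static_assert(get_shp(28, 1, 2, 3) == 14, "(28 + 2 - 3) / 2 + 1 == 14");
```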
@@ -1282,9 +1282,10 @@ TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
                 *graph, inp[i]);
     }
+    auto options = gopt::OptimizeForInferenceOptions{};
+    options.enable_fuse_conv_bias_nonlinearity();
     auto y = gopt::optimize_for_inference({make_graph(inputs)[0]},
-            gopt::OptimizeForInferenceOptions{}.enable_fuse_conv_bias_nonlinearity())[0];
-    //gopt::OptimizeForInferenceOptions{})[0];
+                                          options)[0];
     auto func = graph->compile({make_callback_copy(y, dest[0])});
     func->execute();
     func->wait();
@@ -1720,7 +1721,7 @@ TEST(TestOprDNN, DeformableConvForward) {
         }
     };
     //! generate offset to avoid value near integer
     /// because bilinear function is not derivable over there
     checker.set_input_generator(2, gen_off);
     checker.set_input_dtype(0, dtype::Float32());
     checker.set_input_dtype(1, dtype::Float32());
......
@@ -500,10 +500,10 @@ TEST(TestOprIO, MultipleDeviceTensorWithFormatHolderCpu) {
         conv2 = opr::Convolution::make(conv1, w2, param);
         auto y = opr::Elemwise::make({conv2}, opr::Elemwise::Param::Mode::RELU);
-        SymbolVar y_opt = gopt::optimize_for_inference(
-                                  {y}, gopt::OptimizeForInferenceOptions{}
-                                               .enable_use_nhwcd4())[0]
-                                  .rename("out");
+        auto options = gopt::OptimizeForInferenceOptions{};
+        options.enable_nchw2nhwcd4();
+        SymbolVar y_opt =
+                gopt::optimize_for_inference({y}, options)[0].rename("out");
         auto dumper = serialization::GraphDumper::make(
                 serialization::OutputFile::make_fs(fname.c_str()));
......