diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp
index 620e12a882f4c2968f7f9c34b0998857da2f3ddb..fe7f01104127c2c0fecfe1c48c85855272672c3a 100644
--- a/dnn/test/arm_common/conv_bias.cpp
+++ b/dnn/test/arm_common/conv_bias.cpp
@@ -1745,6 +1745,62 @@ TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_1X1_S1_NCHW_VS_NCHW44_INT8x8x32) {
 }
 #endif
 
+TEST_F(ARM_COMMON, BENCHMARK_CONV_BIAS_WINOGRAD_VS_IM2COL_INT8) {
+    auto&& args = get_winograd_benchmark_args(3, 8);
+    using namespace conv_bias;
+    constexpr size_t RUN = 10;
+
+    Benchmarker<ConvBias> benchmark_im2col(handle());
+    benchmark_im2col.set_display(false);
+    benchmark_im2col.set_times(RUN);
+    benchmark_im2col.set_dtype(0, dtype::QuantizedS8(2.5f))
+            .set_dtype(1, dtype::QuantizedS8(2.5f))
+            .set_dtype(2, dtype::QuantizedS32(6.25f))
+            .set_dtype(4, dtype::QuantizedS8(60.25f));
+
+    Benchmarker<ConvBias> benchmark_winograd(handle());
+    benchmark_winograd.set_display(false);
+    benchmark_winograd.set_times(RUN);
+    benchmark_winograd.set_dtype(0, dtype::QuantizedS8(2.5f))
+            .set_dtype(1, dtype::QuantizedS8(2.5f))
+            .set_dtype(2, dtype::QuantizedS32(6.25f))
+            .set_dtype(4, dtype::QuantizedS8(60.25f));
+
+    for (auto&& arg : args) {
+        TensorLayout dst_layout;
+        auto opr = handle()->create_operator<ConvBias>();
+        opr->param() = arg.param;
+        opr->deduce_layout({arg.src, dtype::Float32()},
+                           {arg.filter, dtype::Float32()},
+                           {arg.bias, dtype::Float32()}, {}, dst_layout);
+        //! dst.nr_elems * IC * FH * FW * 2
+        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
+                             arg.filter[2] * arg.filter[3] * 2.0 /
+                             (1024 * 1024 * 1024) * 1e3;
+
+        benchmark_im2col.set_param(arg.param);
+        auto im2col_used =
+                algo_benchmark<ConvBias>(
+                        benchmark_im2col, {arg.src, arg.filter, {}, {}, {}},
+                        "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16") /
+                RUN;
+
+        benchmark_winograd.set_param(arg.param);
+        auto winograd_used =
+                algo_benchmark<ConvBias>(
+                        benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
+                        "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2") /
+                RUN;
+
+        printf("%s %s: im2col: %f ms %f Gflops winograd: %f ms %f GFlops "
+               "speedup: "
+               "%f\n",
+               arg.src.to_string().c_str(), arg.filter.to_string().c_str(),
+               im2col_used, computations / im2col_used, winograd_used,
+               computations / winograd_used, im2col_used / winograd_used);
+    }
+}
+
 #endif
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index c95c340c1bb5b885388b0d22e3c60bedbb7a028b..e4b87c601b9ebc19bde0f612baf81b16d9adad37 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -736,6 +736,12 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
     pack(64, 64, 123, 123, kernel, kernel / 2);
     pack(64, 24, 123, 123, kernel, kernel / 2);
     pack(24, 24, 224, 224, kernel, kernel / 2);
+
+    //! conv in resnet18
+    pack(64, 64, 56, 56, kernel, kernel / 2);
+    pack(128, 128, 28, 28, kernel, kernel / 2);
+    pack(256, 256, 14, 14, kernel, kernel / 2);
+    pack(512, 512, 7, 7, kernel, kernel / 2);
     return args;
 }
 
diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp
index d8a365087c3722be8ff3b6ffaf626fce66cd8f24..a2f01451e499bd6a2f598ba925092af793c3f1e6 100644
--- a/src/opr/impl/dnn/convolution.cpp
+++ b/src/opr/impl/dnn/convolution.cpp
@@ -309,6 +309,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         return _dt(1.0f)
 
             cb(dtype::QuantizedS8);
+            cb(dtype::QuantizedS16);
             cb(dtype::QuantizedS32);
             default:
                 return DType::from_enum(enumv);
diff --git a/src/opr/impl/tensor_manip.cpp b/src/opr/impl/tensor_manip.cpp
index ba93d3d3eb5c499d838af8bf7218ac4a7d4e3576..2abdd581f8409db3d7c746e56aec9a80f5e166c0 100644
--- a/src/opr/impl/tensor_manip.cpp
+++ b/src/opr/impl/tensor_manip.cpp
@@ -1549,6 +1549,12 @@ void RelayoutFormat::init_output_format() {
 /* f{{{ ===================== WinogradFilterPreprocess ===================== */
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(WinogradFilterPreprocess);
 MEGDNN_OPR_INIT1(WinogradFilterPreprocess, "winograd_filter_preprocess")
+
+void WinogradFilterPreprocess::init_output_dtype() {
+    TensorLayout dst;
+    TensorLayout src{input(0)->shape(), input(0)->dtype(), input(0)->format()};
+    megdnn_opr()->deduce_layout(src, dst);
+    output(0)->dtype(dst.dtype);
+}
 // f}}}
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/include/megbrain/opr/tensor_manip.h b/src/opr/include/megbrain/opr/tensor_manip.h
index 4d80558a176abbe6faa619ca3d54cf5c3ea127ff..f02c9a38fdaaacb1c3919082d9de9ad7b20a85c8 100644
--- a/src/opr/include/megbrain/opr/tensor_manip.h
+++ b/src/opr/include/megbrain/opr/tensor_manip.h
@@ -637,7 +637,16 @@ MGB_DEFINE_OPR_CLASS(RelayoutFormat,
  *
  * See docs of megdnn params for more details
  */
-MGB_DEFINE_MEGDNN_OPR_WRAPPER_FWD1(WinogradFilterPreprocess);
+MGB_DEFINE_OPR_CLASS(WinogradFilterPreprocess,
+                     intl::MegDNNOprWrapperFwd<megdnn::WinogradFilterPreprocess>) // {
+    public:
+        WinogradFilterPreprocess(VarNode* p0, const Param& param,
+                                 const OperatorNodeConfig& config);
+        static SymbolVar make(SymbolVar p0, const Param& param = {},
+                              const OperatorNodeConfig& config = {});
+        void init_output_dtype() override final;
+};
+
 } // opr
 } // mgb