diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
index 95e28d0104980378eae9b9cdb661188011c478a3..873f3fcabf9d96e989bedddc20f26685fb2233e6 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "src/fallback/conv_bias/conv1x1/algos.h"
@@ -67,7 +68,8 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
         MIDOUT_END();
     } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
-            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
             return dispatcher
                     .get_bundle(param, matmul_param, m_matmul_algo,
                                 compt_oc_block_size)
@@ -116,7 +118,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
         MIDOUT_END();
     } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
-            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
+            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
+                    dispatcher;
             whole_bundle = dispatcher.get_bundle(
                     param, matmul_param, m_matmul_algo, compt_oc_block_size);
             matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
@@ -140,7 +143,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
 
     Conv1x1StrategyBase* conv1x1_strategy =
             Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
-                                                  opr->param().format);  
+                                                  opr->param().format);
 
     auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                        compt_oc_block_size, conv1x1_strategy](
@@ -171,8 +174,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
         pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
         ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
         if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
-            ret_kern.push_back({kern_packB, {1}});  
-        }  
+            ret_kern.push_back({kern_packB, {1}});
+        }
     }
     ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
 
@@ -230,7 +233,11 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
             param, OH * OW, get_oc_tile_size_heuristic(param));
     bool matmul_usable = m_matmul_algo->usable(matmul_param);
 
-    return matmul_usable &&
+    auto pack_mode = m_matmul_algo->packmode();
+    bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy(
+            param, pack_mode, opr->param().format);
+
+    return matmul_usable && strategy_usable &&
            (param.filter_meta.dilation[0] ==
                     param.filter_meta.dilation[1] &&
            param.filter_meta.dilation[0] == 1) &&
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
index 320da5081cab2ea4e4771f38eca3cde61f76e879..e32f6d3adfe6677b0533aff32bb7a1e325c56fa8 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
@@ -6,11 +6,12 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
-#include
 #include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
+#include
 
 #include "midout.h"
 
@@ -157,10 +158,9 @@ std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
                     dt_int32, dt_int8, dt_int32, dt_int32,
                     PostprocessMode::NO_PROCESS,
                     "NoPack::INT8x8x32_INT32"_hash);
-            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
-                dtype::QuantizedS8, dtype::QuantizedS32,
-                dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
-                PostprocessMode::NO_PROCESS,
+            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dtype::QuantizedS8,
+                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
+                dt_int32, PostprocessMode::NO_PROCESS,
                 "NoPack::QINT8x8x32_QINT32"_hash);
 
             break;
@@ -208,6 +208,19 @@ Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
     return storage.get(param, pack_mode, format);
 }
 
+bool Conv1x1Factory::can_make_conv1x1_strategy(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        MatrixMulImpl::AlgoBase::PackMode pack_mode, param::ConvBias::Format) {
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC || !MEGDNN_DISABLE_FLOAT16
+    if ((pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK ||
+         pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) &&
+        param.src_type.enumv() == DTypeTrait<dtype::Float16>::enumv) {
+        return false;
+    }
+#endif
+    return true;
+}
+
 } // namespace conv1x1
 } // namespace fallback
 } // namespace megdnn
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
index 8e0456de69346755d2e89b5b223a535e8fd72c61..5cb95ab49827873b77736578fadfedae3a325603 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
@@ -320,6 +320,11 @@ public:
             const ConvBiasImpl::NCBKernSizeParam& param,
             MatrixMulImpl::AlgoBase::PackMode pack_mode,
             param::ConvBias::Format format);
+
+    static bool can_make_conv1x1_strategy(
+            const ConvBiasImpl::NCBKernSizeParam& param,
+            MatrixMulImpl::AlgoBase::PackMode pack_mode,
+            param::ConvBias::Format format);
 };
 
 } // namespace conv1x1
diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp
index 995acec3aca0ebc7eee492ee8d73624533a8ebb0..f2466b77e51bf41c58ced3e64828a70305c35570 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -27,7 +27,7 @@ using namespace megdnn;
 using namespace fallback;
 
 size_t megdnn::fallback::get_format_pack_size(param::ConvBias::Format format) {
-    switch(format){
+    switch (format) {
         case param::ConvBias::Format::NCHW44:
         case param::ConvBias::Format::NCHW4:
             return 4_z;
@@ -57,10 +57,18 @@ public:
         auto&& matmul_algos =
                 static_cast<fallback::MatrixMulImpl*>(matmul_opr)->algo_pack();
         for (auto&& algo : matmul_algos) {
+#if MEGDNN_X86
+//! As we don't have direct conv for int8x8x16 yet, if we disable gemv here, it
+//! may fall back to the naive implementation, which may cause very low
+//! performance, so here we just enable im2col for gemv in the x86 backend.
+//! FIXME: remove it when we add direct conv support for int8x8x16
+#else
             if (algo->algoset() ==
                 MatrixMulImpl::AlgoBase::AlgoSet::ALGO_TYPE_GEMV) {
                 continue;
             }
+#endif
+
             for (size_t ohw_tile_size : {192, 384, 96, 48, 24}) {
                 refhold.emplace_back(new AlgoIm2col(
                         static_cast<MatrixMulImpl::AlgoBase*>(algo),
diff --git a/dnn/test/fallback/convolution.cpp b/dnn/test/fallback/convolution.cpp
index 8124b00450954d256455d4a93934952c4c008505..17fc65193f75d121c5199afcd148c27ee87eaedc 100644
--- a/dnn/test/fallback/convolution.cpp
+++ b/dnn/test/fallback/convolution.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "test/fallback/fixture.h"
@@ -73,24 +74,115 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL) {
     profile(3, 3, 112, 112, 3, 1);
 }
 
+TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8832) {
+    using Param = Convolution::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        Benchmarker<Convolution> benchmarker_float(handle());
+        size_t RUN = 50;
+        auto tfloat = benchmarker_float.set_display(false)
+                              .set_dtype(0, dtype::Int8{})
+                              .set_dtype(1, dtype::Int8{})
+                              .set_dtype(2, dtype::Int32{})
+                              .set_times(RUN)
+                              .set_param(param)
+                              .exec(shapes);
+        size_t IC = shapes[1][1];
+        size_t FH = shapes[1][2];
+        size_t FW = shapes[1][3];
+        TensorLayout dst_layout;
+        auto opr = handle()->create_operator<Convolution>();
+        opr->param() = param;
+        opr->deduce_layout({shapes[0], dtype::Float32()},
+                           {shapes[1], dtype::Float32()}, dst_layout);
+        printf("fp32 flops: %.3f mflops\n",
+               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
+                       (tfloat / RUN * 1000));
+    };
+    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                       size_t stride) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = kernel / 2;
+        param.pad_w = kernel / 2;
+        param.pad_h = 0;
+        param.pad_w = 0;
+        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
+               oc, ic, w, h, stride, kernel);
+
+        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
+    };
+
+    profile(48, 128, 56, 88, 1, 1);
+    profile(56, 128, 64, 80, 3, 1);
+    profile(24, 3, 256, 320, 3, 2);
+}
+
+TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8816) {
+    using Param = Convolution::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        Benchmarker<Convolution> benchmarker_float(handle());
+        size_t RUN = 50;
+        auto tfloat = benchmarker_float.set_display(false)
+                              .set_dtype(0, dtype::Int8{})
+                              .set_dtype(1, dtype::Int8{})
+                              .set_dtype(2, dtype::Int16{})
+                              .set_times(RUN)
+                              .set_param(param)
+                              .exec(shapes);
+        size_t IC = shapes[1][1];
+        size_t FH = shapes[1][2];
+        size_t FW = shapes[1][3];
+        TensorLayout dst_layout;
+        auto opr = handle()->create_operator<Convolution>();
+        opr->param() = param;
+        opr->deduce_layout({shapes[0], dtype::Float32()},
+                           {shapes[1], dtype::Float32()}, dst_layout);
+        printf("fp32 flops: %.3f mflops\n",
+               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
+                       (tfloat / RUN * 1000));
+    };
+    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                       size_t stride) {
+        Param param;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = kernel / 2;
+        param.pad_w = kernel / 2;
+        param.pad_h = 0;
+        param.pad_w = 0;
+        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
+               oc, ic, w, h, stride, kernel);
+
+        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
+    };
+
+    profile(48, 128, 56, 88, 1, 1);
+    profile(48, 128, 56, 88, 1, 2);
+    profile(56, 128, 64, 80, 3, 1);
+    profile(24, 3, 256, 320, 3, 2);
+}
+
 TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
     using Param = ConvolutionBackwardData::Param;
     auto run = [&](const TensorLayoutArray& tensors, Param param) {
         Benchmarker<ConvolutionBackwardData> benchmarker_fallback(handle());
         size_t RUN = 500;
         benchmarker_fallback.set_display(false)
-            .set_dtype(0, dtype::Float32{})
-            .set_dtype(1, dtype::Float32{})
-            .set_times(RUN)
-            .set_param(param);
-        auto tmatmul = benchmarker_fallback.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>(
-                "DeconvMatmul"))
-            .exec(tensors);
-        auto tdirect = benchmarker_fallback.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>(
-                "DeconvDirect"))
-            .exec(tensors);
+                .set_dtype(0, dtype::Float32{})
+                .set_dtype(1, dtype::Float32{})
+                .set_times(RUN)
+                .set_param(param);
+        auto tmatmul = benchmarker_fallback
+                               .set_before_exec_callback(
+                                       AlgoChecker<ConvolutionBackwardData>(
+                                               "DeconvMatmul"))
+                               .exec(tensors);
+        auto tdirect = benchmarker_fallback
+                               .set_before_exec_callback(
+                                       AlgoChecker<ConvolutionBackwardData>(
+                                               "DeconvDirect"))
+                               .exec(tensors);
         size_t IC = tensors[0][1];
         size_t FH = tensors[0][2];
         size_t FW = tensors[0][3];
@@ -98,8 +190,8 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
         printf("Direct_time: %.3f ms Direct_flops: %.3f mflops\n", tdirect,
                total_flops / (tdirect / RUN * 1000));
         printf("Matmul_time: %.3f ms Matmul_flops: %.3f mflops\n", tmatmul,
-               total_flops / (tmatmul/ RUN * 1000));
-        printf("speedup: %.3f\n", tdirect/tmatmul);
+               total_flops / (tmatmul / RUN * 1000));
+        printf("speedup: %.3f\n", tdirect / tmatmul);
     };
 
     auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
@@ -154,6 +246,51 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL) {
     run(1, 3, 3, 112, 112, 3, 1);
     run(1, 1, 1, 1, 1, 3, 3);
 }
+
+#if MEGDNN_X86
+TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_8816) {
+    Checker<Convolution> checker(handle());
+    using Param = Convolution::Param;
+    checker.set_before_exec_callback(
+            AlgoChecker<ConvolutionForward>(".+FB_GEMV.+"));
+    auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc,
+                   size_t fh, size_t fw, size_t pad, size_t stride,
+                   size_t group) {
+        Param param;
+        param.sparse = group > 1 ? param::Convolution::Sparse::GROUP
+                                 : param::Convolution::Sparse::DENSE;
+        param.pad_h = param.pad_w = pad;
+        param.stride_h = param.stride_w = stride;
+        checker.set_param(param);
+        if (group > 1) {
+            checker.execl(
+                    {{{n, ic, ih, iw}, dtype::Int8()},
+                     {{group, oc / group, ic / group, fh, fw}, dtype::Int8()},
+                     {{}, dtype::Int16()}});
+        } else {
+            checker.execl({{{n, ic, ih, iw}, dtype::Int8()},
+                           {{oc, ic, fh, fw}, dtype::Int8()},
+                           {{}, dtype::Int16()}});
+        }
+    };
+
+    for (auto n : {1, 2})
+        for (auto ic : {3, 4, 8, 12, 16})
+            for (auto oc : {4, 8, 16, 32})
+                for (auto ih : {7, 14, 15, 22})
+                    for (auto iw : {7, 13, 11, 32})
+                        for (auto filter : {1, 2, 3, 5, 7})
+                            for (auto stride : {1, 2})
+                                for (auto pad : {0, filter / 2}) {
+                                    run(n, ic, ih, iw, oc, filter, filter, pad,
+                                        stride, 1);
+                                    if (ic == oc) {
+                                        run(n, ic, ih, iw, oc, filter, filter,
+                                            pad, stride, ic);
+                                    }
+                                }
+}
+#endif
+
 TEST_F(FALLBACK, CONVOLUTION_NAIVE_ALGO_FP16) {
     Checker<Convolution> checker(handle());
     using Param = Convolution::Param;
@@ -222,7 +359,7 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_NAIVE_ALGO) {
         TensorShape src{n, ic, ih, iw},
                 filter{group, oc / group, ic / group, fh, fw};
         checker.set_param(param).set_dtype(2, {});
-        //!float32
+        //! float32
         checker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
         checker.execs({src, filter, {}});
         //! float16
@@ -257,10 +394,10 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_SINT8) {
         param.pad_h = param.pad_w = 1;
         param.stride_h = param.stride_w = 1;
         checker.set_param(param)
-            .set_dtype(0, dtype::QuantizedS8(0.2f))
-            .set_dtype(1, dtype::QuantizedS8(0.2f))
+                .set_dtype(0, dtype::QuantizedS8(0.2f))
+                .set_dtype(1, dtype::QuantizedS8(0.2f))
                 // Use inferred output dtype.
-            .set_dtype(2, {});
+                .set_dtype(2, {});
         checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
     };