diff --git a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp index 652722c8fb4df43bcc0516cda45e3e4373bf3c8d..8bad554f92582394b4f535bdb4ab9114ee06c8b9 100644 --- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp @@ -107,7 +107,7 @@ static void do_conv_kern(WorkspaceBundle bundle, constexpr int oc_idx = 0; int oc_block = oc; int oh_block = block_helper(kern_param.nr_threads, oh2, - ic * iw * sizeof(float) * 2); + ic * iw * sizeof(float) * stride_h); const int oh_idx = ncb_index.ndrange_id[2]; const int oh_block_real = std::min(oh - oh_idx * oh_block, oh_block); const int ih_real = oh_block_real * stride_h + fh - stride_h; @@ -297,8 +297,9 @@ ConvBiasImpl::AlgoF32DirectNCHW44::dispatch_kerns( int oh = param.osz[0]; int ic = param.filter_meta.icpg; int iw = param.isz[1]; - int oh_block = - block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2); + int stride_h = param.filter_meta.stride[0]; + int oh_block = block_helper(param.nr_threads, oh, + ic * iw * sizeof(float) * stride_h); CpuNDRange ncb_range = {static_cast(batch), static_cast(group), static_cast(div_ceil(oh, oh_block))}; diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp index 00f720f622a21a05587ae84f712fcfa0539ddcbc..e8ddd8f00d6062aa032a04c78946fa139a47abad 100644 --- a/dnn/test/arm_common/conv_bias.cpp +++ b/dnn/test/arm_common/conv_bias.cpp @@ -118,24 +118,30 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { conv_bias::ConvBiasAlgoChecker( "IM2COLMATMUL:AARCH64_F32K8X12X1:192")); - Benchmarker benchmarker_int_nchw44(handle); + Benchmarker benchmarker_nchw44(handle); if (is_fp32) { - benchmarker_int_nchw44.set_times(RUNS) + benchmarker_nchw44.set_times(RUNS) .set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_dtype(4, dtype::Float32()) .set_display(false); } else { - benchmarker_int_nchw44.set_times(RUNS) + benchmarker_nchw44.set_times(RUNS) .set_dtype(0, dtype::QuantizedS8(2.5)) .set_dtype(1, dtype::QuantizedS8(2.5)) .set_dtype(2, dtype::QuantizedS32(6.25)) .set_dtype(4, dtype::QuantizedS8(60.25)) .set_display(false); } - benchmarker_int_nchw44.set_before_exec_callback( - conv_bias::ConvBiasAlgoChecker(".+")); + auto nchw44_algo_regx = ".*(DIRECT|NCHW_NCHW44).*"; +#if __ARM_FEATURE_DOTPROD + if (!is_fp32) { + nchw44_algo_regx = ".*DOT.*"; + } +#endif + benchmarker_nchw44.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(nchw44_algo_regx)); auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS, size_t stride, bool input_nchw = false) { @@ -171,7 +177,7 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { bias = {1, OC / 4, 1, 1, 4}; dst = {N, OC / 4, OH, OW, 4}; - auto int_nchw44_used = benchmarker_int_nchw44.set_param(param).exec( + auto int_nchw44_used = benchmarker_nchw44.set_param(param).exec( {src, filter, bias, {}, dst}) / RUNS; float computations = IC * (FS * FS) * dst.total_nr_elems() * 2 * 1e-6;