feat(dnn/arm_common): add nchw44 8x8x16 stride1 stride2

2x2 3x3 5x5 7x7 directconv GitOrigin-RevId: 3710182af1974775c0960a4ebac3c7cc7e3d93d5

feat(dnn/arm_common): add nchw44 8x8x16 stride1 stride2
2x2 3x3 5x5 7x7 directconv GitOrigin-RevId: 3710182af1974775c0960a4ebac3c7cc7e3d93d5
03808112 · Megvii Engine Team · 2dbe8194 · 03808112 · 03808112 · 03808112
10 changed files
--- a/dnn/src/arm_common/conv_bias/int8x8x16/algos.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/algos.h
@@ -38,6 +38,18 @@ public:
            const NCBKernSizeParam& param) const override;
 };

+class ConvBiasImpl::AlgoS8x8x16DirectNCHW44 final : public AlgoBase {
+public:
+    AlgoS8x8x16DirectNCHW44() {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override { return "S8x8x16_NCHW44_DIRECT"; }
+    bool usable(const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            const NCBKernSizeParam& param) const override;
+};
+
 class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
    WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;

--- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_8x8x16_nchw44_algo.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_8x8x16_nchw44_algo.cpp
--- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_8x8x16_nchw44_kern.h
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_8x8x16_nchw44_kern.h
+/**
+ * \file dnn/src/arm_common/conv_bias/int8x8x16/direct_8x8x16_nchw44_kern.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#pragma once
+#include "src/arm_common/simd_macro/marm_neon.h"
+#include "src/common/utils.h"
+#include "src/fallback/conv_bias/common.h"
+
+namespace megdnn {
+namespace arm_common {
+namespace int8x8x16_direct_nchw44 {
+
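+/**
+origin src shape <n, ic/4, h, w, 4>
+packed src shape <n, ic/4, h, w, 16>
+each of the 4 input-channel bytes of a pixel is repeated 4 times, so every
+4-byte group read from src becomes one 16-byte group written to dst;
+length is the number of such 4-byte groups to pack
+example: (format like <ic>)
+origin
+<0>  <1>  <2>  <3>
+packed
+low 64 bit  <0> <0> <0> <0> | <1> <1> <1> <1>
+---------------------------------------------------------------------
+high 64 bit <2> <2> <2> <2> | <3> <3> <3> <3>
+**/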
+static inline void nchw44_pack_src(const int8_t* src, int8_t* dst, int length) {
+    static const uint8_t src_idx_buffer[16] = {0, 0, 0, 0, 1, 1, 1, 1,
+                                               2, 2, 2, 2, 3, 3, 3, 3};
+    constexpr int pack_ic = 4;
+    constexpr int simd_len = 16;
+    uint8x16_t src_idx = vld1q_u8(src_idx_buffer);
+    for (int i = 0; i < length; i++) {
+        int8x16_t result = vld_dup_tbl_s32(src + i * pack_ic, src_idx);
+        vst1q_s8(dst + i * simd_len, result);
+    }
+}
+
+template <BiasMode bias_mode, int filter_size, int stride>
+struct ConvDirectInt8Nchw44Choose {
+    static void impl(const int8_t* src, const int8_t* filter,
+                     const int16_t* bias, int16_t* dst, const size_t oc,
+                     const size_t ic, const size_t ih, const size_t iw,
+                     const size_t oh, const size_t ow);
+};
+
+}  // namespace int8x8x16_direct_nchw44
+}  // namespace arm_common
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
--- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s1_aarch64.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s1_aarch64.cpp
--- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s1_armv7.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s1_armv7.cpp
--- a/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s2.cpp
+++ b/dnn/src/arm_common/conv_bias/int8x8x16/direct_kernels/int8x8x16_direct_nchw44_s2.cpp
--- a/dnn/src/arm_common/conv_bias/opr_impl.cpp
+++ b/dnn/src/arm_common/conv_bias/opr_impl.cpp
@@ -44,6 +44,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoQU8DirectStride1 qu8_direct_stride1;
    AlgoS8DirectStride2 s8_direct_stride2;
    AlgoS8DirectNCHW44 s8_direct_nchw44;
+    AlgoS8x8x16DirectNCHW44 s8x8x16_direct_nchw44;
    AlgoS8DirectNCHWNCHW44 s8_direct_nchw_nchw44;
    AlgoS8DirectStride1 s8_direct_stride1;
    AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44;
@@ -94,6 +95,7 @@ public:
        direct_algos.emplace_back(&qu8_direct_stride1);
        direct_algos.emplace_back(&s8_direct_stride2);
        direct_algos.emplace_back(&s8_direct_nchw44);
+        direct_algos.emplace_back(&s8x8x16_direct_nchw44);
        direct_algos.emplace_back(&s8_direct_nchw_nchw44);
        direct_algos.emplace_back(&s8_direct_stride1);


--- a/dnn/src/arm_common/conv_bias/opr_impl.h
+++ b/dnn/src/arm_common/conv_bias/opr_impl.h
@@ -39,6 +39,7 @@ private:
    class AlgoS8DirectStride1;
    class AlgoS8DirectStride2;
    class AlgoS8DirectNCHW44;
+    class AlgoS8x8x16DirectNCHW44;
    class AlgoS8DirectNCHWNCHW44;
    class AlgoQU8DirectStride1;
    class AlgoQU8DirectStride2;

--- a/dnn/test/arm_common/conv_bias.cpp
+++ b/dnn/test/arm_common/conv_bias.cpp
@@ -518,6 +518,116 @@ void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle,
    }
 }

+void benchmark_nchw44_8x8x16_vs_8x8x32(const char* im2col_name, Handle* handle,
+                                       size_t kernel, size_t stride,
+                                       size_t pack_size = 1) {
+    megdnn_assert(stride == 1 || stride == 2, "only support stride 1 or 2");
+    std::vector<conv_bias::TestArg> args;
+    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                    size_t p) {
+        if (ic % pack_size != 0 || oc % pack_size != 0)
+            return;
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.format = param::ConvBias::Format::NCHW44;
+        param.stride_h = stride;
+        param.stride_w = stride;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.sparse = param::ConvBias::Sparse::DENSE;
+        args.push_back(conv_bias::TestArg{
+                param,
+                TensorShape{1, ic / 4, h, w, 4},
+                TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
+                {1, oc / 4, 1, 1, 4}});
+    };
+    pack(1, 64, 56, 56, kernel, 0);
+    pack(8, 64, 56, 56, kernel, 0);
+    pack(16, 64, 56, 56, kernel, 1);
+    pack(32, 64, 56, 56, kernel, 1);
+    pack(1, 64, 100, 100, kernel, 1);
+    pack(8, 64, 100, 100, kernel, 1);
+    pack(1, 64, 100, 100, kernel, 0);
+    pack(8, 64, 100, 100, kernel, 0);
+    pack(16, 64, 100, 100, kernel, 1);
+    pack(32, 64, 100, 100, kernel, 1);
+    pack(64, 64, 100, 100, kernel, 1);
+    pack(128, 64, 100, 100, kernel, 1);
+    pack(256, 64, 100, 100, kernel, 1);
+    pack(512, 64, 100, 100, kernel, 1);
+    pack(1024, 64, 100, 100, kernel, 1);
+    pack(1, 32, 200, 200, kernel, 1);
+    pack(8, 64, 200, 200, kernel, 1);
+    pack(1, 32, 200, 200, kernel, 0);
+    pack(8, 64, 200, 200, kernel, 0);
+    pack(16, 96, 200, 200, kernel, 1);
+    pack(32, 32, 200, 200, kernel, 1);
+    pack(64, 64, 200, 200, kernel, 1);
+    pack(128, 96, 200, 200, kernel, 1);
+    pack(1, 64, 10, 10, kernel, 1);
+    pack(8, 64, 10, 10, kernel, 1);
+    pack(16, 64, 10, 10, kernel, 1);
+    pack(32, 64, 10, 10, kernel, 1);
+    pack(64, 64, 10, 10, kernel, 1);
+    pack(128, 64, 10, 10, kernel, 1);
+    pack(256, 64, 10, 10, kernel, 1);
+    pack(512, 64, 10, 10, kernel, 1);
+    pack(1024, 64, 10, 10, kernel, 1);
+
+    using namespace conv_bias;
+    constexpr size_t RUN = 20;
+
+    Benchmarker<ConvBias> benchmark_im2col(handle);
+    benchmark_im2col.set_display(false);
+    benchmark_im2col.set_times(RUN);
+
+    Benchmarker<ConvBias> benchmark_8832(handle);
+    benchmark_8832.set_display(false);
+    benchmark_8832.set_times(RUN);
+    for (auto&& arg : args) {
+        TensorLayout dst_layout;
+        auto opr = handle->create_operator<ConvBias>();
+        opr->param() = arg.param;
+        opr->deduce_layout({arg.src, dtype::Float32()},
+                           {arg.filter, dtype::Float32()},
+                           {arg.bias, dtype::Float32()}, {}, dst_layout);
+        //! dst.nr_elems * IC * FH * FW * 2
+        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
+                             arg.filter[2] * arg.filter[3] * 2.0 * 4 /
+                             (1024 * 1024 * 1024) * 1e3;
+
+        benchmark_im2col.set_param(arg.param);
+        benchmark_im2col.set_dtype(0, dtype::Int8());
+        benchmark_im2col.set_dtype(1, dtype::Int8());
+        benchmark_im2col.set_dtype(2, dtype::Int16());
+        benchmark_im2col.set_dtype(4, dtype::Int16());
+        auto used_8816 =
+                algo_benchmark<ConvBias>(benchmark_im2col,
+                                         {arg.src, arg.filter, {}, {}, {}},
+                                         im2col_name) /
+                RUN;
+        benchmark_8832.set_param(arg.param);
+        benchmark_8832.set_dtype(0, dtype::QuantizedS8(2.5));
+        benchmark_8832.set_dtype(1, dtype::QuantizedS8(2.5));
+        benchmark_8832.set_dtype(2, dtype::QuantizedS32(6.25));
+        benchmark_8832.set_dtype(4, {});
+        auto used_8832 =
+                algo_benchmark<ConvBias>(benchmark_8832,
+                                         {arg.src, arg.filter, {}, {}, {}},
+                                         "S8_NCHW44_DIRECT") /
+                RUN;
+
+        printf("%s %s: 8816: %f ms %f GFlops ", arg.src.to_string().c_str(),
+               arg.filter.to_string().c_str(), used_8816,
+               computations / used_8816);
+        printf("%s %s: 8832: %f ms %f GFlops ", arg.src.to_string().c_str(),
+               arg.filter.to_string().c_str(), used_8832,
+               computations / used_8832);
+        printf("speedup %f \n", used_8832 / used_8816);
+    }
+}
+
 void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
                                     const char* im2col_name, Handle* handle,
                                     size_t kernel, DType src_type,
@@ -872,6 +982,28 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
 #endif
 #if MEGDNN_WITH_BENCHMARK

+TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE1) {
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 2, 1,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 3, 1,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 5, 1,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 7, 1,
+                                      4);
+}
+
+TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE2) {
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 2, 2,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 3, 2,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 5, 2,
+                                      4);
+    benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(), 7, 2,
+                                      4);
+}
+
 TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
 #if MEGDNN_AARCH64
    benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);

--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -534,11 +534,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44) {
            get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
            handle(), "S8_NCHW44_DIRECT");
 }
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44_8816) {
+    checker_conv_bias_int8x8x16(
+            get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, true),
+            handle(), "S8x8x16_NCHW44_DIRECT");
+}
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE2_NCHW44_8816) {
+    checker_conv_bias_int8x8x16(
+            get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, true),
+            handle(), "S8x8x16_NCHW44_DIRECT");
+}
+
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44_8832) {
    checker_conv_bias_qint8x8x32(
            get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, true),
            handle(), "S8_NCHW44_DIRECT");
 }
+
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE2_NCHW44_8832) {
    checker_conv_bias_qint8x8x32(
            get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, true),