Commit fa59a7b0 authored by: Megvii Engine Team

feat(x86/rvv): opt AlgoF32DirectNCHWNCHW44

and opt GiMaximumFloat32/GiMinimumFloat32 on x86

GitOrigin-RevId: 825021e867ccf6314aac5a6c4c61df13d1e71705
Parent 0d82e9b7
......@@ -748,7 +748,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \
load_vec<5>(kernel0, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst[0][0], &src[0], kernel0); \
compute_vec<5>(dst[1][0], &src[0], kernel1);
// line 0
......@@ -813,7 +813,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_1(i, dst, src, kernel) \
load_vec<5>(kernel, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst, &src[0], kernel)
// line 0
COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
......@@ -1148,7 +1148,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5(
GI_FLOAT32_FIXLEN_t src_v[2][5];
#define COMPUTE_5X5_1(i, dst, src, kernel) \
load_vec<5>(kernel, filter + i * 5 * 4); \
load_vec<6>(src, input + i * IW * 4); \
load_vec<5>(src, input + i * IW * 4); \
compute_vec<5>(dst, &src[0], kernel)
// line 0
COMPUTE_5X5_1(0, dst_v, src_v[0], kernel[0]);
......
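All three hunks above make the same correction: each 5x5 row feeds compute_vec<5>, which consumes exactly five GI vectors, so the load count is brought down from six to five and the extra load, which writes past the five slots the row buffer provides, is dropped. A minimal sketch of the mismatch, with load_vec reduced to a hypothetical stand-in for the real unrolled helper:

// Hypothetical stand-in for the real load_vec helper, only to show the element count.
template <size_t N>
void load_vec(GI_FLOAT32_FIXLEN_t* dst, const float* src) {
    for (size_t i = 0; i < N; ++i)  // the real helper is fully unrolled
        dst[i] = GiFloat32Type2FixLenType(GiLoadFloat32(src + i * 4));
}

void demo_row_load(const float* input) {
    GI_FLOAT32_FIXLEN_t src_v[2][5];  // as declared in the kernels above
    load_vec<6>(src_v[0], input);     // old: one vector more than needed, spills past src_v[0][4]
    load_vec<5>(src_v[0], input);     // new: matches compute_vec<5>, which reads src_v[0][0..4]
}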
......@@ -37,6 +37,26 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! the x86 and rvv GiSimdFmaLane API is slow; as an alternative, use
//! GiMultiplyAddScalarFloat32
#define MLA GiMultiplyAddScalarFloat32
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \
c[1][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#else
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
......@@ -55,6 +75,8 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#endif
#undef MLA
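In short, the intent of the branch above: GiSimdFmaLane maps to a single lane-indexed FMA on NEON, but x86 and RVV have no cheap equivalent, so on those targets the source taps are kept as plain scalars in memory and broadcast through GiMultiplyAddScalarFloat32. A rough sketch of the two shapes, where fma_one_tap is an illustrative wrapper (not part of the patch) whose arguments mirror the usage in cb()/cb2():

// Illustrative only: one multiply-accumulate of accumulator acc with weight vector w
// and the source tap selected by the compile-time lane index.
template <int lane>
GI_FLOAT32_t fma_one_tap(
        GI_FLOAT32_t acc, GI_FLOAT32_t w, const float* src_scalars, GI_FLOAT32_t src_vec) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
    // read one float and let the intrinsic broadcast it
    return GiMultiplyAddScalarFloat32(acc, w, src_scalars[lane]);
#else
    // lane-indexed FMA, a single vfmaq_laneq_f32 on NEON
    return GiSimdFmaLane(acc, w, src_vec, lane);
#endif
}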
#define SHIFT_CAL_HELPER(ow_remain) \
template < \
......@@ -151,23 +173,38 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
//! the x86 and rvv GiSimdFmaLane API is slow; as an alternative, use
//! GiMultiplyAddScalarFloat32
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
#define KERNEL_CB(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
#define SRC_LOAD(step) \
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0)
#else
#define SRC_LOAD(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0)
#endif
#define KERNEL_CB(step) \
SRC_LOAD(step); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight);
UNROLL_CALL_RAW(7, KERNEL_CB)
#undef KERNEL_CB
#undef SRC_LOAD
src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;
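The net effect of the two SRC_LOAD variants: on x86/RVV, src is an array of const float* and SRC_LOAD only records addresses into the input row, with the scalar fetched later inside cb()/cb2() as *(src[idx / 4] + idx % 4); on other targets src holds vectors that are loaded up front. A sketch of what one row load leaves behind, assuming simd_len == 4 and writing the unrolled helpers as plain loops:

// Loop form is illustrative; the real load_ptr_helper / load_helper are macro-unrolled,
// and GiLoadFloat32 stands in for whatever Vld1qF32S wraps.
void demo_src_load(const float* src_ptr, int src_reg_size) {
    // x86 / RVV branch: record pointers only, no vector loads yet.
    const float* src_ptrs[16];
    for (int i = 0; i < src_reg_size; ++i)
        src_ptrs[i] = src_ptr + i * 4;
    float tap = *(src_ptrs[9 / 4] + 9 % 4);  // how the MLA macro fetches input element 9
    (void)tap;

    // other targets: the vectors are loaded before the cal_helper calls.
    GI_FLOAT32_FIXLEN_t src_vec[16];
    for (int i = 0; i < src_reg_size; ++i)
        src_vec[i] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + i * 4));
}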
......@@ -200,20 +237,33 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
#define KERNEL_CB(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
#define SRC_LOAD(step) \
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + step * iw, 0)
#else
#define SRC_LOAD(step) \
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + step * iw, 0)
#endif
#define KERNEL_CB(step) \
SRC_LOAD(step); \
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>( \
weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight); \
cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);
UNROLL_CALL_RAW(5, KERNEL_CB)
#undef KERNEL_CB
#undef SRC_LOAD
src_ptr += ld_src_ic;
weight_ptr += ld_weight_ic;
......@@ -246,10 +296,18 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
// row 0
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -257,7 +315,11 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);
// row 1
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -265,8 +327,12 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride, ow_
cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);
// row 2
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + 2 * iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(
src, src_ptr + 2 * iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -637,17 +703,29 @@ struct KerGiXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride, ow_
init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[src_reg_size];
#else
GI_FLOAT32_FIXLEN_t src[src_reg_size];
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size];
// row 0
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
// row 1
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<src_reg_size, 0, simd_len, 0>(src, src_ptr + iw, 0);
#else
load_helper<src_reg_size, 0, simd_len, 0, Vld1qF32S>(src, src_ptr + iw, 0);
#endif
load_helper<filter_size, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
......@@ -670,7 +748,7 @@ struct ConvDirectFp32NchwNchw44 {
constexpr int fh = filter_size;
constexpr int fw = filter_size;
constexpr int ic_step = 1;
#if MEGDNN_ARMV7
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) || defined(MEGDNN_ARMV7)
constexpr int big_oc_step = 4;
#else
constexpr int big_oc_step = 8;
......
......@@ -62,6 +62,13 @@ struct LoadHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
template <
int weight_number, int base_offset, int ptr_step, int oc_block, typename T,
typename T2>
struct LoadPtrHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset);
};
#define WEIGHT_CB(step) \
src[step] = GiFloat32Type2FixLenType( \
Func::impl(ptr + base_offset + step * ptr_step, args...));
......@@ -96,6 +103,36 @@ LOAD_HELPER(16);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) src[step] = ptr + base_offset + step * ptr_step;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 0, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
LOAD_PTR_HELPER(9);
LOAD_PTR_HELPER(10);
LOAD_PTR_HELPER(11);
LOAD_PTR_HELPER(12);
LOAD_PTR_HELPER(13);
LOAD_PTR_HELPER(14);
LOAD_PTR_HELPER(15);
LOAD_PTR_HELPER(16);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
///////////////////////////c_dim = 1/////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = \
......@@ -122,6 +159,29 @@ LOAD_HELPER(9);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) src[0][step] = ptr + base_offset + step * ptr_step;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 1, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
LOAD_PTR_HELPER(9);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
/////////////////////////c_dim = 2///////////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = \
......@@ -149,6 +209,30 @@ LOAD_HELPER(8);
#undef LOAD_HELPER
#undef WEIGHT_CB
#define WEIGHT_PTR_CB(step) \
src[0][step] = ptr + base_offset + step * ptr_step; \
src[1][step] = ptr + base_offset + step * ptr_step + oc_offset;
#define LOAD_PTR_HELPER(step) \
template <int base_offset, int ptr_step, typename T, typename T2> \
struct LoadPtrHelper<step, base_offset, ptr_step, 2, T, T2> { \
static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \
UNROLL_CALL_RAW(step, WEIGHT_PTR_CB); \
} \
}
LOAD_PTR_HELPER(1);
LOAD_PTR_HELPER(2);
LOAD_PTR_HELPER(3);
LOAD_PTR_HELPER(4);
LOAD_PTR_HELPER(5);
LOAD_PTR_HELPER(6);
LOAD_PTR_HELPER(7);
LOAD_PTR_HELPER(8);
#undef LOAD_PTR_HELPER
#undef WEIGHT_PTR_CB
template <
int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
typename T, typename T2>
......@@ -157,6 +241,14 @@ GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) {
weight, ptr, oc_offset);
}
template <
int weight_number, int base_offset, int ptr_step, int c_dim, typename T,
typename T2>
GI_FORCEINLINE void load_ptr_helper(T& weight, T2 ptr, int oc_offset) {
LoadPtrHelper<weight_number, base_offset, ptr_step, c_dim, T, T2>::impl(
weight, ptr, oc_offset);
}
////////////////////Store_OCX_OW8_Remain/////////////////////////
template <int c_dim, int ow_remain, typename Op, typename T, typename T2, typename T3>
struct StoreOcxOw8Remain {
......
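load_ptr_helper mirrors load_helper, but instead of issuing Vld1qF32S loads it only writes ptr + base_offset + step * ptr_step pointers into src, with the same oc_block specializations (0: flat array; 1: src[0][step]; 2: src[0]/src[1] offset by oc_offset). For example, a c_dim = 0 call such as the one behind SRC_LOAD expands roughly as follows (the concrete template arguments are illustrative, assuming simd_len == 4):

// Rough expansion of load_ptr_helper<4, 0, 4, 0>(src, src_ptr, 0),
// i.e. weight_number = 4, base_offset = 0, ptr_step = 4, oc_block = 0.
void demo_load_ptr_expansion(const float* src_ptr) {
    const float* src[4];
    // UNROLL_CALL_RAW(4, WEIGHT_PTR_CB):
    src[0] = src_ptr + 0 + 0 * 4;
    src[1] = src_ptr + 0 + 1 * 4;
    src[2] = src_ptr + 0 + 2 * 4;
    src[3] = src_ptr + 0 + 3 * 4;
    (void)src;
}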
......@@ -1110,7 +1110,7 @@ GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
#elif defined(GI_SSE2_INTRINSICS)
return _mm_max_ps(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
......@@ -1127,7 +1127,7 @@ GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
......
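The last two hunks fix the platform guard: the SSE path was gated on GI_NEON32_INTRINSICS, which is never defined on x86, so GiMaximumFloat32/GiMinimumFloat32 fell through to the generic fallback there; gating on GI_SSE2_INTRINSICS lets x86 hit _mm_max_ps/_mm_min_ps directly. A minimal usage sketch, assuming GiLoadFloat32/GiStoreFloat32 from the same GI float header:

// Elementwise max/min of two small float buffers through the GI wrappers.
void demo_minmax(const float* a, const float* b, float* out_max, float* out_min) {
    GI_FLOAT32_t va = GiLoadFloat32(a);  // GI_SIMD_LEN_BYTE / sizeof(float) lanes
    GI_FLOAT32_t vb = GiLoadFloat32(b);
    GiStoreFloat32(out_max, GiMaximumFloat32(va, vb));  // now _mm_max_ps on x86
    GiStoreFloat32(out_min, GiMinimumFloat32(va, vb));  // now _mm_min_ps on x86
}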