From 2d806f9c3cc93ab7008cca12dac8621db8e8658b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 14 Jun 2022 18:15:52 +0800 Subject: [PATCH] feat(gi): make conv_bias apply gi class type GitOrigin-RevId: daa40f61c1649433b65a6d76f81d602d13ad382e --- .../channel_wise_3x3_s1p1_nchw44_kern.cpp | 164 ++++-- .../channel_wise_5x5_s1p2_nchw44_kern.cpp | 87 +-- .../gi/fp32/channel_wise_nchw44_kern.cpp | 515 +++++++++++------- dnn/src/fallback/conv_bias/gi/fp32/direct.cpp | 51 +- .../direct_kernels/f32_direct_nchw44_kern.cpp | 68 +-- .../f32_direct_nchw44_kern_common_s1.h | 85 +-- .../f32_direct_nchw44_kern_common_s2.h | 79 +-- .../f32_direct_nchw_nchw44_kern_common.h | 58 +- .../conv_bias/gi/fp32/do_conv_stride2.cpp | 133 ++--- .../conv_bias/gi/fp32/filter_transform.h | 6 +- dnn/src/fallback/conv_bias/gi/fp32/helper.h | 86 +-- .../conv_bias/gi/fp32/strategy_2x3_4x4.cpp | 42 +- .../conv_bias/gi/fp32/strategy_4x5.cpp | 27 +- .../conv_bias/gi/fp32/strategy_5x4.cpp | 7 +- .../conv_bias/gi/fp32/strategy_6x3.cpp | 4 +- .../conv_bias/gi/fp32/strategy_6x3_4x4.cpp | 2 +- .../gi/fp32/strategy_f23_mk4_nchw44.cpp | 4 +- .../gi/fp32/strategy_f63_mk4_nchw44.cpp | 36 +- .../gi/fp32/strategy_f73_mk4_nchw44.cpp | 20 +- .../fallback/conv_bias/gi/intrinsic_helper.h | 258 +++++---- dnn/src/fallback/conv_bias/gi/utils.h | 123 +++-- .../fallback/general_intrinsic/gi_common.h | 28 +- 22 files changed, 1170 insertions(+), 713 deletions(-) diff --git a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp index 0d3093acf..03bb0cd42 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_3x3_s1p1_nchw44_kern.cpp @@ -12,8 +12,8 @@ using namespace fallback; namespace { template -static inline void shift_src(GI_FLOAT32_t rsrc[3][4]) { - GI_FLOAT32_t t[4]; +static inline void shift_src(GI_FLOAT32_FIXLEN_t rsrc[3][4]) { + GI_FLOAT32_FIXLEN_t t[4]; t[0] = rsrc[0][(shift + 0) % 4]; t[1] = rsrc[0][(shift + 1) % 4]; @@ -57,32 +57,51 @@ struct compute_element { template static inline void call( const float*& src0, const float*& src1, const float*& src2, float*& dst, - const float*& bias, const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[3][4], - GI_FLOAT32_t rfilter[3][3], const Op& op) { + const float*& bias, const GI_FLOAT32_t& init, + GI_FLOAT32_FIXLEN_t rsrc[3][4], GI_FLOAT32_FIXLEN_t rfilter[3][3], + const Op& op) { #define RSRC(i, j) rsrc[i][((j) + bw) % 4] GI_FLOAT32_t rdst = load_bias(bias, init); if (has_top) { - RSRC(0, 3) = GiLoadFloat32(src0 + 8); + RSRC(0, 3) = GiFloat32Type2FixLenType(GiLoadFloat32(src0 + 8)); } - { RSRC(1, 3) = GiLoadFloat32(src1 + 8); } + { RSRC(1, 3) = GiFloat32Type2FixLenType(GiLoadFloat32(src1 + 8)); } if (has_bottom) { - RSRC(2, 3) = GiLoadFloat32(src2 + 8); + RSRC(2, 3) = GiFloat32Type2FixLenType(GiLoadFloat32(src2 + 8)); } if (has_top) { - rdst = GiMlaqFloat32(rdst, RSRC(0, 0), rfilter[0][0]); - rdst = GiMlaqFloat32(rdst, RSRC(0, 1), rfilter[0][1]); - rdst = GiMlaqFloat32(rdst, RSRC(0, 2), rfilter[0][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(0, 0)), + GiFixLenType2GiFloat32Type(rfilter[0][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(0, 1)), + GiFixLenType2GiFloat32Type(rfilter[0][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(0, 2)), + GiFixLenType2GiFloat32Type(rfilter[0][2])); } { - rdst = GiMlaqFloat32(rdst, RSRC(1, 0), rfilter[1][0]); - rdst = GiMlaqFloat32(rdst, RSRC(1, 1), rfilter[1][1]); - rdst = GiMlaqFloat32(rdst, RSRC(1, 2), rfilter[1][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(1, 0)), + GiFixLenType2GiFloat32Type(rfilter[1][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(1, 1)), + GiFixLenType2GiFloat32Type(rfilter[1][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(1, 2)), + GiFixLenType2GiFloat32Type(rfilter[1][2])); } if (has_bottom) { - rdst = GiMlaqFloat32(rdst, RSRC(2, 0), rfilter[2][0]); - rdst = GiMlaqFloat32(rdst, RSRC(2, 1), rfilter[2][1]); - rdst = GiMlaqFloat32(rdst, RSRC(2, 2), rfilter[2][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(2, 0)), + GiFixLenType2GiFloat32Type(rfilter[2][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(2, 1)), + GiFixLenType2GiFloat32Type(rfilter[2][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(2, 2)), + GiFixLenType2GiFloat32Type(rfilter[2][2])); } GiStoreFloat32(dst, op(rdst)); @@ -113,23 +132,42 @@ struct compute_element_right { template static inline void call( float*& dst, const float*& bias, const GI_FLOAT32_t& init, - GI_FLOAT32_t rsrc[3][4], GI_FLOAT32_t rfilter[3][3], const Op& op) { + GI_FLOAT32_FIXLEN_t rsrc[3][4], GI_FLOAT32_FIXLEN_t rfilter[3][3], + const Op& op) { GI_FLOAT32_t rdst = load_bias(bias, init); if (has_top) { - rdst = GiMlaqFloat32(rdst, rsrc[0][0], rfilter[0][0]); - rdst = GiMlaqFloat32(rdst, rsrc[0][1], rfilter[0][1]); - rdst = GiMlaqFloat32(rdst, rsrc[0][2], rfilter[0][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0][0]), + GiFixLenType2GiFloat32Type(rfilter[0][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0][1]), + GiFixLenType2GiFloat32Type(rfilter[0][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0][2]), + GiFixLenType2GiFloat32Type(rfilter[0][2])); } { - rdst = GiMlaqFloat32(rdst, rsrc[1][0], rfilter[1][0]); - rdst = GiMlaqFloat32(rdst, rsrc[1][1], rfilter[1][1]); - rdst = GiMlaqFloat32(rdst, rsrc[1][2], rfilter[1][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1][0]), + GiFixLenType2GiFloat32Type(rfilter[1][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1][1]), + GiFixLenType2GiFloat32Type(rfilter[1][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1][2]), + GiFixLenType2GiFloat32Type(rfilter[1][2])); } if (has_bottom) { - rdst = GiMlaqFloat32(rdst, rsrc[2][0], rfilter[2][0]); - rdst = GiMlaqFloat32(rdst, rsrc[2][1], rfilter[2][1]); - rdst = GiMlaqFloat32(rdst, rsrc[2][2], rfilter[2][2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2][0]), + GiFixLenType2GiFloat32Type(rfilter[2][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2][1]), + GiFixLenType2GiFloat32Type(rfilter[2][1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2][2]), + GiFixLenType2GiFloat32Type(rfilter[2][2])); } GiStoreFloat32(dst, op(rdst)); @@ -144,20 +182,33 @@ struct compute_element_right_pad { template static inline void call( float*& dst, const float*& bias, const GI_FLOAT32_t& init, - GI_FLOAT32_t rsrc[3][4], GI_FLOAT32_t rfilter[3][3], const Op& op) { + GI_FLOAT32_FIXLEN_t rsrc[3][4], GI_FLOAT32_FIXLEN_t rfilter[3][3], + const Op& op) { GI_FLOAT32_t rdst = load_bias(bias, init); if (has_top) { - rdst = GiMlaqFloat32(rdst, rsrc[0][1], rfilter[0][0]); - rdst = GiMlaqFloat32(rdst, rsrc[0][2], rfilter[0][1]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0][1]), + GiFixLenType2GiFloat32Type(rfilter[0][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0][2]), + GiFixLenType2GiFloat32Type(rfilter[0][1])); } { - rdst = GiMlaqFloat32(rdst, rsrc[1][1], rfilter[1][0]); - rdst = GiMlaqFloat32(rdst, rsrc[1][2], rfilter[1][1]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1][1]), + GiFixLenType2GiFloat32Type(rfilter[1][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1][2]), + GiFixLenType2GiFloat32Type(rfilter[1][1])); } if (has_bottom) { - rdst = GiMlaqFloat32(rdst, rsrc[2][1], rfilter[2][0]); - rdst = GiMlaqFloat32(rdst, rsrc[2][2], rfilter[2][1]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2][1]), + GiFixLenType2GiFloat32Type(rfilter[2][0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2][2]), + GiFixLenType2GiFloat32Type(rfilter[2][1])); } GiStoreFloat32(dst, op(rdst)); @@ -171,22 +222,23 @@ struct compute_row { template static inline void call( const float*& src0, const float*& src1, const float*& src2, float*& dst, - const float*& bias, const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[3][4], - GI_FLOAT32_t rfilter[3][3], int W, const Op& op) { + const float*& bias, const GI_FLOAT32_t& init, + GI_FLOAT32_FIXLEN_t rsrc[3][4], GI_FLOAT32_FIXLEN_t rfilter[3][3], int W, + const Op& op) { if (has_top) { - rsrc[0][0] = GiZeroFloat32(); - rsrc[0][1] = GiLoadFloat32(src0 + 0); - rsrc[0][2] = GiLoadFloat32(src0 + 4); + rsrc[0][0] = GiFloat32Type2FixLenType(GiZeroFloat32()); + rsrc[0][1] = GiFloat32Type2FixLenType(GiLoadFloat32(src0 + 0)); + rsrc[0][2] = GiFloat32Type2FixLenType(GiLoadFloat32(src0 + 4)); } { - rsrc[1][0] = GiZeroFloat32(); - rsrc[1][1] = GiLoadFloat32(src1 + 0); - rsrc[1][2] = GiLoadFloat32(src1 + 4); + rsrc[1][0] = GiFloat32Type2FixLenType(GiZeroFloat32()); + rsrc[1][1] = GiFloat32Type2FixLenType(GiLoadFloat32(src1 + 0)); + rsrc[1][2] = GiFloat32Type2FixLenType(GiLoadFloat32(src1 + 4)); } if (has_bottom) { - rsrc[2][0] = GiZeroFloat32(); - rsrc[2][1] = GiLoadFloat32(src2 + 0); - rsrc[2][2] = GiLoadFloat32(src2 + 4); + rsrc[2][0] = GiFloat32Type2FixLenType(GiZeroFloat32()); + rsrc[2][1] = GiFloat32Type2FixLenType(GiLoadFloat32(src2 + 0)); + rsrc[2][2] = GiFloat32Type2FixLenType(GiLoadFloat32(src2 + 4)); } int w = 0; @@ -246,18 +298,18 @@ void channel_wise_nchw44_float::do_conv_kern_3x3_stride1_padding1( const float* src1 = src; const float* src2 = src + W * 4; - GI_FLOAT32_t rfilter[3][3]; - rfilter[0][0] = GiLoadFloat32(filter + 0); - rfilter[0][1] = GiLoadFloat32(filter + 4); - rfilter[0][2] = GiLoadFloat32(filter + 8); - rfilter[1][0] = GiLoadFloat32(filter + 12); - rfilter[1][1] = GiLoadFloat32(filter + 16); - rfilter[1][2] = GiLoadFloat32(filter + 20); - rfilter[2][0] = GiLoadFloat32(filter + 24); - rfilter[2][1] = GiLoadFloat32(filter + 28); - rfilter[2][2] = GiLoadFloat32(filter + 32); - - GI_FLOAT32_t rsrc[3][4]; + GI_FLOAT32_FIXLEN_t rfilter[3][3]; + rfilter[0][0] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 0)); + rfilter[0][1] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 4)); + rfilter[0][2] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 8)); + rfilter[1][0] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 12)); + rfilter[1][1] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 16)); + rfilter[1][2] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 20)); + rfilter[2][0] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 24)); + rfilter[2][1] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 28)); + rfilter[2][2] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 32)); + + GI_FLOAT32_FIXLEN_t rsrc[3][4]; compute_row::call( src0, src1, src2, dst, bias, init, rsrc, rfilter, W, op); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp index b8fa4fbbd..54e1df8e1 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_5x5_s1p2_nchw44_kern.cpp @@ -12,8 +12,8 @@ using namespace fallback; namespace { template -static inline void shift_src(GI_FLOAT32_t rsrc[6]) { - GI_FLOAT32_t t[6]; +static inline void shift_src(GI_FLOAT32_FIXLEN_t rsrc[6]) { + GI_FLOAT32_FIXLEN_t t[6]; t[0] = rsrc[(shift + 0) % 6]; t[1] = rsrc[(shift + 1) % 6]; @@ -29,12 +29,12 @@ static inline void shift_src(GI_FLOAT32_t rsrc[6]) { rsrc[5] = t[5]; } -static inline void load_filter(const float* filter, GI_FLOAT32_t rfilter[5]) { - rfilter[0] = GiLoadFloat32(filter + 0); - rfilter[1] = GiLoadFloat32(filter + 4); - rfilter[2] = GiLoadFloat32(filter + 8); - rfilter[3] = GiLoadFloat32(filter + 12); - rfilter[4] = GiLoadFloat32(filter + 16); +static inline void load_filter(const float* filter, GI_FLOAT32_FIXLEN_t rfilter[5]) { + rfilter[0] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 0)); + rfilter[1] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 4)); + rfilter[2] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 8)); + rfilter[3] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 12)); + rfilter[4] = GiFloat32Type2FixLenType(GiLoadFloat32(filter + 16)); } template @@ -51,8 +51,8 @@ struct compute_element { template static inline void call( const float*& src, float*& dst, const float*& bias, - const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], - const Op& op) { + const GI_FLOAT32_t& init, GI_FLOAT32_FIXLEN_t rsrc[6], + GI_FLOAT32_FIXLEN_t rfilter[5], const Op& op) { #define RSRC(i) rsrc[((i) + bw) % 6] GI_FLOAT32_t rdst; if (need_load_bias) { @@ -60,13 +60,23 @@ struct compute_element { } else { rdst = GiLoadFloat32(dst); } - RSRC(5) = GiLoadFloat32(src + 12); - - rdst = GiMlaqFloat32(rdst, RSRC(0), rfilter[0]); - rdst = GiMlaqFloat32(rdst, RSRC(1), rfilter[1]); - rdst = GiMlaqFloat32(rdst, RSRC(2), rfilter[2]); - rdst = GiMlaqFloat32(rdst, RSRC(3), rfilter[3]); - rdst = GiMlaqFloat32(rdst, RSRC(4), rfilter[4]); + RSRC(5) = GiFloat32Type2FixLenType(GiLoadFloat32(src + 12)); + + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(0)), + GiFixLenType2GiFloat32Type(rfilter[0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(1)), + GiFixLenType2GiFloat32Type(rfilter[1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(2)), + GiFixLenType2GiFloat32Type(rfilter[2])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(3)), + GiFixLenType2GiFloat32Type(rfilter[3])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(RSRC(4)), + GiFixLenType2GiFloat32Type(rfilter[4])); if (need_do_op) { rdst = op(rdst); @@ -93,7 +103,7 @@ struct compute_element_right { template static inline void call( float*& dst, const float*& bias, const GI_FLOAT32_t& init, - GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], const Op& op) { + GI_FLOAT32_FIXLEN_t rsrc[6], GI_FLOAT32_FIXLEN_t rfilter[5], const Op& op) { GI_FLOAT32_t rdst; if (need_load_bias) { rdst = load_bias(bias, init); @@ -101,14 +111,24 @@ struct compute_element_right { rdst = GiLoadFloat32(dst); } - rdst = GiMlaqFloat32(rdst, rsrc[0 + padding], rfilter[0]); - rdst = GiMlaqFloat32(rdst, rsrc[1 + padding], rfilter[1]); - rdst = GiMlaqFloat32(rdst, rsrc[2 + padding], rfilter[2]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[0 + padding]), + GiFixLenType2GiFloat32Type(rfilter[0])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[1 + padding]), + GiFixLenType2GiFloat32Type(rfilter[1])); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[2 + padding]), + GiFixLenType2GiFloat32Type(rfilter[2])); if (padding < 2) { - rdst = GiMlaqFloat32(rdst, rsrc[3 + padding], rfilter[3]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[3 + padding]), + GiFixLenType2GiFloat32Type(rfilter[3])); } if (padding < 1) { - rdst = GiMlaqFloat32(rdst, rsrc[4 + padding], rfilter[4]); + rdst = GiMlaqFloat32( + rdst, GiFixLenType2GiFloat32Type(rsrc[4 + padding]), + GiFixLenType2GiFloat32Type(rfilter[4])); } if (need_do_op) { @@ -126,12 +146,13 @@ struct compute_row_src_1x5 { template static inline void call( const float* src, float* dst, const float* bias, const GI_FLOAT32_t& init, - GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], int W, const Op& op) { - rsrc[0] = GiZeroFloat32(); - rsrc[1] = GiZeroFloat32(); - rsrc[2] = GiLoadFloat32(src + 0); - rsrc[3] = GiLoadFloat32(src + 4); - rsrc[4] = GiLoadFloat32(src + 8); + GI_FLOAT32_FIXLEN_t rsrc[6], GI_FLOAT32_FIXLEN_t rfilter[5], int W, + const Op& op) { + rsrc[0] = GiFloat32Type2FixLenType(GiZeroFloat32()); + rsrc[1] = GiFloat32Type2FixLenType(GiZeroFloat32()); + rsrc[2] = GiFloat32Type2FixLenType(GiLoadFloat32(src + 0)); + rsrc[3] = GiFloat32Type2FixLenType(GiLoadFloat32(src + 4)); + rsrc[4] = GiFloat32Type2FixLenType(GiLoadFloat32(src + 8)); int w = 0; @@ -172,8 +193,8 @@ struct compute_row { template static inline void call( const float*& src, float*& dst, const float* filter, const float*& bias, - const GI_FLOAT32_t& init, GI_FLOAT32_t rsrc[6], GI_FLOAT32_t rfilter[5], - int W, const Op& op) { + const GI_FLOAT32_t& init, GI_FLOAT32_FIXLEN_t rsrc[6], + GI_FLOAT32_FIXLEN_t rfilter[5], int W, const Op& op) { if (top_padding < 1) { load_filter(filter + 0, rfilter); compute_row_src_1x5::call( @@ -222,8 +243,8 @@ void channel_wise_nchw44_float::do_conv_kern_5x5_stride1_padding2( init = GiLoadFloat32(bias); } - GI_FLOAT32_t rsrc[6]; - GI_FLOAT32_t rfilter[5]; + GI_FLOAT32_FIXLEN_t rsrc[6]; + GI_FLOAT32_FIXLEN_t rfilter[5]; compute_row<2, 0, bias_mode>::call( src, dst, filter, bias, init, rsrc, rfilter, W, op); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp index f57a2aeb4..fc8d00844 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/channel_wise_nchw44_kern.cpp @@ -12,14 +12,30 @@ using namespace fallback; namespace { +#define cb(_simd_fixlen_type, _fun_suffix, _simd_type_v2) \ + struct ParamElemFixLenVisitorV2None { \ + _simd_type_v2 operator()( \ + const _simd_fixlen_type& s0, const _simd_fixlen_type& s1) const { \ + _simd_type_v2 ret; \ + GiSetSubVector##_fun_suffix##V2( \ + ret, 0, GiFixLenType2Gi##_fun_suffix##Type(s0)); \ + GiSetSubVector##_fun_suffix##V2( \ + ret, 1, GiFixLenType2Gi##_fun_suffix##Type(s1)); \ + return ret; \ + } \ + }; + +cb(GI_FLOAT32_FIXLEN_t, Float32, GI_FLOAT32_V2_t); +#undef cb + template -void load_vec(GI_FLOAT32_t* dst, const float* src); +void load_vec(GI_FLOAT32_FIXLEN_t* dst, const float* src); -#define cb(i) dst[i] = GiLoadFloat32(src + i * 4); -#define LOAD_MACRO(n) \ - template <> \ - inline void load_vec(GI_FLOAT32_t * dst, const float* src) { \ - UNROLL_CALL_NOWRAPPER(n, cb); \ +#define cb(i) dst[i] = GiFloat32Type2FixLenType(GiLoadFloat32(src + i * 4)); +#define LOAD_MACRO(n) \ + template <> \ + inline void load_vec(GI_FLOAT32_FIXLEN_t * dst, const float* src) { \ + UNROLL_CALL_NOWRAPPER(n, cb); \ } LOAD_MACRO(2); LOAD_MACRO(3); @@ -33,14 +49,20 @@ LOAD_MACRO(9); #undef LOAD_MACRO template -void compute_vec(GI_FLOAT32_t& dst, GI_FLOAT32_t* src, GI_FLOAT32_t* filter); +void compute_vec( + GI_FLOAT32_FIXLEN_t& dst, GI_FLOAT32_FIXLEN_t* src, + GI_FLOAT32_FIXLEN_t* filter); -#define cb(i) dst = GiMlaqFloat32(dst, src[i], filter[i]); -#define COMPUTE_MACRO(n) \ - template <> \ - inline void compute_vec( \ - GI_FLOAT32_t & dst, GI_FLOAT32_t * src, GI_FLOAT32_t * filter) { \ - UNROLL_CALL_NOWRAPPER(n, cb); \ +#define cb(i) \ + dst = GiFloat32Type2FixLenType(GiMlaqFloat32( \ + GiFixLenType2GiFloat32Type(dst), GiFixLenType2GiFloat32Type(src[i]), \ + GiFixLenType2GiFloat32Type(filter[i]))); +#define COMPUTE_MACRO(n) \ + template <> \ + inline void compute_vec( \ + GI_FLOAT32_FIXLEN_t & dst, GI_FLOAT32_FIXLEN_t * src, \ + GI_FLOAT32_FIXLEN_t * filter) { \ + UNROLL_CALL_NOWRAPPER(n, cb); \ } COMPUTE_MACRO(2); COMPUTE_MACRO(3); @@ -51,20 +73,21 @@ COMPUTE_MACRO(5); template struct load_bias_vec; -#define cb_bias(i) dst[i] = GiLoadFloat32((bptr) + i * 4); -#define cb_init(i) dst[i] = init; +#define cb_bias(i) dst[i] = GiFloat32Type2FixLenType(GiLoadFloat32((bptr) + i * 4)); +#define cb_init(i) dst[i] = GiFloat32Type2FixLenType(init); -#define INIT_BIAS_MACRO(n) \ - template \ - struct load_bias_vec { \ - static void impl( \ - GI_FLOAT32_t* dst, const GI_FLOAT32_t& init, const float* bptr) { \ - if (bias_mode == BiasMode::BIAS) { \ - UNROLL_CALL_NOWRAPPER(n, cb_bias); \ - } else { \ - UNROLL_CALL_NOWRAPPER(n, cb_init); \ - } \ - } \ +#define INIT_BIAS_MACRO(n) \ + template \ + struct load_bias_vec { \ + static void impl( \ + GI_FLOAT32_FIXLEN_t* dst, const GI_FLOAT32_t& init, \ + const float* bptr) { \ + if (bias_mode == BiasMode::BIAS) { \ + UNROLL_CALL_NOWRAPPER(n, cb_bias); \ + } else { \ + UNROLL_CALL_NOWRAPPER(n, cb_init); \ + } \ + } \ }; INIT_BIAS_MACRO(1); @@ -78,7 +101,7 @@ INIT_BIAS_MACRO(4); #define COMPUTE_PADDING_KERNEL() \ do { \ int iw = ow * stride - PW; \ - GI_FLOAT32_t result; \ + GI_FLOAT32_FIXLEN_t result; \ load_bias_vec::impl(&result, init, bias + oh * OW * 4 + ow * 4); \ for (int kh = 0; kh < fh; kh++) { \ if (kh + ih < 0 || kh + ih >= static_cast(IH)) \ @@ -87,12 +110,14 @@ INIT_BIAS_MACRO(4); if (kw + iw < 0 || kw + iw >= static_cast(IW)) \ continue; \ const float* sptr = src + (kh + ih) * IW * 4 + (kw + iw) * 4; \ - result = GiMlaqFloat32( \ - result, kernel[kh * fh + kw], GiLoadFloat32(sptr)); \ + result = GiFloat32Type2FixLenType(GiMlaqFloat32( \ + GiFixLenType2GiFloat32Type(result), \ + GiFixLenType2GiFloat32Type(kernel[kh * fh + kw]), \ + GiLoadFloat32(sptr))); \ } \ } \ float* output = dst + oh * OW * 4 + ow * 4; \ - op(result, output); \ + op(GiFixLenType2GiFloat32Type(result), output); \ } while (0) template @@ -101,7 +126,7 @@ struct PaddingCompute { const float* src, const float* bias, float* dst, const int fh, const int stride, const size_t IH, const size_t IW, const size_t OH, const size_t OW, const size_t PH, const size_t PW, - const GI_FLOAT32_t* kernel, const GI_FLOAT32_t& init) { + const GI_FLOAT32_FIXLEN_t* kernel, const GI_FLOAT32_t& init) { size_t oh_start = (PH + stride - 1) / stride; size_t ow_start = (PW + stride - 1) / stride; size_t oh_end = (IH + PH - fh) / stride + 1; @@ -136,7 +161,7 @@ struct PaddingComputeK3P1 { static void compute( const float* src, const float* bias, float* dst, const size_t stride, const size_t IH, const size_t IW, const size_t OH, const size_t OW, - const GI_FLOAT32_t* kernel, const GI_FLOAT32_t& init) { + const GI_FLOAT32_FIXLEN_t* kernel, const GI_FLOAT32_t& init) { constexpr size_t PH = 1, PW = 1, FH = 3; size_t oh_start = (PH + stride - 1) / stride; size_t ow_start = (PW + stride - 1) / stride; @@ -150,128 +175,226 @@ struct PaddingComputeK3P1 { Op op; // line one left { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl(&result, init, bias); - result = GiMlaqFloat32(result, kernel[4], GiLoadFloat32(src)); - result = GiMlaqFloat32(result, kernel[5], GiLoadFloat32(src + 4)); - result = GiMlaqFloat32(result, kernel[7], GiLoadFloat32(src + IW * 4)); - result = GiMlaqFloat32(result, kernel[8], GiLoadFloat32(src + IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), GiLoadFloat32(src))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[5]), GiLoadFloat32(src + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[7]), + GiLoadFloat32(src + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[8]), + GiLoadFloat32(src + IW * 4 + 4))); float* output = dst; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // line one mid for (size_t ow = ow_start; ow < ow_end; ow++) { int iw = ow * stride - PW; - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl(&result, init, bias + ow * 4); const float* sptr = src + iw * 4; - result = GiMlaqFloat32(result, kernel[3], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[4], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[5], GiLoadFloat32(sptr + 8)); - result = GiMlaqFloat32(result, kernel[6], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32(result, kernel[7], GiLoadFloat32(sptr + IW * 4 + 4)); - result = GiMlaqFloat32(result, kernel[8], GiLoadFloat32(sptr + IW * 4 + 8)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[3]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[5]), GiLoadFloat32(sptr + 8))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[6]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[7]), + GiLoadFloat32(sptr + IW * 4 + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[8]), + GiLoadFloat32(sptr + IW * 4 + 8))); float* output = dst + ow * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // line one right if (OW != ow_end) { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl(&result, init, bias + (OW - 1) * 4); const float* sptr = src + (ow_end * stride - PW) * 4; - result = GiMlaqFloat32(result, kernel[3], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[4], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[6], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32(result, kernel[7], GiLoadFloat32(sptr + IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[3]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[6]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[7]), + GiLoadFloat32(sptr + IW * 4 + 4))); float* output = dst + ow_end * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // mid line for (size_t oh = oh_start; oh < oh_end; oh++) { int ih = oh * stride - PH; // left { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl(&result, init, bias + oh * OW * 4); const float* sptr = src + ih * IW * 4; - result = GiMlaqFloat32(result, kernel[1], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[2], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[4], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32( - result, kernel[5], GiLoadFloat32(sptr + IW * 4 + 4)); - result = GiMlaqFloat32( - result, kernel[7], GiLoadFloat32(sptr + 2 * IW * 4)); - result = GiMlaqFloat32( - result, kernel[8], GiLoadFloat32(sptr + 2 * IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[1]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[2]), + GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[5]), + GiLoadFloat32(sptr + IW * 4 + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[7]), + GiLoadFloat32(sptr + 2 * IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[8]), + GiLoadFloat32(sptr + 2 * IW * 4 + 4))); float* output = dst + oh * OW * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // right if (OW != ow_end) { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl( &result, init, bias + oh * OW * 4 + (OW - 1) * 4); const float* sptr = src + ih * IW * 4 + (ow_end * stride - PW) * 4; - result = GiMlaqFloat32(result, kernel[0], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[1], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[3], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32( - result, kernel[4], GiLoadFloat32(sptr + IW * 4 + 4)); - result = GiMlaqFloat32( - result, kernel[6], GiLoadFloat32(sptr + 2 * IW * 4)); - result = GiMlaqFloat32( - result, kernel[7], GiLoadFloat32(sptr + 2 * IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[0]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[1]), + GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[3]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), + GiLoadFloat32(sptr + IW * 4 + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[6]), + GiLoadFloat32(sptr + 2 * IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[7]), + GiLoadFloat32(sptr + 2 * IW * 4 + 4))); float* output = dst + oh * OW * 4 + ow_end * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } } // last line left if (OH != oh_end) { size_t oh = OH - 1; { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl(&result, init, bias + oh * OW * 4); const float* sptr = src + (oh_end * stride - PH) * IW * 4; - result = GiMlaqFloat32(result, kernel[1], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[2], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[4], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32( - result, kernel[5], GiLoadFloat32(sptr + IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[1]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[2]), + GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[5]), + GiLoadFloat32(sptr + IW * 4 + 4))); float* output = dst + oh_end * OW * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // last line mid for (size_t ow = ow_start; ow < ow_end; ow++) { int iw = ow * stride - PW; - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl( &result, init, bias + oh * OW * 4 + ow * 4); const float* sptr = src + (oh_end * stride - PH) * IW * 4 + iw * 4; - result = GiMlaqFloat32(result, kernel[0], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[1], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[2], GiLoadFloat32(sptr + 8)); - result = GiMlaqFloat32(result, kernel[3], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32( - result, kernel[4], GiLoadFloat32(sptr + IW * 4 + 4)); - result = GiMlaqFloat32( - result, kernel[5], GiLoadFloat32(sptr + IW * 4 + 8)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[0]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[1]), + GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[2]), + GiLoadFloat32(sptr + 8))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[3]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), + GiLoadFloat32(sptr + IW * 4 + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[5]), + GiLoadFloat32(sptr + IW * 4 + 8))); float* output = dst + oh_end * OW * 4 + ow * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } // last line right if (OW != ow_end) { - GI_FLOAT32_t result; + GI_FLOAT32_FIXLEN_t result; load_bias_vec::impl( &result, init, bias + oh * OW * 4 + (OW - 1) * 4); const float* sptr = src + (oh_end * stride - PH) * IW * 4 + (ow_end * stride - PW) * 4; - result = GiMlaqFloat32(result, kernel[0], GiLoadFloat32(sptr)); - result = GiMlaqFloat32(result, kernel[1], GiLoadFloat32(sptr + 4)); - result = GiMlaqFloat32(result, kernel[3], GiLoadFloat32(sptr + IW * 4)); - result = GiMlaqFloat32( - result, kernel[4], GiLoadFloat32(sptr + IW * 4 + 4)); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[0]), GiLoadFloat32(sptr))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[1]), + GiLoadFloat32(sptr + 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[3]), + GiLoadFloat32(sptr + IW * 4))); + result = GiFloat32Type2FixLenType(GiMlaqFloat32( + GiFixLenType2GiFloat32Type(result), + GiFixLenType2GiFloat32Type(kernel[4]), + GiLoadFloat32(sptr + IW * 4 + 4))); float* output = dst + oh_end * OW * 4 + ow_end * 4; - op(result, output); + op(GiFixLenType2GiFloat32Type(result), output); } } } @@ -284,7 +407,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_2x2( const float* src, const float* filter, const float* bias, float* dst, const size_t IH, const size_t IW, const size_t OH, const size_t OW, const size_t PH, const size_t PW) { - GI_FLOAT32_t kernel[4]; + GI_FLOAT32_FIXLEN_t kernel[4]; load_vec<4>(kernel, filter); Op op; GI_FLOAT32_t init = GiZeroFloat32(); @@ -313,12 +436,12 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_2x2( size_t iw = ow - ow_start; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][4]; + GI_FLOAT32_FIXLEN_t dst_v[2][4]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[3][5]; + GI_FLOAT32_FIXLEN_t src_v[3][5]; load_vec<5>(src_v[0], input); COMPUTE_2X2(dst_v[0], src_v[0], &kernel[0]); load_vec<5>(src_v[1], input + IW * 4); @@ -327,21 +450,22 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_2x2( load_vec<5>(src_v[2], input + 2 * IW * 4); COMPUTE_2X2(dst_v[1], src_v[2], &kernel[2]); - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[0][2], dst_v[0][3]}}, output + 8); - op({{dst_v[1][0], dst_v[1][1]}}, output + OW * 4); - op({{dst_v[1][2], dst_v[1][3]}}, output + OW * 4 + 8); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[0][2], dst_v[0][3]), output + 8); + op(vis(dst_v[1][0], dst_v[1][1]), output + OW * 4); + op(vis(dst_v[1][2], dst_v[1][3]), output + OW * 4 + 8); } for (; ow < ow_end; ow++) { size_t iw = ow - ow_start; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2]; + GI_FLOAT32_FIXLEN_t dst_v[2]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( &dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[3][2]; + GI_FLOAT32_FIXLEN_t src_v[3][2]; load_vec<2>(src_v[0], input); compute_vec<2>(dst_v[0], &src_v[0][0], &kernel[0]); load_vec<2>(src_v[1], input + IW * 4); @@ -350,8 +474,8 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_2x2( load_vec<2>(src_v[2], input + 2 * IW * 4); compute_vec<2>(dst_v[1], &src_v[2][0], &kernel[2]); - op(dst_v[0], output); - op(dst_v[1], output + OW * 4); + op(GiFixLenType2GiFloat32Type(dst_v[0]), output); + op(GiFixLenType2GiFloat32Type(dst_v[1]), output + OW * 4); } } for (; oh < oh_end; oh++) { @@ -361,32 +485,33 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_2x2( size_t iw = ow - ow_start; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[1][4]; + GI_FLOAT32_FIXLEN_t dst_v[1][4]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; load_vec<5>(src_v[0], input); COMPUTE_2X2(dst_v[0], src_v[0], &kernel[0]); load_vec<5>(src_v[1], input + IW * 4); COMPUTE_2X2(dst_v[0], src_v[1], &kernel[2]); - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[0][2], dst_v[0][3]}}, output + 8); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[0][2], dst_v[0][3]), output + 8); } for (; ow < ow_end; ow++) { size_t iw = ow - ow_start; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][2]; + GI_FLOAT32_FIXLEN_t src_v[2][2]; load_vec<2>(src_v[0], input); compute_vec<2>(dst_v, &src_v[0][0], &kernel[0]); load_vec<2>(src_v[1], input + IW * 4); compute_vec<2>(dst_v, &src_v[1][0], &kernel[2]); - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } #undef COMPUTE_2X2 @@ -403,7 +528,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( return; } - GI_FLOAT32_t kernel[9]; + GI_FLOAT32_FIXLEN_t kernel[9]; load_vec<9>(kernel, filter); Op op; GI_FLOAT32_t init = GiZeroFloat32(); @@ -426,12 +551,12 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][4]; + GI_FLOAT32_FIXLEN_t dst_v[2][4]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][6]; + GI_FLOAT32_FIXLEN_t src_v[2][6]; load_vec<6>(src_v[0], input); compute_vec<3>(dst_v[0][0], &src_v[0][0], &kernel[0]); compute_vec<3>(dst_v[0][1], &src_v[0][1], &kernel[0]); @@ -461,21 +586,22 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( compute_vec<3>(dst_v[1][2], &src_v[1][2], &kernel[6]); compute_vec<3>(dst_v[1][3], &src_v[1][3], &kernel[6]); - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[0][2], dst_v[0][3]}}, output + 8); - op({{dst_v[1][0], dst_v[1][1]}}, output + OW * 4); - op({{dst_v[1][2], dst_v[1][3]}}, output + OW * 4 + 8); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[0][2], dst_v[0][3]), output + 8); + op(vis(dst_v[1][0], dst_v[1][1]), output + OW * 4); + op(vis(dst_v[1][2], dst_v[1][3]), output + OW * 4 + 8); } for (; ow < ow_end; ow++) { size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2]; + GI_FLOAT32_FIXLEN_t dst_v[2]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( &dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][3]; + GI_FLOAT32_FIXLEN_t src_v[2][3]; load_vec<3>(src_v[0], input); compute_vec<3>(dst_v[0], &src_v[0][0], &kernel[0]); load_vec<3>(src_v[1], input + IW * 4); @@ -487,8 +613,8 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( load_vec<3>(src_v[1], input + 3 * IW * 4); compute_vec<3>(dst_v[1], &src_v[1][0], &kernel[6]); - op(dst_v[0], output); - op(dst_v[1], output + OW * 4); + op(GiFixLenType2GiFloat32Type(dst_v[0]), output); + op(GiFixLenType2GiFloat32Type(dst_v[1]), output + OW * 4); } } for (; oh < oh_end; oh++) { @@ -498,10 +624,10 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[4]; + GI_FLOAT32_FIXLEN_t dst_v[4]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][6]; + GI_FLOAT32_FIXLEN_t src_v[2][6]; load_vec<6>(src_v[0], input); compute_vec<3>(dst_v[0], &src_v[0][0], &kernel[0]); compute_vec<3>(dst_v[1], &src_v[0][1], &kernel[0]); @@ -517,24 +643,25 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_3x3( compute_vec<3>(dst_v[1], &src_v[0][1], &kernel[6]); compute_vec<3>(dst_v[2], &src_v[0][2], &kernel[6]); compute_vec<3>(dst_v[3], &src_v[0][3], &kernel[6]); - op({{dst_v[0], dst_v[1]}}, output); - op({{dst_v[2], dst_v[3]}}, output + 8); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0], dst_v[1]), output); + op(vis(dst_v[2], dst_v[3]), output + 8); } for (; ow < ow_end; ow++) { size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[3][3]; + GI_FLOAT32_FIXLEN_t src_v[3][3]; load_vec<3>(src_v[0], input); compute_vec<3>(dst_v, &src_v[0][0], &kernel[0]); load_vec<3>(src_v[1], input + IW * 4); compute_vec<3>(dst_v, &src_v[1][0], &kernel[3]); load_vec<3>(src_v[2], input + 2 * IW * 4); compute_vec<3>(dst_v, &src_v[2][0], &kernel[6]); - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } } @@ -562,7 +689,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( if (PH || PW) { PaddingCompute::compute( src, bias, dst, 5, 1, IH, IW, OH, OW, PH, PW, - reinterpret_cast(filter), init); + reinterpret_cast(filter), init); } size_t oh = oh_start; for (; oh + 1 < oh_end; oh += 2) { @@ -572,13 +699,13 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][2]; + GI_FLOAT32_FIXLEN_t dst_v[2][2]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[2][5]; - GI_FLOAT32_t src_v[2][6]; + GI_FLOAT32_FIXLEN_t kernel[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][6]; #define COMPUTE_5X5_4(i, dst, src, kernel0, kernel1) \ load_vec<5>(kernel0, filter + i * 5 * 4); \ load_vec<6>(src, input + i * IW * 4); \ @@ -604,20 +731,21 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( compute_vec<5>(dst_v[1][0], &src_v[1][0], kernel[0]); compute_vec<5>(dst_v[1][1], &src_v[1][1], kernel[0]); #undef COMPUTE_5X5_4 - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[1][0], dst_v[1][1]}}, output + OW * 4); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[1][0], dst_v[1][1]), output + OW * 4); } for (; ow < ow_end; ow++) { size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][1]; + GI_FLOAT32_FIXLEN_t dst_v[2][1]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[2][5]; - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t kernel[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \ load_vec<5>(kernel0, filter + i * 5 * 4); \ load_vec<6>(src, input + i * IW * 4); \ @@ -639,8 +767,8 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( load_vec<5>(src_v[1], input + 5 * IW * 4); compute_vec<5>(dst_v[1][0], &src_v[1][0], kernel[0]); #undef COMPUTE_5X5_2 - op(dst_v[0][0], output); - op(dst_v[1][0], output + OW * 4); + op(GiFixLenType2GiFloat32Type(dst_v[0][0]), output); + op(GiFixLenType2GiFloat32Type(dst_v[1][0]), output + OW * 4); } } for (; oh < oh_end; oh++) { @@ -650,11 +778,11 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[1][2]; + GI_FLOAT32_FIXLEN_t dst_v[1][2]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[2][5]; - GI_FLOAT32_t src_v[2][6]; + GI_FLOAT32_FIXLEN_t kernel[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][6]; #define COMPUTE_5X5_2(i, dst, src, kernel) \ load_vec<5>(kernel, filter + i * 5 * 4); \ load_vec<6>(src, input + i * IW * 4); \ @@ -671,17 +799,18 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( // line 4 COMPUTE_5X5_2(4, dst_v, src_v[0], kernel[0]); #undef COMPUTE_5X5_2 - op({{dst_v[0][0], dst_v[0][1]}}, output); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); } for (; ow < ow_end; ow++) { size_t iw = ow - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[2][5]; - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t kernel[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_1(i, dst, src, kernel) \ load_vec<5>(kernel, filter + i * 5 * 4); \ load_vec<6>(src, input + i * IW * 4); \ @@ -697,7 +826,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride1_5x5( // line 4 COMPUTE_5X5_1(4, dst_v, src_v[0], kernel[0]); #undef COMPUTE_5X5_1 - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } } @@ -707,7 +836,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_2x2( const float* src, const float* filter, const float* bias, float* dst, const size_t IH, const size_t IW, const size_t OH, const size_t OW, const size_t PH, const size_t PW) { - GI_FLOAT32_t kernel[4]; + GI_FLOAT32_FIXLEN_t kernel[4]; load_vec<4>(kernel, filter); Op op; GI_FLOAT32_t init = GiZeroFloat32(); @@ -735,32 +864,33 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_2x2( size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[4]; + GI_FLOAT32_FIXLEN_t dst_v[4]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][8]; + GI_FLOAT32_FIXLEN_t src_v[2][8]; load_vec<8>(src_v[0], input); COMPUTE_2X2(dst_v, src_v[0], &kernel[0]); load_vec<8>(src_v[1], input + IW * 4); COMPUTE_2X2(dst_v, src_v[1], &kernel[2]); #undef COMPUTE_2X2 - op({{dst_v[0], dst_v[1]}}, output); - op({{dst_v[2], dst_v[3]}}, output + 8); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0], dst_v[1]), output); + op(vis(dst_v[2], dst_v[3]), output + 8); } for (; ow < ow_end; ow++) { size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][2]; + GI_FLOAT32_FIXLEN_t src_v[2][2]; load_vec<2>(src_v[0], input); compute_vec<2>(dst_v, &src_v[0][0], &kernel[0]); load_vec<2>(src_v[1], input + IW * 4); compute_vec<2>(dst_v, &src_v[1][0], &kernel[2]); - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } #undef COMPUTE_2X2 @@ -771,7 +901,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( const float* src, const float* filter, const float* bias, float* dst, const size_t IH, const size_t IW, const size_t OH, const size_t OW, const size_t PH, const size_t PW) { - GI_FLOAT32_t kernel[9]; + GI_FLOAT32_FIXLEN_t kernel[9]; load_vec<9>(kernel, filter); Op op; GI_FLOAT32_t init = GiZeroFloat32(); @@ -797,12 +927,12 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][2]; + GI_FLOAT32_FIXLEN_t dst_v[2][2]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; load_vec<5>(src_v[0], input); compute_vec<3>(dst_v[0][0], &src_v[0][0], &kernel[0]); compute_vec<3>(dst_v[0][1], &src_v[0][2], &kernel[0]); @@ -821,19 +951,20 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( compute_vec<3>(dst_v[1][0], &src_v[0][0], &kernel[6]); compute_vec<3>(dst_v[1][1], &src_v[0][2], &kernel[6]); - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[1][0], dst_v[1][1]}}, output + OW * 4); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[1][0], dst_v[1][1]), output + OW * 4); } for (; ow < ow_end; ow++) { size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2]; + GI_FLOAT32_FIXLEN_t dst_v[2]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( &dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[2][3]; + GI_FLOAT32_FIXLEN_t src_v[2][3]; load_vec<3>(src_v[0], input); compute_vec<3>(dst_v[0], &src_v[0][0], &kernel[0]); load_vec<3>(src_v[1], input + IW * 4); @@ -846,8 +977,8 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( load_vec<3>(src_v[0], input + 4 * IW * 4); compute_vec<3>(dst_v[1], &src_v[0][0], &kernel[6]); - op(dst_v[0], output); - op(dst_v[1], output + OW * 4); + op(GiFixLenType2GiFloat32Type(dst_v[0]), output); + op(GiFixLenType2GiFloat32Type(dst_v[1]), output + OW * 4); } } for (; oh < oh_end; oh++) { @@ -857,10 +988,10 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2]; + GI_FLOAT32_FIXLEN_t dst_v[2]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[3][5]; + GI_FLOAT32_FIXLEN_t src_v[3][5]; load_vec<5>(src_v[0], input); compute_vec<3>(dst_v[0], &src_v[0][0], &kernel[0]); compute_vec<3>(dst_v[1], &src_v[0][2], &kernel[0]); @@ -870,23 +1001,24 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_3x3( load_vec<5>(src_v[2], input + 2 * IW * 4); compute_vec<3>(dst_v[0], &src_v[2][0], &kernel[6]); compute_vec<3>(dst_v[1], &src_v[2][2], &kernel[6]); - op({{dst_v[0], dst_v[1]}}, output); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0], dst_v[1]), output); } for (; ow < ow_end; ow++) { size_t iw = ow * 2 - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t src_v[3][3]; + GI_FLOAT32_FIXLEN_t src_v[3][3]; load_vec<3>(src_v[0], input); compute_vec<3>(dst_v, &src_v[0][0], &kernel[0]); load_vec<3>(src_v[1], input + IW * 4); compute_vec<3>(dst_v, &src_v[1][0], &kernel[3]); load_vec<3>(src_v[2], input + 2 * IW * 4); compute_vec<3>(dst_v, &src_v[2][0], &kernel[6]); - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } } @@ -909,7 +1041,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( if (PH || PW) { PaddingCompute::compute( src, bias, dst, 5, stride, IH, IW, OH, OW, PH, PW, - reinterpret_cast(filter), init); + reinterpret_cast(filter), init); } size_t oh = oh_start; for (; oh + 1 < oh_end; oh += 2) { @@ -919,13 +1051,13 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( size_t iw = ow * stride - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2][2]; + GI_FLOAT32_FIXLEN_t dst_v[2][2]; load_bias_vec::impl( dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[3][5]; - GI_FLOAT32_t src_v[2][7]; + GI_FLOAT32_FIXLEN_t kernel[3][5]; + GI_FLOAT32_FIXLEN_t src_v[2][7]; #define COMPUTE_5X5_4(i, dst, src, kernel0, kernel1) \ load_vec<5>(kernel0, filter + i * 5 * 4); \ load_vec<7>(src, input + i * IW * 4); \ @@ -956,20 +1088,21 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( COMPUTE_5X5_2(6, dst_v[1], src_v[0], kernel[1]); #undef COMPUTE_5X5_4 #undef COMPUTE_5X5_2 - op({{dst_v[0][0], dst_v[0][1]}}, output); - op({{dst_v[1][0], dst_v[1][1]}}, output + OW * 4); + ParamElemFixLenVisitorV2None vis; + op(vis(dst_v[0][0], dst_v[0][1]), output); + op(vis(dst_v[1][0], dst_v[1][1]), output + OW * 4); } for (; ow < ow_end; ow++) { size_t iw = ow * stride - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v[2]; + GI_FLOAT32_FIXLEN_t dst_v[2]; load_bias_vec::impl( &dst_v[0], init, bias + oh * OW * 4 + ow * 4); load_bias_vec::impl( &dst_v[1], init, bias + (oh + 1) * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[3][5]; - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t kernel[3][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_2(i, dst, src, kernel0, kernel1) \ load_vec<5>(kernel0, filter + i * 5 * 4); \ load_vec<5>(src, input + i * IW * 4); \ @@ -997,8 +1130,8 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( COMPUTE_5X5_1(6, dst_v[1], src_v[0], kernel[1]); #undef COMPUTE_5X5_2 #undef COMPUTE_5X5_1 - op(dst_v[0], output); - op(dst_v[1], output + OW * 4); + op(GiFixLenType2GiFloat32Type(dst_v[0]), output); + op(GiFixLenType2GiFloat32Type(dst_v[1]), output + OW * 4); } } for (; oh < oh_end; oh++) { @@ -1008,11 +1141,11 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( size_t iw = ow * stride - PW; const float* input = src + ih * IW * 4 + iw * 4; float* output = dst + oh * OW * 4 + ow * 4; - GI_FLOAT32_t dst_v; + GI_FLOAT32_FIXLEN_t dst_v; load_bias_vec::impl( &dst_v, init, bias + oh * OW * 4 + ow * 4); - GI_FLOAT32_t kernel[2][5]; - GI_FLOAT32_t src_v[2][5]; + GI_FLOAT32_FIXLEN_t kernel[2][5]; + GI_FLOAT32_FIXLEN_t src_v[2][5]; #define COMPUTE_5X5_1(i, dst, src, kernel) \ load_vec<5>(kernel, filter + i * 5 * 4); \ load_vec<6>(src, input + i * IW * 4); \ @@ -1028,7 +1161,7 @@ void channel_wise_nchw44_float::do_conv_kern_stride2_5x5( // line 4 COMPUTE_5X5_1(4, dst_v, src_v[0], kernel[0]); #undef COMPUTE_5X5_1 - op(dst_v, output); + op(GiFixLenType2GiFloat32Type(dst_v), output); } } } diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct.cpp b/dnn/src/fallback/conv_bias/gi/fp32/direct.cpp index bf2756445..d62fc4694 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct.cpp @@ -93,7 +93,12 @@ struct do_pixel_proxy<1, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -134,7 +139,12 @@ struct do_pixel_proxy<2, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -187,7 +197,12 @@ struct do_pixel_proxy<3, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, kr2, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, kr2, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -252,7 +267,12 @@ struct do_pixel_proxy<4, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, kr2, kr3, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, kr2, kr3, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -329,7 +349,12 @@ struct do_pixel_proxy<5, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, kr2, kr3, kr4, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, kr2, kr3, kr4, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -418,8 +443,12 @@ struct do_pixel_proxy<6, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, kr2, kr3, kr4, kr5, - inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, kr2, kr3, kr4, kr5, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; @@ -520,8 +549,12 @@ struct do_pixel_proxy<7, height, width> { (void)IH; (void)OH; const int ih = oh, iw = ow; - GI_FLOAT32_t out0{0}, out1{0}, out2{0}, out3{0}, kr0, kr1, kr2, kr3, kr4, kr5, - kr6, inp; + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t out0, out1, out2, out3, kr0, kr1, kr2, kr3, kr4, kr5, kr6, inp; + out0 = zero; + out1 = zero; + out2 = zero; + out3 = zero; src += ih * IW + iw; dst += oh * OW + ow; LOAD_OUT; diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern.cpp b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern.cpp index ef97e1e43..a911ea8bd 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern.cpp @@ -38,23 +38,23 @@ static inline void odd_even_split_iw8_even( const int src_offset = src_idx * ic_step; const int even_offset = iw_idx / 2 * ic_step; const int odd_offset = (odd_start + iw_idx / 2) * ic_step; - GI_FLOAT32_t temp[8]; - temp[0] = GiLoadFloat32(sptr + src_offset + 0 * ic_step); - temp[1] = GiLoadFloat32(sptr + src_offset + 1 * ic_step); - temp[2] = GiLoadFloat32(sptr + src_offset + 2 * ic_step); - temp[3] = GiLoadFloat32(sptr + src_offset + 3 * ic_step); - temp[4] = GiLoadFloat32(sptr + src_offset + 4 * ic_step); - temp[5] = GiLoadFloat32(sptr + src_offset + 5 * ic_step); - temp[6] = GiLoadFloat32(sptr + src_offset + 6 * ic_step); - temp[7] = GiLoadFloat32(sptr + src_offset + 7 * ic_step); - GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, temp[0]); - GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, temp[2]); - GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, temp[4]); - GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, temp[6]); - GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, temp[1]); - GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, temp[3]); - GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, temp[5]); - GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, temp[7]); + GI_FLOAT32_t a0, a1, a2, a3, a4, a5, a6, a7; + a0 = GiLoadFloat32(sptr + src_offset + 0 * ic_step); + a1 = GiLoadFloat32(sptr + src_offset + 1 * ic_step); + a2 = GiLoadFloat32(sptr + src_offset + 2 * ic_step); + a3 = GiLoadFloat32(sptr + src_offset + 3 * ic_step); + a4 = GiLoadFloat32(sptr + src_offset + 4 * ic_step); + a5 = GiLoadFloat32(sptr + src_offset + 5 * ic_step); + a6 = GiLoadFloat32(sptr + src_offset + 6 * ic_step); + a7 = GiLoadFloat32(sptr + src_offset + 7 * ic_step); + GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, a0); + GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, a2); + GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, a4); + GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, a6); + GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, a1); + GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, a3); + GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, a5); + GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, a7); } static inline void odd_even_split_iw8_odd( @@ -64,23 +64,23 @@ static inline void odd_even_split_iw8_odd( const int src_offset = src_idx * ic_step; const int even_offset = (iw_idx + 1) / 2 * ic_step; const int odd_offset = (odd_start + iw_idx / 2) * ic_step; - GI_FLOAT32_t temp[8]; - temp[0] = GiLoadFloat32(sptr + src_offset + 0 * ic_step); - temp[1] = GiLoadFloat32(sptr + src_offset + 1 * ic_step); - temp[2] = GiLoadFloat32(sptr + src_offset + 2 * ic_step); - temp[3] = GiLoadFloat32(sptr + src_offset + 3 * ic_step); - temp[4] = GiLoadFloat32(sptr + src_offset + 4 * ic_step); - temp[5] = GiLoadFloat32(sptr + src_offset + 5 * ic_step); - temp[6] = GiLoadFloat32(sptr + src_offset + 6 * ic_step); - temp[7] = GiLoadFloat32(sptr + src_offset + 7 * ic_step); - GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, temp[0]); - GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, temp[2]); - GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, temp[4]); - GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, temp[6]); - GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, temp[1]); - GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, temp[3]); - GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, temp[5]); - GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, temp[7]); + GI_FLOAT32_t a0, a1, a2, a3, a4, a5, a6, a7; + a0 = GiLoadFloat32(sptr + src_offset + 0 * ic_step); + a1 = GiLoadFloat32(sptr + src_offset + 1 * ic_step); + a2 = GiLoadFloat32(sptr + src_offset + 2 * ic_step); + a3 = GiLoadFloat32(sptr + src_offset + 3 * ic_step); + a4 = GiLoadFloat32(sptr + src_offset + 4 * ic_step); + a5 = GiLoadFloat32(sptr + src_offset + 5 * ic_step); + a6 = GiLoadFloat32(sptr + src_offset + 6 * ic_step); + a7 = GiLoadFloat32(sptr + src_offset + 7 * ic_step); + GiStoreFloat32(sptr_base + odd_offset + 0 * ic_step, a0); + GiStoreFloat32(sptr_base + odd_offset + 1 * ic_step, a2); + GiStoreFloat32(sptr_base + odd_offset + 2 * ic_step, a4); + GiStoreFloat32(sptr_base + odd_offset + 3 * ic_step, a6); + GiStoreFloat32(sptr_base + even_offset + 0 * ic_step, a1); + GiStoreFloat32(sptr_base + even_offset + 1 * ic_step, a3); + GiStoreFloat32(sptr_base + even_offset + 2 * ic_step, a5); + GiStoreFloat32(sptr_base + even_offset + 3 * ic_step, a7); } } // namespace diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h index 240cf1b46..ba4f14a59 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h @@ -25,14 +25,20 @@ struct ShiftCalHelper { }; #define cb2(step, lane, ow_block) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane); \ - c[1][step] = GiSimdFmaLane( \ - c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane); - -#define cb(step, lane, ow_block) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane); + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ + c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[1][step]), \ + GiFixLenType2GiFloat32Type(weight[1][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); + +#define cb(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); #define SHIFT_CAL_HELPER(ow_block, remain_w) \ template < \ @@ -133,19 +139,20 @@ struct KerGiXXs1Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][ic_step]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block)*ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -175,23 +182,25 @@ struct KerGiXXs1Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][ic_step]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block)*ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -220,35 +229,39 @@ struct KerGiXXs1Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][ic_step]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block)*ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[2] = GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step); + src[2] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[3] = GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step); + src[3] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -278,45 +291,51 @@ struct KerGiXXs1Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][ic_step]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step]; load_helper(src, src_ptr, 0); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + (ow_block)*ic_step); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block)*ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[2] = GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step); + src[2] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[3] = GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step); + src[3] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[4] = GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step); + src[4] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[5] = GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step); + src[5] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step)); load_helper( weight, weight_ptr, ld_weight_oc); cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h index f2d5fae21..e33e2538b 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h @@ -25,14 +25,20 @@ struct ShiftCalHelper { }; #define cb2(step, lane, ow_block) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane); \ - c[1][step] = GiSimdFmaLane( \ - c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane); - -#define cb(step, lane, ow_block) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane); + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \ + c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[1][step]), \ + GiFixLenType2GiFloat32Type(weight[1][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); + +#define cb(step, lane, ow_block) \ + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][lane]), \ + GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); #define SHIFT_CAL_HELPER(ow_block, remain_w) \ template < \ @@ -133,15 +139,15 @@ struct KerGiXXs2Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][4]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][4]; /////////row 0///////////// load_helper(src, src_ptr, 0); load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( @@ -191,21 +197,22 @@ struct KerGiXXs2Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic; const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][4]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][4]; /////////row 0///////////// load_helper(src, src_ptr, 0); load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + ow_block * simd_len)); load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -222,7 +229,8 @@ struct KerGiXXs2Nchw44FP32 { load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + ow_block * simd_len)); load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -239,7 +247,8 @@ struct KerGiXXs2Nchw44FP32 { load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + ow_block * simd_len)); load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); @@ -275,7 +284,7 @@ struct KerGiXXs2Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { @@ -283,18 +292,20 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][4]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][4]; // even element load_helper(src, src_ptr, 0); load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + ow_block * simd_len)); load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -303,7 +314,8 @@ struct KerGiXXs2Nchw44FP32 { load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr_odd + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -340,7 +352,7 @@ struct KerGiXXs2Nchw44FP32 { const int ld_src_ic = ih * iw; const int ld_src_iw = iw * oc_step; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[c_dim][ow_block]; + GI_FLOAT32_FIXLEN_t c[c_dim][ow_block]; init_ocx_ow8(c, bias_ptr, ld_bias); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { @@ -348,22 +360,25 @@ struct KerGiXXs2Nchw44FP32 { const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic; for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) { - GI_FLOAT32_t src[ow_block]; - GI_FLOAT32_t weight[c_dim][4]; + GI_FLOAT32_FIXLEN_t src[ow_block]; + GI_FLOAT32_FIXLEN_t weight[c_dim][4]; // even element load_helper(src, src_ptr, 0); load_helper<4, 0, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + ow_block * simd_len)); load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len)); load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[2] = GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len); + src[2] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len)); load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight); @@ -372,11 +387,13 @@ struct KerGiXXs2Nchw44FP32 { load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[0] = GiLoadFloat32(src_ptr_odd + ow_block * simd_len); + src[0] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr_odd + ow_block * simd_len)); load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight); - src[1] = GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len); + src[1] = GiFloat32Type2FixLenType( + GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len)); load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>( weight, weight_ptr, ld_weight_oc); cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h index a4c4abace..2a434158e 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h @@ -37,18 +37,24 @@ struct ShiftCalHelper { static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {} }; -#define cb(step) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ - (step * stride + src_idx) % 4); \ - c[1][step] = GiSimdFmaLane( \ - c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \ - (step * stride + src_idx) % 4); - -#define cb2(step) \ - c[0][step] = GiSimdFmaLane( \ - c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \ - (step * stride + src_idx) % 4); +#define cb(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ + GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ + (step * stride + src_idx) % 4)); \ + c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[1][step]), \ + GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \ + GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ + (step * stride + src_idx) % 4)); + +#define cb2(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \ + GiFixLenType2GiFloat32Type(c[0][step]), \ + GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \ + GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \ + (step * stride + src_idx) % 4)); #define SHIFT_CAL_HELPER(ow_remain) \ template < \ @@ -141,12 +147,12 @@ struct KerGiXXs2NchwNchw44FP32::val; - GI_FLOAT32_t c[c_dim][8]; + GI_FLOAT32_FIXLEN_t c[c_dim][8]; init_ocx_ow8(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - GI_FLOAT32_t src[src_reg_size]; - GI_FLOAT32_t weight[c_dim][filter_size]; + GI_FLOAT32_FIXLEN_t src[src_reg_size]; + GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; #define KERNEL_CB(step) \ load_helper(src, src_ptr + step * iw, 0); \ @@ -190,12 +196,12 @@ struct KerGiXXs2NchwNchw44FP32::val; - GI_FLOAT32_t c[c_dim][8]; + GI_FLOAT32_FIXLEN_t c[c_dim][8]; init_ocx_ow8(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - GI_FLOAT32_t src[src_reg_size]; - GI_FLOAT32_t weight[c_dim][filter_size]; + GI_FLOAT32_FIXLEN_t src[src_reg_size]; + GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; #define KERNEL_CB(step) \ load_helper(src, src_ptr + step * iw, 0); \ @@ -236,12 +242,12 @@ struct KerGiXXs2NchwNchw44FP32::val; - GI_FLOAT32_t c[c_dim][8]; + GI_FLOAT32_FIXLEN_t c[c_dim][8]; init_ocx_ow8(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - GI_FLOAT32_t src[src_reg_size]; - GI_FLOAT32_t weight[c_dim][filter_size]; + GI_FLOAT32_FIXLEN_t src[src_reg_size]; + GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; // row 0 load_helper(src, src_ptr, 0); load_helper( @@ -295,7 +301,7 @@ struct KerGiXXs2NchwNchw44FP32 { const int ld_src_ic_skip_bytes = iw * (ih - filter_size) * sizeof(float) + iw_skip_bytes; constexpr int c_dim = OCHelper::val; - GI_FLOAT32_t c[1][8]; + GI_FLOAT32_FIXLEN_t c[1][8]; init_ocx_ow8(c, bias_ptr, oc_step); const int img_stride = ih * iw; constexpr int filter_stride = filter_size * filter_size * oc_step; @@ -467,7 +473,7 @@ struct KerGiXXs2NchwNchw44FP32::val; - GI_FLOAT32_t c[1][8]; + GI_FLOAT32_FIXLEN_t c[1][8]; init_ocx_ow8(c, bias_ptr, oc_step); /** * c q8-q15 @@ -627,12 +633,12 @@ struct KerGiXXs2NchwNchw44FP32::val; - GI_FLOAT32_t c[c_dim][8]; + GI_FLOAT32_FIXLEN_t c[c_dim][8]; init_ocx_ow8(c, bias_ptr, oc_step); for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) { - GI_FLOAT32_t src[src_reg_size]; - GI_FLOAT32_t weight[c_dim][filter_size]; + GI_FLOAT32_FIXLEN_t src[src_reg_size]; + GI_FLOAT32_FIXLEN_t weight[c_dim][filter_size]; // row 0 load_helper(src, src_ptr, 0); load_helper( diff --git a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp index bbac1bf08..d45f9610b 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp @@ -38,16 +38,16 @@ void conv_stride2::do_conv_2x2_stride2( GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0); - GI_FLOAT32_t _r00 = _r0.val[0]; // 0 2 4 6 - GI_FLOAT32_t _r01 = _r0.val[1]; // 1 3 5 7 + GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 + GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1); - GI_FLOAT32_t _r10 = _r1.val[0]; - GI_FLOAT32_t _r11 = _r1.val[1]; + GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); + GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); _outp = GiSimdFmaLane(_outp, _r10, _k0123, 2); _outp = GiSimdFmaLane(_outp, _r11, _k0123, 3); @@ -97,9 +97,10 @@ void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0); GI_FLOAT32_V2_t _r0n = GiLd2qFloat32(r0 + 8); - GI_FLOAT32_t _r00 = _r0.val[0]; // 0 2 4 6 - GI_FLOAT32_t _r01 = _r0.val[1]; // 1 3 5 7 - GI_FLOAT32_t _r02 = GiExtqFloat32(_r00, _r0n.val[0], 1); // 2 4 6 8 + GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6 + GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7 + GI_FLOAT32_t _r02 = GiExtqFloat32( + _r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8 _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0); _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1); @@ -108,9 +109,10 @@ void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1); GI_FLOAT32_V2_t _r1n = GiLd2qFloat32(r1 + 8); - GI_FLOAT32_t _r10 = _r1.val[0]; - GI_FLOAT32_t _r11 = _r1.val[1]; - GI_FLOAT32_t _r12 = GiExtqFloat32(_r10, _r1n.val[0], 1); + GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0); + GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1); + GI_FLOAT32_t _r12 = + GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1); _outp = GiSimdFmaLane(_outp, _r10, _k3456, 0); _outp = GiSimdFmaLane(_outp, _r11, _k3456, 1); @@ -119,9 +121,10 @@ void conv_stride2::do_conv_3x3_stride2( GI_FLOAT32_V2_t _r2 = GiLd2qFloat32(r2); GI_FLOAT32_V2_t _r2n = GiLd2qFloat32(r2 + 8); - GI_FLOAT32_t _r20 = _r2.val[0]; - GI_FLOAT32_t _r21 = _r2.val[1]; - GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r2n.val[0], 1); + GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r2, 0); + GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r2, 1); + GI_FLOAT32_t _r22 = + GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1); _outp = GiSimdFmaLane(_outp, _r20, _k6789, 0); _outp = GiSimdFmaLane(_outp, _r21, _k6789, 1); @@ -175,50 +178,54 @@ void conv_stride2::do_conv_5x5_stride2( GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0); GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8); - GI_FLOAT32_t _r0_8101214 = _r00nx2.val[0]; // 8 10 12 14 - GI_FLOAT32_t _r0_9111315 = _r00nx2.val[1]; // 9 11 13 15 - GI_FLOAT32_t _r00 = _r00_02461357.val[0]; // 0 2 4 6 - GI_FLOAT32_t _r01 = _r00_02461357.val[1]; // 1 3 5 7 + GI_FLOAT32_t _r0_8101214 = + GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14 + GI_FLOAT32_t _r0_9111315 = + GiGetSubVectorFloat32V2(_r00nx2, 1); // 9 11 13 15 + GI_FLOAT32_t _r00 = + GiGetSubVectorFloat32V2(_r00_02461357, 0); // 0 2 4 6 + GI_FLOAT32_t _r01 = + GiGetSubVectorFloat32V2(_r00_02461357, 1); // 1 3 5 7 GI_FLOAT32_t _r02 = GiExtqFloat32(_r00, _r0_8101214, 1); // 2 4 6 8 GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1); // 3 5 7 9 GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2); // 4 6 8 10 GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1); GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8); - GI_FLOAT32_t _r1_8101214 = _r10nx2.val[0]; - GI_FLOAT32_t _r1_9111315 = _r10nx2.val[1]; - GI_FLOAT32_t _r10 = _r10_02461357.val[0]; - GI_FLOAT32_t _r11 = _r10_02461357.val[1]; + GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0); + GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1); + GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0); + GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r10_02461357, 1); GI_FLOAT32_t _r12 = GiExtqFloat32(_r10, _r1_8101214, 1); GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1); GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2); GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2); GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8); - GI_FLOAT32_t _r2_8101214 = _r20nx2.val[0]; - GI_FLOAT32_t _r2_9111315 = _r20nx2.val[1]; - GI_FLOAT32_t _r20 = _r20_02461357.val[0]; - GI_FLOAT32_t _r21 = _r20_02461357.val[1]; + GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0); + GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1); + GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0); + GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r20_02461357, 1); GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r2_8101214, 1); GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1); GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2); GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3); GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8); - GI_FLOAT32_t _r3_8101214 = _r30nx2.val[0]; - GI_FLOAT32_t _r3_9111315 = _r30nx2.val[1]; - GI_FLOAT32_t _r30 = _r30_02461357.val[0]; - GI_FLOAT32_t _r31 = _r30_02461357.val[1]; + GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0); + GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1); + GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0); + GI_FLOAT32_t _r31 = GiGetSubVectorFloat32V2(_r30_02461357, 1); GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r3_8101214, 1); GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1); GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2); GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4); GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8); - GI_FLOAT32_t _r4_8101214 = _r40nx2.val[0]; - GI_FLOAT32_t _r4_9111315 = _r40nx2.val[1]; - GI_FLOAT32_t _r40 = _r40_02461357.val[0]; - GI_FLOAT32_t _r41 = _r40_02461357.val[1]; + GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0); + GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1); + GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0); + GI_FLOAT32_t _r41 = GiGetSubVectorFloat32V2(_r40_02461357, 1); GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r4_8101214, 1); GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); @@ -310,10 +317,14 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0); GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8); - GI_FLOAT32_t _r0_8101214 = _r00nx2.val[0]; // 8 10 12 14 - GI_FLOAT32_t _r0_9111315 = _r00nx2.val[1]; // 9 11 13 15 - GI_FLOAT32_t _r00 = _r00_02461357.val[0]; // 0 2 4 6 - GI_FLOAT32_t _r01 = _r00_02461357.val[1]; // 1 3 5 7 + GI_FLOAT32_t _r0_8101214 = + GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14 + GI_FLOAT32_t _r0_9111315 = + GiGetSubVectorFloat32V2(_r00nx2, 1); // 9 11 13 15 + GI_FLOAT32_t _r00 = + GiGetSubVectorFloat32V2(_r00_02461357, 0); // 0 2 4 6 + GI_FLOAT32_t _r01 = + GiGetSubVectorFloat32V2(_r00_02461357, 1); // 1 3 5 7 GI_FLOAT32_t _r02 = GiExtqFloat32(_r00, _r0_8101214, 1); // 2 4 6 8 GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1); // 3 5 7 9 GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2); // 4 6 8 10 @@ -333,10 +344,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1); GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8); - GI_FLOAT32_t _r1_8101214 = _r10nx2.val[0]; - GI_FLOAT32_t _r1_9111315 = _r10nx2.val[1]; - GI_FLOAT32_t _r10 = _r10_02461357.val[0]; - GI_FLOAT32_t _r11 = _r10_02461357.val[1]; + GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0); + GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1); + GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0); + GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r10_02461357, 1); GI_FLOAT32_t _r12 = GiExtqFloat32(_r10, _r1_8101214, 1); GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1); GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2); @@ -356,10 +367,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2); GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8); - GI_FLOAT32_t _r2_8101214 = _r20nx2.val[0]; - GI_FLOAT32_t _r2_9111315 = _r20nx2.val[1]; - GI_FLOAT32_t _r20 = _r20_02461357.val[0]; - GI_FLOAT32_t _r21 = _r20_02461357.val[1]; + GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0); + GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1); + GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0); + GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r20_02461357, 1); GI_FLOAT32_t _r22 = GiExtqFloat32(_r20, _r2_8101214, 1); GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1); GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2); @@ -379,10 +390,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3); GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8); - GI_FLOAT32_t _r3_8101214 = _r30nx2.val[0]; - GI_FLOAT32_t _r3_9111315 = _r30nx2.val[1]; - GI_FLOAT32_t _r30 = _r30_02461357.val[0]; - GI_FLOAT32_t _r31 = _r30_02461357.val[1]; + GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0); + GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1); + GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0); + GI_FLOAT32_t _r31 = GiGetSubVectorFloat32V2(_r30_02461357, 1); GI_FLOAT32_t _r32 = GiExtqFloat32(_r30, _r3_8101214, 1); GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1); GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2); @@ -402,10 +413,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4); GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8); - GI_FLOAT32_t _r4_8101214 = _r40nx2.val[0]; - GI_FLOAT32_t _r4_9111315 = _r40nx2.val[1]; - GI_FLOAT32_t _r40 = _r40_02461357.val[0]; - GI_FLOAT32_t _r41 = _r40_02461357.val[1]; + GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0); + GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1); + GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0); + GI_FLOAT32_t _r41 = GiGetSubVectorFloat32V2(_r40_02461357, 1); GI_FLOAT32_t _r42 = GiExtqFloat32(_r40, _r4_8101214, 1); GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1); GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2); @@ -425,10 +436,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r50_02461357 = GiLd2qFloat32(r5); GI_FLOAT32_V2_t _r50nx2 = GiLd2qFloat32(r5 + 8); - GI_FLOAT32_t _r5_8101214 = _r50nx2.val[0]; - GI_FLOAT32_t _r5_9111315 = _r50nx2.val[1]; - GI_FLOAT32_t _r50 = _r50_02461357.val[0]; - GI_FLOAT32_t _r51 = _r50_02461357.val[1]; + GI_FLOAT32_t _r5_8101214 = GiGetSubVectorFloat32V2(_r50nx2, 0); + GI_FLOAT32_t _r5_9111315 = GiGetSubVectorFloat32V2(_r50nx2, 1); + GI_FLOAT32_t _r50 = GiGetSubVectorFloat32V2(_r50_02461357, 0); + GI_FLOAT32_t _r51 = GiGetSubVectorFloat32V2(_r50_02461357, 1); GI_FLOAT32_t _r52 = GiExtqFloat32(_r50, _r5_8101214, 1); GI_FLOAT32_t _r53 = GiExtqFloat32(_r51, _r5_9111315, 1); GI_FLOAT32_t _r54 = GiExtqFloat32(_r50, _r5_8101214, 2); @@ -448,10 +459,10 @@ void conv_stride2::do_conv_7x7_stride2( GI_FLOAT32_V2_t _r60_02461357 = GiLd2qFloat32(r6); GI_FLOAT32_V2_t _r60nx2 = GiLd2qFloat32(r6 + 8); - GI_FLOAT32_t _r6_8101214 = _r60nx2.val[0]; - GI_FLOAT32_t _r6_9111315 = _r60nx2.val[1]; - GI_FLOAT32_t _r60 = _r60_02461357.val[0]; - GI_FLOAT32_t _r61 = _r60_02461357.val[1]; + GI_FLOAT32_t _r6_8101214 = GiGetSubVectorFloat32V2(_r60nx2, 0); + GI_FLOAT32_t _r6_9111315 = GiGetSubVectorFloat32V2(_r60nx2, 1); + GI_FLOAT32_t _r60 = GiGetSubVectorFloat32V2(_r60_02461357, 0); + GI_FLOAT32_t _r61 = GiGetSubVectorFloat32V2(_r60_02461357, 1); GI_FLOAT32_t _r62 = GiExtqFloat32(_r60, _r6_8101214, 1); GI_FLOAT32_t _r63 = GiExtqFloat32(_r61, _r6_9111315, 1); GI_FLOAT32_t _r64 = GiExtqFloat32(_r60, _r6_8101214, 2); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/filter_transform.h b/dnn/src/fallback/conv_bias/gi/fp32/filter_transform.h index cae0245d7..de1d15ce6 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/filter_transform.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/filter_transform.h @@ -54,7 +54,8 @@ struct FilterTransform6X3 { Vector g2 = Vector::load(fptr + 6 - 1); GI_FLOAT32_t zeros = GiZeroFloat32(); - g2.value = GiExtqFloat32(g2.value, zeros, 1); + g2.value = GiFloat32Type2FixLenType( + GiExtqFloat32(GiFixLenType2GiFloat32Type(g2.value), zeros, 1)); #define cb(i) Vector wd##i; UNROLL_CALL_NOWRAPPER(8, cb); @@ -115,7 +116,8 @@ struct FilterTransform6X3 { mid_buf1[7] = GET_VECTOR_ELEM(wd, i, 2); \ mid_buf1 += 8; \ } while (0); -#define GET_VECTOR_ELEM(s, i, idx) GiExtractLane##idx##Float32(CONCAT(s, i).value) +#define GET_VECTOR_ELEM(s, i, idx) \ + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value)) float* mid_buf1 = transform_mid_buf; UNROLL_CALL_NOWRAPPER(8, cb); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/helper.h b/dnn/src/fallback/conv_bias/gi/fp32/helper.h index 4a93fb673..2b38f8ef7 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/helper.h +++ b/dnn/src/fallback/conv_bias/gi/fp32/helper.h @@ -6,18 +6,22 @@ namespace megdnn { namespace fallback { inline void transpose_4x4(const float* src, float* dst, int lda, int ldb) { GI_FLOAT32_V2_t a0, a1; - a0.val[0] = GiLoadFloat32(src + 0 * lda); - a0.val[1] = GiLoadFloat32(src + 1 * lda); - a1.val[0] = GiLoadFloat32(src + 2 * lda); - a1.val[1] = GiLoadFloat32(src + 3 * lda); - GI_FLOAT32_V2_t b0 = GiZipqFloat32(a0.val[0], a1.val[0]); - GI_FLOAT32_V2_t b1 = GiZipqFloat32(a0.val[1], a1.val[1]); - GI_FLOAT32_V2_t c0 = GiZipqFloat32(b0.val[0], b1.val[0]); - GI_FLOAT32_V2_t c1 = GiZipqFloat32(b0.val[1], b1.val[1]); - GiStoreFloat32(dst + 0 * ldb, c0.val[0]); - GiStoreFloat32(dst + 1 * ldb, c0.val[1]); - GiStoreFloat32(dst + 2 * ldb, c1.val[0]); - GiStoreFloat32(dst + 3 * ldb, c1.val[1]); + GiSetSubVectorFloat32V2(a0, 0, GiLoadFloat32(src + 0 * lda)); + GiSetSubVectorFloat32V2(a0, 1, GiLoadFloat32(src + 1 * lda)); + GiSetSubVectorFloat32V2(a1, 0, GiLoadFloat32(src + 2 * lda)); + GiSetSubVectorFloat32V2(a1, 1, GiLoadFloat32(src + 3 * lda)); + GI_FLOAT32_V2_t b0 = GiZipqFloat32( + GiGetSubVectorFloat32V2(a0, 0), GiGetSubVectorFloat32V2(a1, 0)); + GI_FLOAT32_V2_t b1 = GiZipqFloat32( + GiGetSubVectorFloat32V2(a0, 1), GiGetSubVectorFloat32V2(a1, 1)); + GI_FLOAT32_V2_t c0 = GiZipqFloat32( + GiGetSubVectorFloat32V2(b0, 0), GiGetSubVectorFloat32V2(b1, 0)); + GI_FLOAT32_V2_t c1 = GiZipqFloat32( + GiGetSubVectorFloat32V2(b0, 1), GiGetSubVectorFloat32V2(b1, 1)); + GiStoreFloat32(dst + 0 * ldb, GiGetSubVectorFloat32V2(c0, 0)); + GiStoreFloat32(dst + 1 * ldb, GiGetSubVectorFloat32V2(c0, 1)); + GiStoreFloat32(dst + 2 * ldb, GiGetSubVectorFloat32V2(c1, 0)); + GiStoreFloat32(dst + 3 * ldb, GiGetSubVectorFloat32V2(c1, 1)); } } // namespace fallback } // namespace megdnn @@ -159,27 +163,43 @@ inline void transpose_4x4(const float* src, float* dst, int lda, int ldb) { GiReinterpretqFloat32ToS64(b3.val[1]))); #else -#define TRANSPOSE_8x4(a, ret) \ - auto b0 = GiZipqFloat32(CONCAT(a, 0).value, CONCAT(a, 1).value); \ - auto b1 = GiZipqFloat32(CONCAT(a, 2).value, CONCAT(a, 3).value); \ - auto b2 = GiZipqFloat32(CONCAT(a, 4).value, CONCAT(a, 5).value); \ - auto b3 = GiZipqFloat32(CONCAT(a, 6).value, CONCAT(a, 7).value); \ - CONCAT(ret, 0).value.val[0] = \ - GiCombineFloat32(GiGetLowFloat32(b0.val[0]), GiGetLowFloat32(b1.val[0])); \ - CONCAT(ret, 1).value.val[0] = GiCombineFloat32( \ - GiGetHighFloat32(b0.val[0]), GiGetHighFloat32(b1.val[0])); \ - CONCAT(ret, 2).value.val[0] = \ - GiCombineFloat32(GiGetLowFloat32(b0.val[1]), GiGetLowFloat32(b1.val[1])); \ - CONCAT(ret, 3).value.val[0] = GiCombineFloat32( \ - GiGetHighFloat32(b0.val[1]), GiGetHighFloat32(b1.val[1])); \ - CONCAT(ret, 0).value.val[1] = \ - GiCombineFloat32(GiGetLowFloat32(b2.val[0]), GiGetLowFloat32(b3.val[0])); \ - CONCAT(ret, 1).value.val[1] = GiCombineFloat32( \ - GiGetHighFloat32(b2.val[0]), GiGetHighFloat32(b3.val[0])); \ - CONCAT(ret, 2).value.val[1] = \ - GiCombineFloat32(GiGetLowFloat32(b2.val[1]), GiGetLowFloat32(b3.val[1])); \ - CONCAT(ret, 3).value.val[1] = GiCombineFloat32( \ - GiGetHighFloat32(b2.val[1]), GiGetHighFloat32(b3.val[1])); +#define TRANSPOSE_8x4(a, ret) \ + auto b0 = GiZipqFloat32( \ + GiFixLenType2GiFloat32Type(CONCAT(a, 0).value), \ + GiFixLenType2GiFloat32Type(CONCAT(a, 1).value)); \ + auto b1 = GiZipqFloat32( \ + GiFixLenType2GiFloat32Type(CONCAT(a, 2).value), \ + GiFixLenType2GiFloat32Type(CONCAT(a, 3).value)); \ + auto b2 = GiZipqFloat32( \ + GiFixLenType2GiFloat32Type(CONCAT(a, 4).value), \ + GiFixLenType2GiFloat32Type(CONCAT(a, 5).value)); \ + auto b3 = GiZipqFloat32( \ + GiFixLenType2GiFloat32Type(CONCAT(a, 6).value), \ + GiFixLenType2GiFloat32Type(CONCAT(a, 7).value)); \ + CONCAT(ret, 0).value.val[0] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b0, 0)), \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b1, 0)))); \ + CONCAT(ret, 1).value.val[0] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b0, 0)), \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b1, 0)))); \ + CONCAT(ret, 2).value.val[0] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b0, 1)), \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b1, 1)))); \ + CONCAT(ret, 3).value.val[0] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b0, 1)), \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b1, 1)))); \ + CONCAT(ret, 0).value.val[1] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b2, 0)), \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b3, 0)))); \ + CONCAT(ret, 1).value.val[1] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b2, 0)), \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b3, 0)))); \ + CONCAT(ret, 2).value.val[1] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b2, 1)), \ + GiGetLowFloat32(GiGetSubVectorFloat32V2(b3, 1)))); \ + CONCAT(ret, 3).value.val[1] = GiFloat32Type2FixLenType(GiCombineFloat32( \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b2, 1)), \ + GiGetHighFloat32(GiGetSubVectorFloat32V2(b3, 1)))); #endif // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_2x3_4x4.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_2x3_4x4.cpp index f63692a98..5951f6f46 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_2x3_4x4.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_2x3_4x4.cpp @@ -155,10 +155,10 @@ struct OutputTransform2X3 { v11 += vbias; } if (bmode != BiasMode::BIAS) { - v00 = op(v00.value); - v01 = op(v01.value); - v10 = op(v10.value); - v11 = op(v11.value); + v00 = op(GiFixLenType2GiFloat32Type(v00.value)); + v01 = op(GiFixLenType2GiFloat32Type(v01.value)); + v10 = op(GiFixLenType2GiFloat32Type(v10.value)); + v11 = op(GiFixLenType2GiFloat32Type(v11.value)); } v00.save(transform_mid_buf + (0 * 2 + 0) * 4); @@ -194,10 +194,28 @@ void winograd_gi_2x3_4x4_f::filter( size_t OC, size_t IC, size_t oc_start, size_t oc_end) { constexpr int alpha = 2 + 3 - 1; //! G * g * GT - GI_FLOAT32_t g0{1.f, 0, 0, 0}, g1{0.5, 0.5, 0.5, 0}, g2{0.5, -0.5, 0.5, 0}, - g3{0, 0, 1, 0}; - GI_FLOAT32_t gt0{1, 0.5, 0.5, 0}, gt1{0, 0.5, -0.5, 0}, gt2{0, 0.5, 0.5, 1}, - gt3{0, 0, 0, 0}; + float tmp[4]; + auto init_g = [&](float a0, float a1, float a2, float a3) { + tmp[0] = a0; + tmp[1] = a1; + tmp[2] = a2; + tmp[3] = a3; + }; + init_g(1.f, 0, 0, 0); + GI_FLOAT32_t g0 = GiLoadFloat32(tmp); + init_g(0.5, 0.5, 0.5, 0); + GI_FLOAT32_t g1 = GiLoadFloat32(tmp); + init_g(0.5, -0.5, 0.5, 0); + GI_FLOAT32_t g2 = GiLoadFloat32(tmp); + init_g(0, 0, 1, 0); + GI_FLOAT32_t g3 = GiLoadFloat32(tmp); + init_g(1, 0.5, 0.5, 0); + GI_FLOAT32_t gt0 = GiLoadFloat32(tmp); + init_g(0, 0.5, -0.5, 0); + GI_FLOAT32_t gt1 = GiLoadFloat32(tmp); + init_g(0, 0.5, 0.5, 1); + GI_FLOAT32_t gt2 = GiLoadFloat32(tmp); + GI_FLOAT32_t gt3 = GiZeroFloat32(); size_t OCB = OC / 4; size_t ICB = IC / 4; @@ -217,15 +235,15 @@ void winograd_gi_2x3_4x4_f::filter( GI_FLOAT32_t vf1 = GiLoadFloat32(filter_ptr + 4); GI_FLOAT32_t vf2 = GiBroadcastFloat32(filter_ptr[8]); - GI_FLOAT32_t v3(GiBroadcastFloat32(0)); + GI_FLOAT32_t v3 = GiBroadcastFloat32(0); auto vtmp = GiExtqFloat32(vf1, vf2, 2); vtmp = GiSetqLaneFloat32(0, vtmp, 3); - GI_FLOAT32_t v2(vtmp); + GI_FLOAT32_t v2 = vtmp; vtmp = GiExtqFloat32(vf0, vf1, 3); vtmp = GiSetqLaneFloat32(0, vtmp, 3); - GI_FLOAT32_t v1(vtmp); + GI_FLOAT32_t v1 = vtmp; vtmp = GiSetqLaneFloat32(0, vf0, 3); - GI_FLOAT32_t v0(vtmp); + GI_FLOAT32_t v0 = vtmp; GI_FLOAT32_t vsum0 = GiBroadcastFloat32(0), vsum1 = GiBroadcastFloat32(0), vsum2 = GiBroadcastFloat32(0), vsum3 = GiBroadcastFloat32(0); diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_4x5.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_4x5.cpp index ad2b30b51..4eabc4c42 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_4x5.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_4x5.cpp @@ -115,10 +115,19 @@ struct FilterTransform4X5 { FILTER_TRANSFORM(g, Gg) GI_FLOAT32_V2_t vgr; - GI_FLOAT32_t vgr0 = {Ggr0, Ggr1, Ggr2, Ggr3}; - GI_FLOAT32_t vgr1 = {Ggr4, Ggr5, Ggr6, Ggr7}; - vgr.val[0] = vgr0; //{Ggr0, Ggr1, Ggr2, Ggr3}; - vgr.val[1] = vgr1; //{Ggr4, Ggr5, Ggr6, Ggr7}; + float tmp[4]; + tmp[0] = Ggr0; + tmp[1] = Ggr1; + tmp[2] = Ggr2; + tmp[3] = Ggr3; + GI_FLOAT32_t vgr0 = GiLoadFloat32(tmp); + tmp[0] = Ggr4; + tmp[1] = Ggr5; + tmp[2] = Ggr6; + tmp[3] = Ggr7; + GI_FLOAT32_t vgr1 = GiLoadFloat32(tmp); + GiSetSubVectorFloat32V2(vgr, 0, vgr0); //{Ggr0, Ggr1, Ggr2, Ggr3}; + GiSetSubVectorFloat32V2(vgr, 1, vgr1); //{Ggr4, Ggr5, Ggr6, Ggr7}; Vector Ggt4(vgr); TRANSPOSE_8x4(Gg, Ggt); FILTER_TRANSFORM_FINAL(Ggt, result); @@ -155,10 +164,12 @@ struct InputTransform4X5 { wd##7 = (d##7 - d##1) + (d##3 - d##5) * 5.25f; \ } while (0) -#define GET_VECTOR_HIGH_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[1]) -#define GET_VECTOR_LOW_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[0]) +#define GET_VECTOR_HIGH_ELEM(s, i, idx) \ + GiExtractLane##idx##Float32(GiGetSubVectorFloat32V2( \ + GiFixLenType2GiFloat32V2Type(CONCAT(s, i).value), 1)) +#define GET_VECTOR_LOW_ELEM(s, i, idx) \ + GiExtractLane##idx##Float32(GiGetSubVectorFloat32V2( \ + GiFixLenType2GiFloat32V2Type(CONCAT(s, i).value), 0)) template static void transform( diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_5x4.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_5x4.cpp index 7def8ecf4..1b3611fa7 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_5x4.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_5x4.cpp @@ -104,7 +104,8 @@ struct FilterTransform5X4 { mid_buf1[7] = GET_VECTOR_ELEM(wd, i, 3); \ mid_buf1 += 8; \ } while (0); -#define GET_VECTOR_ELEM(s, i, idx) GiExtractLane##idx##Float32(CONCAT(s, i).value) +#define GET_VECTOR_ELEM(s, i, idx) \ + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value)) float* mid_buf1 = transform_mid_buf; UNROLL_CALL_NOWRAPPER(8, cb); @@ -142,9 +143,9 @@ struct InputTransform5X4 { } while (0) #define GET_VECTOR_HIGH_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[1]) + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value.val[1])) #define GET_VECTOR_LOW_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[0]) + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value.val[0])) template static void transform( diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3.cpp index 2356a6a14..16af210db 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3.cpp @@ -46,9 +46,9 @@ namespace { } while (0); #define GET_VECTOR_HIGH_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[1]) + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value.val[1])) #define GET_VECTOR_LOW_ELEM(s, i, idx) \ - GiExtractLane##idx##Float32(CONCAT(s, i).value.val[0]) + GiExtractLane##idx##Float32(GiFixLenType2GiFloat32Type(CONCAT(s, i).value.val[0])) struct InputTransform6X3 { template static void transform( diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3_4x4.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3_4x4.cpp index 0ddd8c646..46a6c3128 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3_4x4.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_6x3_4x4.cpp @@ -215,7 +215,7 @@ struct OutputTransform6X3 { #undef cb } if (bmode != BiasMode::BIAS) { -#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value); +#define cb(m, n) v##m##n = op(GiFixLenType2GiFloat32Type(CONCAT(v##m, n).value)); UNROLL_CALL_RAW_D2(6, 6, cb); #undef cb } diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f23_mk4_nchw44.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f23_mk4_nchw44.cpp index 485a89a0d..c0413335e 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f23_mk4_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f23_mk4_nchw44.cpp @@ -153,7 +153,7 @@ struct OutputTransformF23_NCHW44 { #undef cb } if (bmode != BiasMode::BIAS) { -#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value); +#define cb(m, n) v##m##n = op(GiFixLenType2GiFloat32Type(CONCAT(v##m, n).value)); UNROLL_CALL_RAW_D2(2, 2, cb); #undef cb } @@ -165,7 +165,7 @@ struct OutputTransformF23_NCHW44 { if (bmode == BiasMode::BIAS) { \ v##oho##owo += Vector::load( \ bias + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ - v##oho##owo = op(v##oho##owo.value); \ + v##oho##owo = op(GiFixLenType2GiFloat32Type(v##oho##owo.value)); \ } \ v##oho##owo.save( \ output + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f63_mk4_nchw44.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f63_mk4_nchw44.cpp index 1bcd096d0..53ca6d17a 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f63_mk4_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f63_mk4_nchw44.cpp @@ -102,17 +102,17 @@ struct InputTransformF63_NCHW44 { auto t##i##4 = d6; \ auto t##i##5 = d6; \ auto t##i##6 = d6; \ - t##i##0 = t##i##0 - d6; \ - t##i##1 = t##i##1 + d1; \ - t##i##2 = t##i##2 - d1; \ + t##i##0 = GiSubtractFloat32(t##i##0, d6); \ + t##i##1 = GiAddFloat32(t##i##1, d1); \ + t##i##2 = GiSubtractFloat32(t##i##2, d1); \ t##i##3 = GiSimdFmaLane(t##i##3, d1, v0, 2); \ t##i##4 = GiFmsqLaneQFloat32(t##i##4, d1, v0, 2); \ t##i##5 = GiSimdFmaLane(t##i##5, d1, v1, 2); \ t##i##6 = GiFmsqLaneQFloat32(t##i##6, d1, v1, 2); \ - t##i##7 = t##i##7 - d1; \ + t##i##7 = GiSubtractFloat32(t##i##7, d1); \ t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 0); \ - t##i##1 = t##i##1 + d2; \ - t##i##2 = t##i##2 + d2; \ + t##i##1 = GiAddFloat32(t##i##1, d2); \ + t##i##2 = GiAddFloat32(t##i##2, d2); \ t##i##3 = GiSimdFmaLane(t##i##3, d2, v0, 3); \ t##i##4 = GiSimdFmaLane(t##i##4, d2, v0, 3); \ t##i##5 = GiSimdFmaLane(t##i##5, d2, v1, 3); \ @@ -131,8 +131,8 @@ struct InputTransformF63_NCHW44 { t##i##4 = GiFmsqLaneQFloat32(t##i##4, d4, v1, 1); \ t##i##5 = GiFmsqLaneQFloat32(t##i##5, d4, v2, 0); \ t##i##6 = GiFmsqLaneQFloat32(t##i##6, d4, v2, 0); \ - t##i##1 = t##i##1 + d5; \ - t##i##2 = t##i##2 - d5; \ + t##i##1 = GiAddFloat32(t##i##1, d5); \ + t##i##2 = GiSubtractFloat32(t##i##2, d5); \ t##i##3 = GiSimdFmaLane(t##i##3, d5, v1, 2); \ t##i##4 = GiFmsqLaneQFloat32(t##i##4, d5, v1, 2); \ t##i##5 = GiSimdFmaLane(t##i##5, d5, v0, 2); \ @@ -150,17 +150,17 @@ struct InputTransformF63_NCHW44 { d5 = t6##i; \ d6 = t6##i; \ d7 = t7##i; \ - d0 = d0 - t6##i; \ - d1 = d1 + t1##i; \ - d2 = d2 - t1##i; \ + d0 = GiSubtractFloat32(d0, t6##i); \ + d1 = GiAddFloat32(d1, t1##i); \ + d2 = GiSubtractFloat32(d2, t1##i); \ d3 = GiSimdFmaLane(d3, t1##i, v0, 2); \ d4 = GiFmsqLaneQFloat32(d4, t1##i, v0, 2); \ d5 = GiSimdFmaLane(d5, t1##i, v1, 2); \ d6 = GiFmsqLaneQFloat32(d6, t1##i, v1, 2); \ - d7 = d7 - t1##i; \ + d7 = GiSubtractFloat32(d7, t1##i); \ d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 0); \ - d1 = d1 + t2##i; \ - d2 = d2 + t2##i; \ + d1 = GiAddFloat32(d1, t2##i); \ + d2 = GiAddFloat32(d2, t2##i); \ d3 = GiSimdFmaLane(d3, t2##i, v0, 3); \ d4 = GiSimdFmaLane(d4, t2##i, v0, 3); \ d5 = GiSimdFmaLane(d5, t2##i, v1, 3); \ @@ -179,8 +179,8 @@ struct InputTransformF63_NCHW44 { d4 = GiFmsqLaneQFloat32(d4, t4##i, v1, 1); \ d5 = GiFmsqLaneQFloat32(d5, t4##i, v2, 0); \ d6 = GiFmsqLaneQFloat32(d6, t4##i, v2, 0); \ - d1 = d1 + t5##i; \ - d2 = d2 - t5##i; \ + d1 = GiAddFloat32(d1, t5##i); \ + d2 = GiSubtractFloat32(d2, t5##i); \ d3 = GiSimdFmaLane(d3, t5##i, v1, 2); \ d4 = GiFmsqLaneQFloat32(d4, t5##i, v1, 2); \ d5 = GiSimdFmaLane(d5, t5##i, v0, 2); \ @@ -311,7 +311,7 @@ struct OutputTransformF63_NCHW44 { #undef cb } if (bmode != BiasMode::BIAS) { -#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value); +#define cb(m, n) v##m##n = op(GiFixLenType2GiFloat32Type(CONCAT(v##m, n).value)); UNROLL_CALL_RAW_D2(6, 6, cb); #undef cb } @@ -323,7 +323,7 @@ struct OutputTransformF63_NCHW44 { if (bmode == BiasMode::BIAS) { \ v##oho##owo += Vector::load( \ bias + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ - v##oho##owo = op(v##oho##owo.value); \ + v##oho##owo = op(GiFixLenType2GiFloat32Type(v##oho##owo.value)); \ } \ v##oho##owo.save( \ output + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ diff --git a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp index 52ea1f460..41e29cb29 100644 --- a/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp @@ -121,14 +121,14 @@ struct InputTransformF73_NCHW44 { auto t##i##6 = d7; \ auto t##i##7 = d7; \ t##i##8 = GiFmsqLaneQFloat32(t##i##8, d7, v0, 0); \ - t##i##0 = t##i##0 - d1; \ + t##i##0 = GiSubtractFloat32(t##i##0, d1); \ t##i##1 = GiFmsqLaneQFloat32(t##i##1, d1, v0, 0); \ t##i##2 = GiSimdFmaLane(t##i##2, d1, v0, 0); \ t##i##3 = GiFmsqLaneQFloat32(t##i##3, d1, v0, 1); \ t##i##4 = GiSimdFmaLane(t##i##4, d1, v0, 1); \ t##i##5 = GiFmsqLaneQFloat32(t##i##5, d1, v0, 2); \ t##i##6 = GiSimdFmaLane(t##i##6, d1, v0, 2); \ - t##i##7 = t##i##7 - d1; \ + t##i##7 = GiSubtractFloat32(t##i##7, d1); \ t##i##8 = GiSimdFmaLane(t##i##8, d1, v0, 0); \ t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 3); \ t##i##1 = GiFmsqLaneQFloat32(t##i##1, d2, v1, 0); \ @@ -137,7 +137,7 @@ struct InputTransformF73_NCHW44 { t##i##4 = GiFmsqLaneQFloat32(t##i##4, d2, v1, 3); \ t##i##5 = GiFmsqLaneQFloat32(t##i##5, d2, v2, 0); \ t##i##6 = GiFmsqLaneQFloat32(t##i##6, d2, v2, 1); \ - t##i##8 = t##i##8 - d2; \ + t##i##8 = GiSubtractFloat32(t##i##8, d2); \ t##i##0 = GiSimdFmaLane(t##i##0, d3, v2, 2); \ t##i##1 = GiSimdFmaLane(t##i##1, d3, v2, 3); \ t##i##2 = GiFmsqLaneQFloat32(t##i##2, d3, v3, 0); \ @@ -169,7 +169,7 @@ struct InputTransformF73_NCHW44 { t##i##2 = GiFmsqLaneQFloat32(t##i##2, d6, v1, 1); \ t##i##3 = GiSimdFmaLane(t##i##3, d6, v1, 0); \ t##i##4 = GiFmsqLaneQFloat32(t##i##4, d6, v3, 1); \ - t##i##5 = t##i##5 - d6; \ + t##i##5 = GiSubtractFloat32(t##i##5, d6); \ t##i##6 = GiFmsqLaneQFloat32(t##i##6, d6, v6, 2); \ t##i##8 = GiFmsqLaneQFloat32(t##i##8, d6, v2, 2); \ t##i##0 = GiSimdFmaLane(t##i##0, d0, v0, 0); @@ -188,14 +188,14 @@ struct InputTransformF73_NCHW44 { d6 = t7##i; \ d7 = t7##i; \ d8 = GiFmsqLaneQFloat32(d8, t7##i, v0, 0); \ - d0 = d0 - t1##i; \ + d0 = GiSubtractFloat32(d0, t1##i); \ d1 = GiFmsqLaneQFloat32(d1, t1##i, v0, 0); \ d2 = GiSimdFmaLane(d2, t1##i, v0, 0); \ d3 = GiFmsqLaneQFloat32(d3, t1##i, v0, 1); \ d4 = GiSimdFmaLane(d4, t1##i, v0, 1); \ d5 = GiFmsqLaneQFloat32(d5, t1##i, v0, 2); \ d6 = GiSimdFmaLane(d6, t1##i, v0, 2); \ - d7 = d7 - t1##i; \ + d7 = GiSubtractFloat32(d7, t1##i); \ d8 = GiSimdFmaLane(d8, t1##i, v0, 0); \ d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 3); \ d1 = GiFmsqLaneQFloat32(d1, t2##i, v1, 0); \ @@ -204,7 +204,7 @@ struct InputTransformF73_NCHW44 { d4 = GiFmsqLaneQFloat32(d4, t2##i, v1, 3); \ d5 = GiFmsqLaneQFloat32(d5, t2##i, v2, 0); \ d6 = GiFmsqLaneQFloat32(d6, t2##i, v2, 1); \ - d8 = d8 - t2##i; \ + d8 = GiSubtractFloat32(d8, t2##i); \ d0 = GiSimdFmaLane(d0, t3##i, v2, 2); \ d1 = GiSimdFmaLane(d1, t3##i, v2, 3); \ d2 = GiFmsqLaneQFloat32(d2, t3##i, v3, 0); \ @@ -236,7 +236,7 @@ struct InputTransformF73_NCHW44 { d2 = GiFmsqLaneQFloat32(d2, t6##i, v1, 1); \ d3 = GiSimdFmaLane(d3, t6##i, v1, 0); \ d4 = GiFmsqLaneQFloat32(d4, t6##i, v3, 1); \ - d5 = d5 - t6##i; \ + d5 = GiSubtractFloat32(d5, t6##i); \ d6 = GiFmsqLaneQFloat32(d6, t6##i, v6, 2); \ d8 = GiFmsqLaneQFloat32(d8, t6##i, v2, 2); \ d0 = GiSimdFmaLane(d0, t0##i, v0, 0); \ @@ -377,7 +377,7 @@ struct OutputTransformF73_NCHW44 { #undef cb } if (bmode != BiasMode::BIAS) { -#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value); +#define cb(m, n) v##m##n = op(GiFixLenType2GiFloat32Type(CONCAT(v##m, n).value)); UNROLL_CALL_RAW_D2(7, 7, cb); #undef cb } @@ -389,7 +389,7 @@ struct OutputTransformF73_NCHW44 { if (bmode == BiasMode::BIAS) { \ v##oho##owo += Vector::load( \ bias + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ - v##oho##owo = op(v##oho##owo.value); \ + v##oho##owo = op(GiFixLenType2GiFloat32Type(v##oho##owo.value)); \ } \ v##oho##owo.save( \ output + oc * OH * OW + oh * OW * pack_size + ow * pack_size); \ diff --git a/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h b/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h index dae8bfaf7..f34a221c9 100644 --- a/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h +++ b/dnn/src/fallback/conv_bias/gi/intrinsic_helper.h @@ -26,6 +26,35 @@ struct Vld1qF32S { #endif #endif +template +struct ParamElemFixLenVisitor; + +template +struct ParamElemFixLenVisitorV2; + +#define cb(_ctype, _simd_type, _simd_fixlen_type, _fun_suffix, _simd_type_v2) \ + template <> \ + struct ParamElemFixLenVisitor<_ctype> { \ + _simd_type operator()(const _simd_fixlen_type& s0) const { \ + return GiFixLenType2Gi##_fun_suffix##Type(s0); \ + } \ + }; \ + template <> \ + struct ParamElemFixLenVisitorV2<_ctype> { \ + _simd_type_v2 operator()( \ + const _simd_fixlen_type& s0, const _simd_fixlen_type& s1) const { \ + _simd_type_v2 ret; \ + GiSetSubVector##_fun_suffix##V2( \ + ret, 0, GiFixLenType2Gi##_fun_suffix##Type(s0)); \ + GiSetSubVector##_fun_suffix##V2( \ + ret, 1, GiFixLenType2Gi##_fun_suffix##Type(s1)); \ + return ret; \ + } \ + }; + +cb(dt_float32, GI_FLOAT32_t, GI_FLOAT32_FIXLEN_t, Float32, GI_FLOAT32_V2_t); +#undef cb + template < int weight_number, int base_offset, int ptr_step, int oc_block, typename Func, typename T, typename T2, typename... XT> @@ -33,8 +62,9 @@ struct LoadHelper { static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); }; -#define WEIGHT_CB(step) \ - src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); +#define WEIGHT_CB(step) \ + src[step] = GiFloat32Type2FixLenType( \ + Func::impl(ptr + base_offset + step * ptr_step, args...)); #define LOAD_HELPER(step) \ template < \ @@ -67,7 +97,9 @@ LOAD_HELPER(16); #undef WEIGHT_CB ///////////////////////////c_dim = 1///////////////////////// -#define WEIGHT_CB(step) src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); +#define WEIGHT_CB(step) \ + src[0][step] = \ + GiFloat32Type2FixLenType(Func::impl(ptr + base_offset + step * ptr_step)); #define LOAD_HELPER(step) \ template \ @@ -91,9 +123,11 @@ LOAD_HELPER(9); #undef WEIGHT_CB /////////////////////////c_dim = 2/////////////////////////////// -#define WEIGHT_CB(step) \ - src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \ - src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset); +#define WEIGHT_CB(step) \ + src[0][step] = \ + GiFloat32Type2FixLenType(Func::impl(ptr + base_offset + step * ptr_step)); \ + src[1][step] = GiFloat32Type2FixLenType( \ + Func::impl(ptr + base_offset + step * ptr_step + oc_offset)); #define LOAD_HELPER(step) \ template \ @@ -132,172 +166,196 @@ struct StoreOcxOw8Remain { template struct StoreOcxOw8Remain<2, 0, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op({{c[0][6], c[0][7]}}, reinterpret_cast(dst_ptr + 24)); - - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); - op({{c[1][4], c[1][5]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 16)); - op({{c[1][6], c[1][7]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 24)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis(c[0][6], c[0][7]), reinterpret_cast(dst_ptr + 24)); + + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][4], c[1][5]), reinterpret_cast(dst_ptr + ld_dst_oc + 16)); + op(vis(c[1][6], c[1][7]), reinterpret_cast(dst_ptr + ld_dst_oc + 24)); } }; template struct StoreOcxOw8Remain<2, 8, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op({{c[0][6], c[0][7]}}, reinterpret_cast(dst_ptr + 24)); - - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); - op({{c[1][4], c[1][5]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 16)); - op({{c[1][6], c[1][7]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 24)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis(c[0][6], c[0][7]), reinterpret_cast(dst_ptr + 24)); + + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][4], c[1][5]), reinterpret_cast(dst_ptr + ld_dst_oc + 16)); + op(vis(c[1][6], c[1][7]), reinterpret_cast(dst_ptr + ld_dst_oc + 24)); } }; template struct StoreOcxOw8Remain<2, 7, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op(c[0][6], reinterpret_cast(dst_ptr + 24)); - - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); - op({{c[1][4], c[1][5]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 16)); - op(c[1][6], reinterpret_cast(dst_ptr + ld_dst_oc + 24)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis0(c[0][6]), reinterpret_cast(dst_ptr + 24)); + + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][4], c[1][5]), reinterpret_cast(dst_ptr + ld_dst_oc + 16)); + op(vis0(c[1][6]), reinterpret_cast(dst_ptr + ld_dst_oc + 24)); } }; template struct StoreOcxOw8Remain<2, 6, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); - op({{c[1][4], c[1][5]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 16)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][4], c[1][5]), reinterpret_cast(dst_ptr + ld_dst_oc + 16)); } }; template struct StoreOcxOw8Remain<2, 5, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op(c[0][4], reinterpret_cast(dst_ptr + 16)); - - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); - op(c[1][4], reinterpret_cast(dst_ptr + ld_dst_oc + 16)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis0(c[0][4]), reinterpret_cast(dst_ptr + 16)); + + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis0(c[1][4]), reinterpret_cast(dst_ptr + ld_dst_oc + 16)); } }; template struct StoreOcxOw8Remain<2, 4, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op({{c[1][2], c[1][3]}}, reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis(c[1][2], c[1][3]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); } }; template struct StoreOcxOw8Remain<2, 3, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op(c[0][2], reinterpret_cast(dst_ptr + 8)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis0(c[0][2]), reinterpret_cast(dst_ptr + 8)); - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); - op(c[1][2], reinterpret_cast(dst_ptr + ld_dst_oc + 8)); + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); + op(vis0(c[1][2]), reinterpret_cast(dst_ptr + ld_dst_oc + 8)); } }; template struct StoreOcxOw8Remain<2, 2, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[1][0], c[1][1]}}, reinterpret_cast(dst_ptr + ld_dst_oc)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[1][0], c[1][1]), reinterpret_cast(dst_ptr + ld_dst_oc)); } }; template struct StoreOcxOw8Remain<2, 1, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int ld_dst_oc) { - op(c[0][0], reinterpret_cast(dst_ptr)); - op(c[1][0], reinterpret_cast(dst_ptr + ld_dst_oc)); + ParamElemFixLenVisitor vis0; + op(vis0(c[0][0]), reinterpret_cast(dst_ptr)); + op(vis0(c[1][0]), reinterpret_cast(dst_ptr + ld_dst_oc)); } }; template struct StoreOcxOw8Remain<1, 0, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op({{c[0][6], c[0][7]}}, reinterpret_cast(dst_ptr + 24)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis(c[0][6], c[0][7]), reinterpret_cast(dst_ptr + 24)); } }; template struct StoreOcxOw8Remain<1, 8, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op({{c[0][6], c[0][7]}}, reinterpret_cast(dst_ptr + 24)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis(c[0][6], c[0][7]), reinterpret_cast(dst_ptr + 24)); } }; template struct StoreOcxOw8Remain<1, 7, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); - op(c[0][6], reinterpret_cast(dst_ptr + 24)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); + op(vis0(c[0][6]), reinterpret_cast(dst_ptr + 24)); } }; template struct StoreOcxOw8Remain<1, 6, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op({{c[0][4], c[0][5]}}, reinterpret_cast(dst_ptr + 16)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis(c[0][4], c[0][5]), reinterpret_cast(dst_ptr + 16)); } }; template struct StoreOcxOw8Remain<1, 5, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); - op(c[0][4], reinterpret_cast(dst_ptr + 16)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); + op(vis0(c[0][4]), reinterpret_cast(dst_ptr + 16)); } }; template struct StoreOcxOw8Remain<1, 4, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op({{c[0][2], c[0][3]}}, reinterpret_cast(dst_ptr + 8)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis(c[0][2], c[0][3]), reinterpret_cast(dst_ptr + 8)); } }; template struct StoreOcxOw8Remain<1, 3, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); - op(c[0][2], reinterpret_cast(dst_ptr + 8)); + ParamElemFixLenVisitor vis0; + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); + op(vis0(c[0][2]), reinterpret_cast(dst_ptr + 8)); } }; template struct StoreOcxOw8Remain<1, 2, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op({{c[0][0], c[0][1]}}, reinterpret_cast(dst_ptr)); + ParamElemFixLenVisitorV2 vis; + op(vis(c[0][0], c[0][1]), reinterpret_cast(dst_ptr)); } }; template struct StoreOcxOw8Remain<1, 1, Op, T, T2, T3> { static GI_FORCEINLINE void impl(T& c, const Op& op, T2 dst_ptr, int) { - op(c[0][0], reinterpret_cast(dst_ptr)); + ParamElemFixLenVisitor vis0; + op(vis0(c[0][0]), reinterpret_cast(dst_ptr)); } }; @@ -331,21 +389,25 @@ struct InitOcxOw8 { static GI_FORCEINLINE void impl(T&, const T2*, int) {} }; -#define BAIS_INIT_NO_BIAS_C2(step) \ - c[0][step] = GiBroadcastFloat32(static_cast(0)); \ - c[1][step] = GiBroadcastFloat32(static_cast(0)); -#define BAIS_INIT_NO_BIAS_C1(step) c[0][step] = GiBroadcastFloat32(static_cast(0)); - -#define BAIS_INIT_BROADCAST_C2(step) \ - c[0][step] = GiLoadFloat32(bias_ptr); \ - c[1][step] = GiLoadFloat32(bias_ptr + oc_step); -#define BAIS_INIT_BROADCAST_C1(step) c[0][step] = GiLoadFloat32(bias_ptr); - -#define BAIS_INIT_BIAS_C2(step) \ - c[0][step] = GiLoadFloat32(bias_ptr + step * simd_len); \ - c[1][step] = GiLoadFloat32(bias_ptr + oc_step + step * simd_len); - -#define BAIS_INIT_BIAS_C1(step) c[0][step] = GiLoadFloat32(bias_ptr + step * simd_len); +#define BAIS_INIT_NO_BIAS_C2(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiBroadcastFloat32(static_cast(0))); \ + c[1][step] = GiFloat32Type2FixLenType(GiBroadcastFloat32(static_cast(0))); +#define BAIS_INIT_NO_BIAS_C1(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiBroadcastFloat32(static_cast(0))); + +#define BAIS_INIT_BROADCAST_C2(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr)); \ + c[1][step] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + oc_step)); +#define BAIS_INIT_BROADCAST_C1(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr)); + +#define BAIS_INIT_BIAS_C2(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + step * simd_len)); \ + c[1][step] = GiFloat32Type2FixLenType( \ + GiLoadFloat32(bias_ptr + oc_step + step * simd_len)); + +#define BAIS_INIT_BIAS_C1(step) \ + c[0][step] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + step * simd_len)); #define INSTANCE_InitOcxOw8(ow_remain, cdim) \ template \ diff --git a/dnn/src/fallback/conv_bias/gi/utils.h b/dnn/src/fallback/conv_bias/gi/utils.h index 693c07903..5e8ec5fe7 100644 --- a/dnn/src/fallback/conv_bias/gi/utils.h +++ b/dnn/src/fallback/conv_bias/gi/utils.h @@ -12,49 +12,64 @@ struct Vector; template <> struct Vector { - GI_FLOAT32_t value; + GI_FLOAT32_FIXLEN_t value; Vector() {} - Vector(const float v) { value = GiBroadcastFloat32(v); } + Vector(const float v) { value = GiFloat32Type2FixLenType(GiBroadcastFloat32(v)); } Vector(const Vector& lr) { value = lr.value; } Vector(const Vector&& lr) { value = std::move(lr.value); } - Vector(const GI_FLOAT32_t& v) { value = v; } + Vector(const GI_FLOAT32_t& v) { value = GiFloat32Type2FixLenType(v); } static Vector load(const float* addr) { Vector v; - v.value = GiLoadFloat32(addr); + v.value = GiFloat32Type2FixLenType(GiLoadFloat32(addr)); return v; } - static void save(float* addr, const Vector& v) { GiStoreFloat32(addr, v.value); } + static void save(float* addr, const Vector& v) { + GiStoreFloat32(addr, GiFixLenType2GiFloat32Type(v.value)); + } void save(float* addr) { save(addr, *this); } Vector operator+(const Vector& lr) { Vector dst; - dst.value = GiAddFloat32(value, lr.value); + dst.value = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return dst; } Vector& operator+=(const Vector& lr) { - value = GiAddFloat32(value, lr.value); + value = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return *this; } Vector operator-(const Vector& lr) { Vector dst; - dst.value = GiSubtractFloat32(value, lr.value); + dst.value = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return dst; } Vector& operator-=(const Vector& lr) { - value = GiSubtractFloat32(value, lr.value); + value = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return *this; } Vector operator*(float lr) { Vector dst; - dst.value = GiMultiplyScalerFloat32(value, lr); + dst.value = GiFloat32Type2FixLenType( + GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(value), lr)); return dst; } Vector operator*(const Vector& lr) { Vector dst; - dst.value = GiMultiplyFloat32(value, lr.value); + dst.value = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return dst; } Vector& operator*=(const Vector& lr) { - value = GiMultiplyFloat32(value, lr.value); + value = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value), + GiFixLenType2GiFloat32Type(lr.value))); return *this; } Vector& operator=(const Vector& lr) { @@ -74,72 +89,108 @@ struct Vector { template <> struct Vector { - GI_FLOAT32_V2_t value; + GI_FLOAT32_FIXLEN_V2_t value; Vector() {} Vector(const float v) { - value.val[0] = GiBroadcastFloat32(v); - value.val[1] = GiBroadcastFloat32(v); + value.val[0] = GiFloat32Type2FixLenType(GiBroadcastFloat32(v)); + value.val[1] = GiFloat32Type2FixLenType(GiBroadcastFloat32(v)); } Vector(const Vector& lr) { value = lr.value; } Vector(const Vector&& lr) { value = std::move(lr.value); } - Vector(const GI_FLOAT32_V2_t& v) { value = v; } + Vector(const GI_FLOAT32_V2_t& v) { value = GiFloat32Type2FixLenV2Type(v); } static Vector load(const float* addr) { Vector v; - v.value = GiLoadFloat32V2(addr); + v.value = GiFloat32Type2FixLenV2Type(GiLoadFloat32V2(addr)); return v; } - static void save(float* addr, const Vector& v) { GiStoreFloat32V2(addr, v.value); } + static void save(float* addr, const Vector& v) { + GiStoreFloat32V2(addr, GiFixLenType2GiFloat32V2Type(v.value)); + } void save(float* addr) { save(addr, *this); } Vector operator+(const Vector& lr) { Vector dst; - dst.value.val[0] = GiAddFloat32(value.val[0], lr.value.val[0]); - dst.value.val[1] = GiAddFloat32(value.val[1], lr.value.val[1]); + dst.value.val[0] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + dst.value.val[1] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return dst; } Vector& operator+=(const Vector& lr) { - value.val[0] = GiAddFloat32(value.val[0], lr.value.val[0]); - value.val[1] = GiAddFloat32(value.val[1], lr.value.val[1]); + value.val[0] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + value.val[1] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return *this; } Vector& add(const Vector& lr) { - value.val[0] = GiAddFloat32(value.val[0], lr.value.val[0]); - value.val[1] = GiAddFloat32(value.val[1], lr.value.val[1]); + value.val[0] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + value.val[1] = GiFloat32Type2FixLenType(GiAddFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return *this; } Vector operator-(const Vector& lr) { Vector dst; - dst.value.val[0] = GiSubtractFloat32(value.val[0], lr.value.val[0]); - dst.value.val[1] = GiSubtractFloat32(value.val[1], lr.value.val[1]); + dst.value.val[0] = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + dst.value.val[1] = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return dst; } Vector& operator-=(const Vector& lr) { - value.val[0] = GiSubtractFloat32(value.val[0], lr.value.val[0]); - value.val[1] = GiSubtractFloat32(value.val[1], lr.value.val[1]); + value.val[0] = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + value.val[1] = GiFloat32Type2FixLenType(GiSubtractFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return *this; } Vector operator*(float lr) { Vector dst; - dst.value.val[0] = GiMultiplyScalerFloat32(value.val[0], lr); - dst.value.val[1] = GiMultiplyScalerFloat32(value.val[1], lr); + dst.value.val[0] = GiFloat32Type2FixLenType( + GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(value.val[0]), lr)); + dst.value.val[1] = GiFloat32Type2FixLenType( + GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(value.val[1]), lr)); return dst; } //! val + lr * n Vector& mla(const Vector& lr, float n) { - value.val[0] = GiMultiplyAddScalarFloat32(value.val[0], lr.value.val[0], n); - value.val[1] = GiMultiplyAddScalarFloat32(value.val[1], lr.value.val[1], n); + value.val[0] = GiFloat32Type2FixLenType(GiMultiplyAddScalarFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]), n)); + value.val[1] = GiFloat32Type2FixLenType(GiMultiplyAddScalarFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]), n)); return *this; } Vector operator*(const Vector& lr) { Vector dst; - dst.value.val[0] = GiMultiplyFloat32(value.val[0], lr.value.val[0]); - dst.value.val[1] = GiMultiplyFloat32(value.val[1], lr.value.val[1]); + dst.value.val[0] = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + dst.value.val[1] = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return dst; } Vector& operator*=(const Vector& lr) { - value.val[0] = GiMultiplyFloat32(value.val[0], lr.value.val[0]); - value.val[1] = GiMultiplyFloat32(value.val[1], lr.value.val[1]); + value.val[0] = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value.val[0]), + GiFixLenType2GiFloat32Type(lr.value.val[0]))); + value.val[1] = GiFloat32Type2FixLenType(GiMultiplyFloat32( + GiFixLenType2GiFloat32Type(value.val[1]), + GiFixLenType2GiFloat32Type(lr.value.val[1]))); return *this; } Vector& operator=(const Vector& lr) { diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 88ef64026..20660524a 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -515,26 +515,26 @@ typedef GI_INT8_t GI_INT8_FIXLEN_t; typedef GI_INT16_t GI_INT16_FIXLEN_t; typedef GI_INT32_t GI_INT32_FIXLEN_t; typedef GI_UINT32_t GI_UINT32_FIXLEN_t; -#define GiFloat32Type2FixLenType(s) (s) -#define GiFixLenType2GiFloat32Type(s) (s) +#define GiFloat32Type2FixLenType(s) s +#define GiFixLenType2GiFloat32Type(s) s -#define GiFloat32Type2FixLenV2Type(s) (s) -#define GiFixLenType2GiFloat32V2Type(s) (s) +#define GiFloat32Type2FixLenV2Type(s) s +#define GiFixLenType2GiFloat32V2Type(s) s -#define GiUint8Type2FixLenType(s) (s) -#define GiFixLenType2GiUint8Type(s) (s) +#define GiUint8Type2FixLenType(s) s +#define GiFixLenType2GiUint8Type(s) s -#define GiInt8Type2FixLenType(s) (s) -#define GiFixLenType2GiInt8Type(s) (s) +#define GiInt8Type2FixLenType(s) s +#define GiFixLenType2GiInt8Type(s) s -#define GiInt16Type2FixLenType(s) (s) -#define GiFixLenType2GiInt16Type(s) (s) +#define GiInt16Type2FixLenType(s) s +#define GiFixLenType2GiInt16Type(s) s -#define GiInt32Type2FixLenType(s) (s) -#define GiFixLenType2GiInt32Type(s) (s) +#define GiInt32Type2FixLenType(s) s +#define GiFixLenType2GiInt32Type(s) s -#define GiUint32Type2FixLenType(s) (s) -#define GiFixLenType2GiUint32Type(s) (s) +#define GiUint32Type2FixLenType(s) s +#define GiFixLenType2GiUint32Type(s) s //! get subvector #define GiGetSubVectorFloat32V2(s, index) s.val[index] -- GitLab