From 8546c15d45795d44a2b777e04f5146954fcd9c0a Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 14 Jun 2022 18:14:48 +0800 Subject: [PATCH] feat(gi): make elemwise apply gi class type GitOrigin-RevId: 6ff1a8a55ce5e01a93b4c619833dcd70ebe2f735 --- .../arm_common/elemwise_helper/elemwise_op.h | 60 ++- .../fallback/elemwise/gi_impl/gi_mathfun.cpp | 2 +- dnn/src/fallback/elemwise_helper/kimpl/abs.h | 51 ++- dnn/src/fallback/elemwise_helper/kimpl/add.h | 63 +++- dnn/src/fallback/elemwise_helper/kimpl/exp.h | 38 +- .../elemwise_helper/kimpl/fast_tanh.h | 14 +- .../elemwise_helper/kimpl/fuse_add_h_swish.h | 45 ++- .../elemwise_helper/kimpl/fuse_add_relu.h | 60 ++- .../elemwise_helper/kimpl/fuse_add_sigmoid.h | 18 +- .../elemwise_helper/kimpl/fuse_add_tanh.h | 18 +- .../elemwise_helper/kimpl/fuse_mul_add3.h | 52 +-- .../fallback/elemwise_helper/kimpl/hswish.h | 89 +++-- dnn/src/fallback/elemwise_helper/kimpl/max.h | 39 +- dnn/src/fallback/elemwise_helper/kimpl/min.h | 39 +- dnn/src/fallback/elemwise_helper/kimpl/mul.h | 39 +- dnn/src/fallback/elemwise_helper/kimpl/none.h | 39 +- .../fallback/elemwise_helper/kimpl/op_base.h | 348 ++++++++++------- dnn/src/fallback/elemwise_helper/kimpl/relu.h | 106 ++++-- .../fallback/elemwise_helper/kimpl/sigmoid.h | 48 +-- dnn/src/fallback/elemwise_helper/kimpl/sub.h | 39 +- dnn/src/fallback/elemwise_helper/kimpl/tanh.h | 100 ++--- .../fallback/elemwise_helper/kimpl/true_div.h | 18 +- .../fallback/elemwise_helper/kimpl/typecvt.h | 23 +- dnn/src/fallback/elemwise_helper/op_common.h | 313 +++++++++------- dnn/src/fallback/gi_intrinsic_helper.h | 5 +- dnn/src/fallback/quantized_converter.h | 8 +- dnn/src/fallback/reduce/reducer.h | 351 ++++++++++-------- dnn/src/fallback/type_cvt/typecvt_helper.h | 110 +++--- 28 files changed, 1297 insertions(+), 838 deletions(-) diff --git a/dnn/src/arm_common/elemwise_helper/elemwise_op.h b/dnn/src/arm_common/elemwise_helper/elemwise_op.h index 1eaed779..96b25e76 100644 --- a/dnn/src/arm_common/elemwise_helper/elemwise_op.h +++ b/dnn/src/arm_common/elemwise_helper/elemwise_op.h @@ -12,7 +12,7 @@ using BcastType = megdnn::elemwise::BcastType; ///////////////////////////////// ParamElemVistor /////////////////////////// -#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix) \ +#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, _neon_type_v2) \ template <> \ struct ParamElemVisitor<_ctype> { \ _neon_type operator()(const _ctype* src) const { \ @@ -24,29 +24,61 @@ using BcastType = megdnn::elemwise::BcastType; _neon_type operator()(const _ctype* src) const { \ return vdupq_n_##_fun_suffix(*reinterpret_cast(src)); \ } \ + }; \ + template <> \ + struct ParamElemVisitorV2<_ctype> { \ + _neon_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \ + _neon_type_v2 ret; \ + ret.val[0] = \ + vld1q_##_fun_suffix(reinterpret_cast(src)); \ + ret.val[1] = \ + vld1q_##_fun_suffix(reinterpret_cast(src_1)); \ + return ret; \ + } \ + }; \ + template <> \ + struct ParamElemVisitorDupV2<_ctype> { \ + _neon_type_v2 operator()(const _ctype* src) const { \ + _neon_type_v2 ret; \ + ret.val[0] = vdupq_n_##_fun_suffix( \ + *reinterpret_cast(src)); \ + ret.val[1] = ret.val[0]; \ + return ret; \ + } \ } -cb(dt_quint8, uint8_t, uint8x16_t, u8); +cb(dt_quint8, uint8_t, uint8x16_t, u8, uint8x16x2_t); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -cb(__fp16, __fp16, float16x8_t, f16); +cb(__fp16, __fp16, float16x8_t, f16, float16x8x2_t); #endif -cb(dt_int16, int16_t, int16x8_t, s16); +cb(dt_int16, int16_t, int16x8_t, s16, int16x8x2_t); #undef cb template struct ParamElemVisitorBcast101x4; -#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix) \ - template <> \ - struct ParamElemVisitorBcast101x4<_ctype> { \ - _neon_type operator()(const _ctype* src) const { \ - return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \ - reinterpret_cast(src))); \ - } \ +#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix, _neon_type_v2) \ + template <> \ + struct ParamElemVisitorBcast101x4<_ctype> { \ + _neon_type operator()(const _ctype* src) const { \ + return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \ + reinterpret_cast(src))); \ + } \ + }; \ + template <> \ + struct ParamElemVisitorBcast101x4V2<_ctype> { \ + _neon_type_v2 operator()(const _ctype* src) const { \ + _neon_type_v2 ret; \ + ret.val[0] = \ + vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \ + reinterpret_cast(src))); \ + ret.val[1] = ret.val[0]; \ + return ret; \ + } \ } -cb(dt_quint8, uint32_t, uint8x16_t, u8, u32); -cb(dt_int16, int64_t, int16x8_t, s16, s64); +cb(dt_quint8, uint32_t, uint8x16_t, u8, u32, uint8x16x2_t); +cb(dt_int16, int64_t, int16x8_t, s16, s64, int16x8x2_t); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -cb(__fp16, uint64_t, float16x8_t, f16, u64); +cb(__fp16, uint64_t, float16x8_t, f16, u64, float16x8x2_t); #endif #undef cb diff --git a/dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp b/dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp index 6c30af27..65038765 100644 --- a/dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp +++ b/dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp @@ -283,7 +283,7 @@ v4sf GiCosPsFloat32(v4sf x) { v4sf GiTanPsFloat32(v4sf x) { v4sf ysin, ycos; GiSinCosPsFloat32(x, &ysin, &ycos); - return ysin / ycos; + return GiDivFloat32(ysin, ycos); } #undef c_exp_hi diff --git a/dnn/src/fallback/elemwise_helper/kimpl/abs.h b/dnn/src/fallback/elemwise_helper/kimpl/abs.h index 5fe0263c..018afe9a 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/abs.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/abs.h @@ -20,22 +20,28 @@ struct AbsOpBase : UnaryOpBase { template struct AbsOp; -#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \ - template <> \ - struct AbsOp<_ctype> : AbsOpBase<_ctype> { \ - using AbsOpBase::AbsOpBase; \ - using AbsOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _gi_type& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - _gi_type operator()(const _gi_type& src) const { \ - auto vitem0 = GiAbs##_func_suffix(src.val[0]); \ - auto vitem1 = GiAbs##_func_suffix(src.val[1]); \ - return {{vitem0, vitem1}}; \ - } \ +#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \ + template <> \ + struct AbsOp<_ctype> : AbsOpBase<_ctype> { \ + using AbsOpBase::AbsOpBase; \ + using AbsOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _gi_type& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + _gi_type operator()(const _gi_type& src) const { \ + auto vitem0 = \ + GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \ + auto vitem1 = \ + GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \ + _gi_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ + } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(dt_float32)) OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(dt_int32)) @@ -64,11 +70,18 @@ struct AbsOp : AbsOpBase { OPERATOR_UNARY_QINT8_FALLBACK; } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale)); vitem0 = GiAbsFloat32(vitem0); vitem1 = GiAbsFloat32(vitem1); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/add.h b/dnn/src/fallback/elemwise_helper/kimpl/add.h index b0acd10c..4ffe0191 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/add.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/add.h @@ -33,13 +33,21 @@ struct AddOp; void operator()( \ const _gi_type2& src0, const _gi_type2& src1, dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _gi_type2 operator()(const _gi_type2& src0, const _gi_type2& src1) const { \ - auto vitem0 = GiAdd##_func_suffix(src0.val[0], src1.val[0]); \ - auto vitem1 = GiAdd##_func_suffix(src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ + auto vitem0 = GiAdd##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiAdd##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _gi_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ } \ void operator()( \ const _gi_type& src0, const _gi_type& src1, dst_ctype* dst) const { \ @@ -82,13 +90,24 @@ struct AddOp : AddOpBase { GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); - - return QConverter::convert({{vitem0, vitem1}}); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + + return QConverter::convert(ret); } }; @@ -119,12 +138,24 @@ struct AddOp : AddOpBase { GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); - return QConverter::convert({{vitem0, vitem1}}); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + + return QConverter::convert(ret); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/exp.h b/dnn/src/fallback/elemwise_helper/kimpl/exp.h index 4a6291dc..cb71dbeb 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/exp.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/exp.h @@ -23,22 +23,28 @@ struct ExpOpBase : UnaryOpBase { template struct ExpOp; -#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \ - template <> \ - struct ExpOp<_ctype> : ExpOpBase<_ctype> { \ - using ExpOpBase::ExpOpBase; \ - using ExpOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _simd_type& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - _simd_type operator()(const _simd_type& src) const { \ - auto vitem0 = GiExpPs##_func_suffix(src.val[0]); \ - auto vitem1 = GiExpPs##_func_suffix(src.val[1]); \ - return {{vitem0, vitem1}}; \ - } \ +#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \ + template <> \ + struct ExpOp<_ctype> : ExpOpBase<_ctype> { \ + using ExpOpBase::ExpOpBase; \ + using ExpOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _simd_type& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + _simd_type operator()(const _simd_type& src) const { \ + auto vitem0 = \ + GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \ + auto vitem1 = \ + GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \ + _simd_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ + } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) #undef OP diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h b/dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h index 9fb0f37e..8a4bc998 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h @@ -32,14 +32,15 @@ struct FastTanhOp; constexpr static size_t SIMD_WIDTH = _simd_width; \ void operator()(const _simd_type& src, _ctype* dst) const { \ auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type operator()(const _simd_type& src) const { \ auto val_27 = GiBroadcast##_func_suffix(27.f); \ auto val_9 = GiBroadcast##_func_suffix(9.f); \ - auto valx = src.val[0]; \ - auto valx1 = src.val[1]; \ + auto valx = GiGetSubVector##_func_suffix##V2(src, 0); \ + auto valx1 = GiGetSubVector##_func_suffix##V2(src, 1); \ auto valxp2 = GiMultiply##_fix_func_suffix(valx, valx); \ auto valx1p2 = GiMultiply##_fix_func_suffix(valx1, valx1); \ auto denominator = GiAdd##_fix_func_suffix(valxp2, val_27); \ @@ -58,7 +59,10 @@ struct FastTanhOp; r_denominator1); \ valx = GiMultiply##_fix_func_suffix(valx, r_denominator); \ valx1 = GiMultiply##_fix_func_suffix(valx1, r_denominator1); \ - return {{valx, valx1}}; \ + _simd_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, valx); \ + GiSetSubVector##_func_suffix##V2(ret, 1, valx1); \ + return ret; \ } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h index 987de10b..2b39db3a 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h @@ -36,19 +36,23 @@ struct FuseAddHSwishOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto val1 = src0.val[0]; \ - auto val2 = src0.val[1]; \ - auto val3 = src1.val[0]; \ - auto val4 = src1.val[1]; \ + auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ + auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ + auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ val1 = GiAdd##_func_suffix(val1, val3); \ val2 = GiAdd##_func_suffix(val2, val4); \ H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \ - return {{val1, val2}}; \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -98,15 +102,28 @@ struct FuseAddHSwishOp : FuseAddHSwishOpBasevscale_src0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src1))); vitem1 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src1))); H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1); - vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst); - vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst); - return QConverter::convert({{vitem0, vitem1}}); + vitem0 = + GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst)); + vitem1 = + GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst)); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + return QConverter::convert(ret); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h index 87ebb540..da0f5fb4 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h @@ -35,17 +35,21 @@ struct FuseAddReluOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto val1 = src0.val[0]; \ - auto val2 = src0.val[1]; \ - auto val3 = src1.val[0]; \ - auto val4 = src1.val[1]; \ + auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ + auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ + auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ FUSE_ADD_RELU_SIMD_PACK2_FALLBACK(val1, val2, val3, val4, _func_suffix); \ - return {{val1, val2}}; \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -105,15 +109,26 @@ struct FuseAddReluOp : FuseAddReluOpBase GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); vitem0 = GiMaximumFloat32(vitem0, this->vzero()); vitem1 = GiMaximumFloat32(vitem1, this->vzero()); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + return QConverter::convert(ret); } }; @@ -144,15 +159,26 @@ struct FuseAddReluOp : FuseAddReluOpBasevscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiAddFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); vitem0 = GiMaximumFloat32(vitem0, this->vzero()); vitem1 = GiMaximumFloat32(vitem1, this->vzero()); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + return QConverter::convert(ret); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h index 27924bdf..293fa97b 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h @@ -36,19 +36,23 @@ struct FuseAddSigmoidOp; const _simd_type& src0, const _simd_type& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \ - auto val1 = src0.val[0]; \ - auto val2 = src0.val[1]; \ - auto val3 = src1.val[0]; \ - auto val4 = src1.val[1]; \ + auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ + auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ + auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ val1 = GiAdd##_func_suffix(val1, val3); \ val2 = GiAdd##_func_suffix(val2, val4); \ val1 = GiSigmoidPs##_func_suffix(val1); \ val2 = GiSigmoidPs##_func_suffix(val2); \ - return {{val1, val2}}; \ + _simd_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h index 3efe2b09..1083cbac 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h @@ -35,14 +35,15 @@ struct FuseAddTanhOp; const _simd_type& src0, const _simd_type& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \ - auto val1 = src0.val[0]; \ - auto val2 = src0.val[1]; \ - auto val3 = src1.val[0]; \ - auto val4 = src1.val[1]; \ + auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ + auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ + auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ val1 = GiAdd##_func_suffix(val1, val3); \ val2 = GiAdd##_func_suffix(val2, val4); \ auto exp1 = GiExpPs##_func_suffix(val1); \ @@ -65,7 +66,10 @@ struct FuseAddTanhOp; GiRecpeS##_func_suffix(exp2, rexp2), rexp2); \ val1 = GiMultiply##_func_suffix(val1, rexp1); \ val2 = GiMultiply##_func_suffix(val2, rexp2); \ - return {{val1, val2}}; \ + _simd_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) diff --git a/dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h b/dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h index 348d10ab..0c38b75d 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h @@ -26,28 +26,36 @@ struct FuseMulAdd3OpBase : TernaryOpBase { template struct FuseMulAdd3Op; -#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \ - template <> \ - struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \ - using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \ - using FuseMulAdd3OpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()( \ - const _simd_type& src0, const _simd_type& src1, \ - const _simd_type& src2, dst_ctype* dst) const { \ - auto vitem = operator()(src0, src1, src2); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - _simd_type operator()( \ - const _simd_type& src0, const _simd_type& src1, \ - const _simd_type& src2) const { \ - auto vitem0 = GiMultiplyAdd##_func_suffix( \ - src2.val[0], src0.val[0], src1.val[0]); \ - auto vitem1 = GiMultiplyAdd##_func_suffix( \ - src2.val[1], src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ - } \ +#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \ + template <> \ + struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \ + using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \ + using FuseMulAdd3OpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()( \ + const _simd_type& src0, const _simd_type& src1, \ + const _simd_type& src2, dst_ctype* dst) const { \ + auto vitem = operator()(src0, src1, src2); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + _simd_type operator()( \ + const _simd_type& src0, const _simd_type& src1, \ + const _simd_type& src2) const { \ + auto vitem0 = GiMultiplyAdd##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src2, 0), \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiMultiplyAdd##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src2, 1), \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _simd_type ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ + } \ }; OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t)) diff --git a/dnn/src/fallback/elemwise_helper/kimpl/hswish.h b/dnn/src/fallback/elemwise_helper/kimpl/hswish.h index c6e8663f..0d67b14c 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/hswish.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/hswish.h @@ -26,39 +26,43 @@ struct HSwishOpBase : UnaryOpBase { template struct HSwishOp; -#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ - template <> \ - struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \ - using HSwishOpBase::HSwishOpBase; \ - using HSwishOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _simd_type2& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - void operator()(const _simd_type& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem); \ - } \ - _simd_type2 operator()(const _simd_type2& src) const { \ - auto val1 = src.val[0]; \ - auto val2 = src.val[1]; \ - H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \ - return {{val1, val2}}; \ - } \ - _simd_type operator()(const _simd_type& src) const { \ - auto val_zero = GiBroadcast##_func_suffix(0.f); \ - auto val_six = GiBroadcast##_func_suffix(6.f); \ - auto val_three = GiBroadcast##_func_suffix(3.f); \ - auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \ - auto clip1 = GiMaximum##_func_suffix( \ - GiMinimum##_func_suffix( \ - GiAdd##_func_suffix(src, val_three), val_six), \ - val_zero); \ - return GiMultiply##_func_suffix( \ - GiMultiply##_func_suffix(src, clip1), val_rec_six); \ - } \ +#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ + template <> \ + struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \ + using HSwishOpBase::HSwishOpBase; \ + using HSwishOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _simd_type2& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + void operator()(const _simd_type& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, vitem); \ + } \ + _simd_type2 operator()(const _simd_type2& src) const { \ + auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \ + H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ + } \ + _simd_type operator()(const _simd_type& src) const { \ + auto val_zero = GiBroadcast##_func_suffix(0.f); \ + auto val_six = GiBroadcast##_func_suffix(6.f); \ + auto val_three = GiBroadcast##_func_suffix(3.f); \ + auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \ + auto clip1 = GiMaximum##_func_suffix( \ + GiMinimum##_func_suffix( \ + GiAdd##_func_suffix(src, val_three), val_six), \ + val_zero); \ + return GiMultiply##_func_suffix( \ + GiMultiply##_func_suffix(src, clip1), val_rec_six); \ + } \ }; OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) @@ -90,14 +94,23 @@ struct HSwishOp : HSwishOpBase { } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src)); H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1); - vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst); - vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst); + vitem0 = + GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst)); + vitem1 = + GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst)); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); - return QConverter::convert({{vitem0, vitem1}}); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/max.h b/dnn/src/fallback/elemwise_helper/kimpl/max.h index 025c08b6..2776dce3 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/max.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/max.h @@ -32,14 +32,22 @@ struct MaxOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto vitem0 = GiMaximum##_func_suffix(src0.val[0], src1.val[0]); \ - auto vitem1 = GiMaximum##_func_suffix(src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ + auto vitem0 = GiMaximum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiMaximum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -87,12 +95,23 @@ struct MaxOp : MaxOpBase { GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiMaximumFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiMaximumFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); - return QConverter::convert({{vitem0, vitem1}}); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/min.h b/dnn/src/fallback/elemwise_helper/kimpl/min.h index edac0104..c9e0e33e 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/min.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/min.h @@ -33,14 +33,22 @@ struct MinOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto vitem0 = GiMinimum##_func_suffix(src0.val[0], src1.val[0]); \ - auto vitem1 = GiMinimum##_func_suffix(src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ + auto vitem0 = GiMinimum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiMinimum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -84,12 +92,23 @@ struct MinOp : MinOpBase { GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiMinimumFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiMinimumFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); - return QConverter::convert({{vitem0, vitem1}}); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/mul.h b/dnn/src/fallback/elemwise_helper/kimpl/mul.h index dc58f5ac..2dd84b99 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/mul.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/mul.h @@ -33,14 +33,22 @@ struct MulOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto vitem0 = GiMultiply##_func_suffix(src0.val[0], src1.val[0]); \ - auto vitem1 = GiMultiply##_func_suffix(src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ + auto vitem0 = GiMultiply##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiMultiply##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -83,13 +91,24 @@ struct MulOp : MulOpBase { GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiMultiplyFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiMultiplyFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/none.h b/dnn/src/fallback/elemwise_helper/kimpl/none.h index 9c20e510..b02aba10 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/none.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/none.h @@ -16,23 +16,24 @@ struct NoneOpBase : UnaryOpBase { template struct NoneOp; -#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ - template <> \ - struct NoneOp<_ctype> : NoneOpBase<_ctype> { \ - NoneOp(){}; \ - NoneOp(float, float){}; \ - using NoneOpBase::NoneOpBase; \ - using NoneOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - _simd_type2 operator()(const _simd_type2& src) const { return src; } \ - void operator()(const _simd_type2& src, _ctype* dst) const { \ - GiStore##_func_suffix(dst, src.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, src.val[1]); \ - } \ - void operator()(const _simd_type& src, _ctype* dst) const { \ - GiStore##_func_suffix(dst, src); \ - } \ - _simd_type operator()(const _simd_type& src) const { return src; } \ +#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ + template <> \ + struct NoneOp<_ctype> : NoneOpBase<_ctype> { \ + NoneOp(){}; \ + NoneOp(float, float){}; \ + using NoneOpBase::NoneOpBase; \ + using NoneOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + _simd_type2 operator()(const _simd_type2& src) const { return src; } \ + void operator()(const _simd_type2& src, _ctype* dst) const { \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(src, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(src, 1)); \ + } \ + void operator()(const _simd_type& src, _ctype* dst) const { \ + GiStore##_func_suffix(dst, src); \ + } \ + _simd_type operator()(const _simd_type& src) const { return src; } \ }; OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) @@ -61,8 +62,8 @@ struct NoneOp : NoneOpBase { constexpr static size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t); void operator()(const GI_INT32_V2_t& vsrc, dt_qint8* dst) const { - GiStoreInt32(dst, vsrc.val[0]); - GiStoreInt32(dst + 16, vsrc.val[1]); + GiStoreInt32(dst, GiGetSubVectorInt32V2(vsrc, 0)); + GiStoreInt32(dst + 16, GiGetSubVectorInt32V2(vsrc, 1)); } void operator()(const GI_INT32_t& src, dt_qint8* dst) const { GiStoreInt32(dst, src); diff --git a/dnn/src/fallback/elemwise_helper/kimpl/op_base.h b/dnn/src/fallback/elemwise_helper/kimpl/op_base.h index e5b89972..8b7b21d3 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/op_base.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/op_base.h @@ -31,24 +31,24 @@ struct UnaryOpBase : OpBase { UnaryOpBase(DType /*src_dtype*/, DType /*dst_dtype*/) {} }; -#define OPERATOR_UNARY_QINT8_FALLBACK \ - GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst), operator()( \ - {{GiMoveLowLongInt16(vsrct0), \ - GiMoveHighLongInt16(vsrct0)}})); \ - GI_INT16_t vsrct1 = GiMoveHighLongInt8(vsrc.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 8), \ - operator()({{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}})); \ - GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 16), \ - operator()({{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \ - GI_INT16_t vsrct3 = GiMoveHighLongInt8(vsrc.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 24), \ - operator()({{GiMoveLowLongInt16(vsrct3), GiMoveHighLongInt16(vsrct3)}})) +#define OPERATOR_UNARY_QINT8_FALLBACK \ + GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \ + GI_INT32_V2_t tmp; \ + GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct0)); \ + GiStoreLowInt8(reinterpret_cast(dst), operator()(tmp)); \ + GI_INT16_t vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \ + GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct1)); \ + GiStoreLowInt8(reinterpret_cast(dst + 8), operator()(tmp)); \ + GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \ + GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct2)); \ + GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct2)); \ + GiStoreLowInt8(reinterpret_cast(dst + 16), operator()(tmp)); \ + GI_INT16_t vsrct3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \ + GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct3)); \ + GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct3)); \ + GiStoreLowInt8(reinterpret_cast(dst + 24), operator()(tmp)) //! scale_src = src.scale; scale_dst = 1.f / dst.scale (div -> mul) //! scale = src.scale / dst.scale @@ -56,17 +56,17 @@ template <> struct UnaryOpBase : OpBase { using OpBase::OpBase; float scale_src, scale_dst; - GI_FLOAT32_t vscale_src, vscale_dst; + GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst; float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; void init(float src_scale, float dst_scale) { scale_src = src_scale; - vscale_src = GiBroadcastFloat32(scale_src); + vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src)); scale_dst = 1.f / dst_scale; - vscale_dst = GiBroadcastFloat32(scale_dst); + vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } UnaryOpBase(DType src_dtype, DType dst_dtype) { @@ -83,17 +83,17 @@ struct UnaryOpBase : OpBase { using src_ctype = dt_qint32; using dst_ctype = dt_qint8; float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; float scale_src, scale_dst; - GI_FLOAT32_t vscale_src, vscale_dst; + GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst; void init(float src_scale, float dst_scale) { scale_src = src_scale; - vscale_src = GiBroadcastFloat32(src_scale); + vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(src_scale)); scale_dst = 1 / dst_scale; - vscale_dst = GiBroadcastFloat32(scale_dst); + vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } UnaryOpBase(DType src_dtype, DType dst_dtype) { @@ -115,35 +115,36 @@ struct BinaryOpBase : OpBase { /* ================= binary op for quantized types ================== */ -#define OPERATOR_BINARY_QINT8_FALLBACK \ - GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(vsrc0.val[0]); \ - GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(vsrc1.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0_0), GiMoveHighLongInt16(vsrct0_0)}}, \ - {{GiMoveLowLongInt16(vsrct1_0), GiMoveHighLongInt16(vsrct1_0)}})); \ - GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(vsrc0.val[0]); \ - GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(vsrc1.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 8), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0_1), GiMoveHighLongInt16(vsrct0_1)}}, \ - {{GiMoveLowLongInt16(vsrct1_1), GiMoveHighLongInt16(vsrct1_1)}})); \ - GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(vsrc0.val[1]); \ - GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(vsrc1.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 16), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0_2), GiMoveHighLongInt16(vsrct0_2)}}, \ - {{GiMoveLowLongInt16(vsrct1_2), GiMoveHighLongInt16(vsrct1_2)}})); \ - GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(vsrc0.val[1]); \ - GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(vsrc1.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 24), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0_3), GiMoveHighLongInt16(vsrct0_3)}}, \ - {{GiMoveLowLongInt16(vsrct1_3), GiMoveHighLongInt16(vsrct1_3)}})) +#define OPERATOR_BINARY_QINT8_FALLBACK \ + GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \ + GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \ + GI_INT32_V2_t tmp0, tmp1; \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_0)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_0)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_0)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_0)); \ + GiStoreLowInt8(reinterpret_cast(dst), operator()(tmp0, tmp1)); \ + GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \ + GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_1)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_1)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_1)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_1)); \ + GiStoreLowInt8(reinterpret_cast(dst + 8), operator()(tmp0, tmp1)); \ + GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \ + GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_2)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_2)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_2)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_2)); \ + GiStoreLowInt8(reinterpret_cast(dst + 16), operator()(tmp0, tmp1)); \ + GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \ + GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_3)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_3)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_3)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_3)); \ + GiStoreLowInt8(reinterpret_cast(dst + 24), operator()(tmp0, tmp1)); //! scale_src0 = src0.scale; scale_src1 = src1.scale; scale_dst = 1.f / //! dst.scale scale0 = src0.scale / dst.scale; scale1 = src1.scale / dst.scale @@ -153,21 +154,21 @@ struct BinaryOpBase : OpBase { using src_ctype = dt_qint8; using dst_ctype = dt_qint8; float scale_src0, scale_src1, scale_dst; - GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst; + GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst; float scale0, scale1; - GI_FLOAT32_t vscale0, vscale1; + GI_FLOAT32_FIXLEN_t vscale0, vscale1; void init(float src0_scale, float src1_scale, float dst_scale) { scale_src0 = src0_scale; - vscale_src0 = GiBroadcastFloat32(scale_src0); + vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0)); scale_src1 = src1_scale; - vscale_src1 = GiBroadcastFloat32(scale_src1); + vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1)); scale_dst = 1.f / dst_scale; - vscale_dst = GiBroadcastFloat32(scale_dst); + vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); scale0 = src0_scale / dst_scale; - vscale0 = GiBroadcastFloat32(scale0); + vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0)); scale1 = src1_scale / dst_scale; - vscale1 = GiBroadcastFloat32(scale1); + vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1)); } BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) { @@ -188,21 +189,21 @@ struct BinaryOpBase : OpBase { using src_ctype = dt_qint32; using dst_ctype = dt_qint8; float scale0, scale1; - GI_FLOAT32_t vscale0, vscale1; + GI_FLOAT32_FIXLEN_t vscale0, vscale1; float scale_src0, scale_src1, scale_dst; - GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst; + GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst; void init(float src0_scale, float src1_scale, float dst_scale) { scale_src0 = src0_scale; - vscale_src0 = GiBroadcastFloat32(src0_scale); + vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src0_scale)); scale_src1 = src1_scale; - vscale_src1 = GiBroadcastFloat32(src1_scale); + vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src1_scale)); scale_dst = 1 / dst_scale; - vscale_dst = GiBroadcastFloat32(scale_dst); + vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); scale0 = src0_scale / dst_scale; - vscale0 = GiBroadcastFloat32(scale0); + vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0)); scale1 = src1_scale / dst_scale; - vscale1 = GiBroadcastFloat32(scale1); + vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1)); } BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) { @@ -227,43 +228,48 @@ struct TernaryOpBase : OpBase { DType /*dst_dtype*/) {} }; -#define OPERATOR_TERNARY_QINT8_FALLBACK \ - GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc0.val[0]); \ - GI_INT16_t vsrct1 = GiMoveLowLongInt8(vsrc1.val[0]); \ - GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc2.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \ - {{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \ - {{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \ - vsrct0 = GiMoveHighLongInt8(vsrc0.val[0]); \ - vsrct1 = GiMoveHighLongInt8(vsrc1.val[0]); \ - vsrct2 = GiMoveHighLongInt8(vsrc2.val[0]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 8), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \ - {{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \ - {{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \ - vsrct0 = GiMoveLowLongInt8(vsrc0.val[1]); \ - vsrct1 = GiMoveLowLongInt8(vsrc1.val[1]); \ - vsrct2 = GiMoveLowLongInt8(vsrc2.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 16), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \ - {{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \ - {{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \ - vsrct0 = GiMoveHighLongInt8(vsrc0.val[1]); \ - vsrct1 = GiMoveHighLongInt8(vsrc1.val[1]); \ - vsrct2 = GiMoveHighLongInt8(vsrc2.val[1]); \ - GiStoreLowInt8( \ - reinterpret_cast(dst + 24), \ - operator()( \ - {{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \ - {{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \ - {{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})) +#define OPERATOR_TERNARY_QINT8_FALLBACK \ + GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \ + GI_INT16_t vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \ + GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 0)); \ + GI_INT32_V2_t tmp0, tmp1, tmp2; \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \ + GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \ + GiStoreLowInt8(reinterpret_cast(dst), operator()(tmp0, tmp1, tmp2)); \ + vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \ + vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \ + vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 0)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \ + GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \ + GiStoreLowInt8(reinterpret_cast(dst + 8), operator()(tmp0, tmp1, tmp2)); \ + vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \ + vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \ + vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 1)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \ + GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \ + GiStoreLowInt8(reinterpret_cast(dst + 16), operator()(tmp0, tmp1, tmp2)); \ + vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \ + vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \ + vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 1)); \ + GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \ + GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \ + GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \ + GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \ + GiStoreLowInt8(reinterpret_cast(dst + 24), operator()(tmp0, tmp1, tmp2)); /*========================= ternaty op for quanzited ====================*/ template <> @@ -272,24 +278,24 @@ struct TernaryOpBase : OpBase { using src_ctype = dt_qint8; using dst_ctype = dt_qint8; float scale_src0, scale_src1, scale_src2, scale_dst; - GI_FLOAT32_t vscale_src0, vscale_src1, vscale_src2, vscale_dst; + GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_src2, vscale_dst; float scale0, scale1, scale2; - GI_FLOAT32_t vscale0, vscale1, vscale2; + GI_FLOAT32_FIXLEN_t vscale0, vscale1, vscale2; void init(float src0_scale, float src1_scale, float src2_scale, float dst_scale) { scale_src0 = src0_scale; scale_src1 = src1_scale; scale_src2 = src2_scale; scale_dst = 1.f / dst_scale; - vscale_src0 = GiBroadcastFloat32(scale_src0); - vscale_src1 = GiBroadcastFloat32(scale_src1); - vscale_src2 = GiBroadcastFloat32(scale_src2); - vscale_dst = GiBroadcastFloat32(scale_dst); + vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0)); + vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1)); + vscale_src2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src2)); + vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); scale0 = src0_scale / dst_scale; scale1 = src1_scale / dst_scale; scale2 = src2_scale / dst_scale; - vscale0 = GiBroadcastFloat32(scale0); - vscale1 = GiBroadcastFloat32(scale1); - vscale2 = GiBroadcastFloat32(scale2); + vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0)); + vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1)); + vscale2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale2)); } TernaryOpBase( DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype) { @@ -307,7 +313,7 @@ struct TernaryOpBase : OpBase { ////////////////////////// fixup ////////////////////////// struct FixupBase { - GI_INT32_t vmultiplier, vshift; + GI_INT32_FIXLEN_t vmultiplier, vshift; FixupBase(float scale) { //! ignore Fixup if scale >= 0.5, using typecvt instead of shift & //! multiplier, as it may introduce errors. @@ -317,9 +323,9 @@ struct FixupBase { int shift = static_cast(::ceilf(::log2f(0.5 / scale))); scale *= ::powf(2, shift); //! Using double can get full precision here, but it can be ignored. - vmultiplier = GiBroadcastInt32( - std::round(static_cast(scale) * ((2LL) << 30))); - vshift = GiBroadcastInt32(-shift); + vmultiplier = GiInt32Type2FixLenType(GiBroadcastInt32( + std::round(static_cast(scale) * ((2LL) << 30)))); + vshift = GiInt32Type2FixLenType(GiBroadcastInt32(-shift)); } }; @@ -349,11 +355,25 @@ struct UnaryQuantizationOp : UnaryOpBasevscale_src); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src); - auto val = this->op({{vitem0, vitem1}}); - val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); - val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src)); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + + auto val = this->op(tmp); + GI_FLOAT32_t a = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 0), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GI_FLOAT32_t b = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 1), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GiSetSubVectorFloat32V2(val, 0, a); + GiSetSubVectorFloat32V2(val, 1, b); return QConverter::convert(val); } }; @@ -385,13 +405,32 @@ struct BinaryQuantizationOp : BinaryOpBasevscale_src0); - auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0); - auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1); - auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1); - auto val = op({{val0, val1}}, {{val2, val3}}); - val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); - val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); + auto val0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src0)); + auto val1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src0)); + auto val2 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src1)); + auto val3 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src1)); + GI_FLOAT32_V2_t tmp0, tmp1; + GiSetSubVectorFloat32V2(tmp0, 0, val0); + GiSetSubVectorFloat32V2(tmp0, 1, val1); + GiSetSubVectorFloat32V2(tmp1, 0, val2); + GiSetSubVectorFloat32V2(tmp1, 1, val3); + auto val = op(tmp0, tmp1); + GI_FLOAT32_t a = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 0), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GI_FLOAT32_t b = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 1), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GiSetSubVectorFloat32V2(val, 0, a); + GiSetSubVectorFloat32V2(val, 1, b); return QConverter::convert(val); } }; @@ -431,15 +470,40 @@ struct TernaryQuantizationOp GI_INT8_t operator()( const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1, const GI_INT32_V2_t& vsrc2) const { - auto val0 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0); - auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0); - auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1); - auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1); - auto val4 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[0]), this->vscale_src2); - auto val5 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[1]), this->vscale_src2); - auto val = op({{val0, val1}}, {{val2, val3}}, {{val4, val5}}); - val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); - val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); + auto val0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src0)); + auto val1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src0)); + auto val2 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src1)); + auto val3 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src1)); + auto val4 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 0)), + GiFixLenType2GiFloat32Type(this->vscale_src2)); + auto val5 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 1)), + GiFixLenType2GiFloat32Type(this->vscale_src2)); + GI_FLOAT32_V2_t tmp0, tmp1, tmp2; + GiSetSubVectorFloat32V2(tmp0, 0, val0); + GiSetSubVectorFloat32V2(tmp0, 1, val1); + GiSetSubVectorFloat32V2(tmp1, 0, val2); + GiSetSubVectorFloat32V2(tmp1, 1, val3); + GiSetSubVectorFloat32V2(tmp2, 0, val4); + GiSetSubVectorFloat32V2(tmp2, 1, val5); + auto val = op(tmp0, tmp1, tmp2); + GI_FLOAT32_t a = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 0), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GI_FLOAT32_t b = GiMultiplyFloat32( + GiGetSubVectorFloat32V2(val, 1), + GiFixLenType2GiFloat32Type(this->vscale_dst)); + GiSetSubVectorFloat32V2(val, 0, a); + GiSetSubVectorFloat32V2(val, 1, b); return QConverter::convert(val); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/relu.h b/dnn/src/fallback/elemwise_helper/kimpl/relu.h index ffddb422..4e419abc 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/relu.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/relu.h @@ -20,37 +20,43 @@ struct ReluOpBase : UnaryOpBase { template struct ReluOp; -#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero) \ - template <> \ - struct ReluOp<_ctype> : ReluOpBase<_ctype> { \ - using ReluOpBase::ReluOpBase; \ - using ReluOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _simd_type2& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - _simd_type2 operator()(const _simd_type2& src) const { \ - auto vitem0 = GiMaximum##_func_suffix(src.val[0], zero); \ - auto vitem1 = GiMaximum##_func_suffix(src.val[1], zero); \ - return {{vitem0, vitem1}}; \ - } \ - void operator()(const _simd_type& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem); \ - } \ - _simd_type operator()(const _simd_type& src) const { \ - return GiMaximum##_func_suffix(src, zero); \ - } \ +#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero_num) \ + template <> \ + struct ReluOp<_ctype> : ReluOpBase<_ctype> { \ + using ReluOpBase::ReluOpBase; \ + using ReluOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _simd_type2& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + _simd_type2 operator()(const _simd_type2& src) const { \ + _simd_type zero = GiBroadcast##_func_suffix(zero_num); \ + auto vitem0 = GiMaximum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src, 0), zero); \ + auto vitem1 = GiMaximum##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src, 1), zero); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ + } \ + void operator()(const _simd_type& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, vitem); \ + } \ + _simd_type operator()(const _simd_type& src) const { \ + _simd_type zero = GiBroadcast##_func_suffix(zero_num); \ + return GiMaximum##_func_suffix(src, zero); \ + } \ }; OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float), - vfzero) -OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), - vzero) -OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), - vzero_int8) + 0.0f) +OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), 0) +OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), 0) #undef OP template <> @@ -76,11 +82,19 @@ struct ReluOp : ReluOpBase { OPERATOR_UNARY_QINT8_FALLBACK; } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale)); vitem0 = GiMaximumFloat32(vitem0, vfzero); vitem1 = GiMaximumFloat32(vitem1, vfzero); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; @@ -104,6 +118,8 @@ template <> struct ReluOp : ReluOpBase, FixupBase { using ReluOpBase::operator(); constexpr static size_t SIMD_WIDTH = 4; + GI_INT32_t vzero = GiBroadcastInt32(0); + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); ReluOp(DType src_dtype, DType dst_dtype) : ReluOpBase(src_dtype, dst_dtype), FixupBase(scale) {} @@ -115,8 +131,8 @@ struct ReluOp : ReluOpBase, FixupBase vst1_s8(reinterpret_cast(dst), vget_low_s8(operator()(vsrc))); } int8x16_t operator()(const int32x4x2_t& vsrc) const { - int32x4_t vitem0 = vqrdmulhq_s32(vsrc.val[0], vmultiplier); - int32x4_t vitem1 = vqrdmulhq_s32(vsrc.val[1], vmultiplier); + int32x4_t vitem0 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 0), vmultiplier); + int32x4_t vitem1 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 1), vmultiplier); vitem0 = vmaxq_s32(vitem0, vzero); vitem1 = vmaxq_s32(vitem1, vzero); auto tmp = vqmovn_s16(vcombine_s16( @@ -158,24 +174,36 @@ struct ReluOp : ReluOpBase { } void operator()(const GI_INT32_t& src, dt_qint8* dst) const { GiStoreLane0Int32( - reinterpret_cast(dst), (GI_INT32_t)(operator()(src))); + reinterpret_cast(dst), + GiReinterpretInt8AsInt32(operator()(src))); } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale)); + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); vitem0 = GiMaximumFloat32(vitem0, vfzero); vitem1 = GiMaximumFloat32(vitem1, vfzero); - return QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } GI_INT8_t operator()(const GI_INT32_t& src) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale)); + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); vitem0 = GiMaximumFloat32(vitem0, vfzero); return QConverter::convert(vitem0); } GI_INT8_t operator()(const GI_FLOAT32_t& src) const { - auto vitem0 = GiMultiplyFloat32(src, this->vscale); + auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale)); + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); vitem0 = GiMaximumFloat32(vitem0, vfzero); return QConverter::convert(vitem0); } diff --git a/dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h b/dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h index 81930dae..1ca2a630 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h @@ -25,27 +25,33 @@ struct SigmoidOpBase : UnaryOpBase { template struct SigmoidOp; -#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ - template <> \ - struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \ - using SigmoidOpBase::SigmoidOpBase; \ - using SigmoidOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _simd_type2& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - void operator()(const _simd_type& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem); \ - } \ - _simd_type2 operator()(const _simd_type2& src) const { \ - return {{operator()(src.val[0]), operator()(src.val[1])}}; \ - } \ - _simd_type operator()(const _simd_type& src) const { \ - return GiSigmoidPs##_func_suffix(src); \ - } \ +#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ + template <> \ + struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \ + using SigmoidOpBase::SigmoidOpBase; \ + using SigmoidOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _simd_type2& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + void operator()(const _simd_type& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, vitem); \ + } \ + _simd_type2 operator()(const _simd_type2& src) const { \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2( \ + ret, 0, operator()(GiGetSubVector##_func_suffix##V2(src, 0))); \ + GiSetSubVector##_func_suffix##V2( \ + ret, 1, operator()(GiGetSubVector##_func_suffix##V2(src, 1))); \ + return ret; \ + } \ + _simd_type operator()(const _simd_type& src) const { \ + return GiSigmoidPs##_func_suffix(src); \ + } \ }; OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) #undef OP diff --git a/dnn/src/fallback/elemwise_helper/kimpl/sub.h b/dnn/src/fallback/elemwise_helper/kimpl/sub.h index c898225b..384b4998 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/sub.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/sub.h @@ -33,14 +33,22 @@ struct SubOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto vitem0 = GiSubtract##_func_suffix(src0.val[0], src1.val[0]); \ - auto vitem1 = GiSubtract##_func_suffix(src0.val[1], src1.val[1]); \ - return {{vitem0, vitem1}}; \ + auto vitem0 = GiSubtract##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 0), \ + GiGetSubVector##_func_suffix##V2(src1, 0)); \ + auto vitem1 = GiSubtract##_func_suffix( \ + GiGetSubVector##_func_suffix##V2(src0, 1), \ + GiGetSubVector##_func_suffix##V2(src1, 1)); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ + GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ @@ -82,12 +90,23 @@ struct SubOp : SubOpBase { } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { auto vitem0 = GiSubtractFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), + GiFixLenType2GiFloat32Type(this->vscale1))); auto vitem1 = GiSubtractFloat32( - GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), - GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); - return QConverter::convert({{vitem0, vitem1}}); + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), + GiFixLenType2GiFloat32Type(this->vscale0)), + GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), + GiFixLenType2GiFloat32Type(this->vscale1))); + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } }; diff --git a/dnn/src/fallback/elemwise_helper/kimpl/tanh.h b/dnn/src/fallback/elemwise_helper/kimpl/tanh.h index bfbb7091..5da978c2 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/tanh.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/tanh.h @@ -23,54 +23,58 @@ struct TanhOpBase : UnaryOpBase { template struct TanhOp; -#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ - template <> \ - struct TanhOp<_ctype> : TanhOpBase<_ctype> { \ - using TanhOpBase::TanhOpBase; \ - using TanhOpBase::operator(); \ - constexpr static size_t SIMD_WIDTH = _simd_width; \ - void operator()(const _simd_type2& src, _ctype* dst) const { \ - auto vitem = operator()(src); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ - } \ - _simd_type2 operator()(const _simd_type2& src) const { \ - auto one_val = GiBroadcast##_func_suffix(1.f); \ - auto two_val = GiBroadcast##_func_suffix(2.f); \ - auto val1 = src.val[0]; \ - auto val2 = src.val[1]; \ - val1 = GiMultiply##_func_suffix(two_val, val1); \ - val2 = GiMultiply##_func_suffix(two_val, val2); \ - val1 = GiExpPs##_func_suffix(val1); \ - val2 = GiExpPs##_func_suffix(val2); \ - val1 = GiAdd##_func_suffix(one_val, val1); \ - val2 = GiAdd##_func_suffix(one_val, val2); \ - auto rval1 = GiRecpe##_func_suffix(val1); \ - auto rval2 = GiRecpe##_func_suffix(val2); \ - rval1 = GiMultiply##_func_suffix( \ - GiRecpeS##_func_suffix(val1, rval1), rval1); \ - rval2 = GiMultiply##_func_suffix( \ - GiRecpeS##_func_suffix(val2, rval2), rval2); \ - val1 = GiMultiply##_func_suffix(two_val, rval1); \ - val2 = GiMultiply##_func_suffix(two_val, rval2); \ - val1 = GiSubtract##_func_suffix(one_val, val1); \ - val2 = GiSubtract##_func_suffix(one_val, val2); \ - return {{val1, val2}}; \ - } \ - _simd_type operator()(const _simd_type& src) const { \ - auto one_val = GiBroadcast##_func_suffix(1.f); \ - auto two_val = GiBroadcast##_func_suffix(2.f); \ - auto val1 = src; \ - val1 = GiMultiply##_func_suffix(two_val, val1); \ - val1 = GiExpPs##_func_suffix(val1); \ - val1 = GiAdd##_func_suffix(one_val, val1); \ - auto rval1 = GiRecpe##_func_suffix(val1); \ - rval1 = GiMultiply##_func_suffix( \ - GiRecpeS##_func_suffix(val1, rval1), rval1); \ - val1 = GiMultiply##_func_suffix(two_val, rval1); \ - val1 = GiSubtract##_func_suffix(one_val, val1); \ - return val1; \ - } \ +#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ + template <> \ + struct TanhOp<_ctype> : TanhOpBase<_ctype> { \ + using TanhOpBase::TanhOpBase; \ + using TanhOpBase::operator(); \ + constexpr static size_t SIMD_WIDTH = _simd_width; \ + void operator()(const _simd_type2& src, _ctype* dst) const { \ + auto vitem = operator()(src); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ + } \ + _simd_type2 operator()(const _simd_type2& src) const { \ + auto one_val = GiBroadcast##_func_suffix(1.f); \ + auto two_val = GiBroadcast##_func_suffix(2.f); \ + auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \ + val1 = GiMultiply##_func_suffix(two_val, val1); \ + val2 = GiMultiply##_func_suffix(two_val, val2); \ + val1 = GiExpPs##_func_suffix(val1); \ + val2 = GiExpPs##_func_suffix(val2); \ + val1 = GiAdd##_func_suffix(one_val, val1); \ + val2 = GiAdd##_func_suffix(one_val, val2); \ + auto rval1 = GiRecpe##_func_suffix(val1); \ + auto rval2 = GiRecpe##_func_suffix(val2); \ + rval1 = GiMultiply##_func_suffix( \ + GiRecpeS##_func_suffix(val1, rval1), rval1); \ + rval2 = GiMultiply##_func_suffix( \ + GiRecpeS##_func_suffix(val2, rval2), rval2); \ + val1 = GiMultiply##_func_suffix(two_val, rval1); \ + val2 = GiMultiply##_func_suffix(two_val, rval2); \ + val1 = GiSubtract##_func_suffix(one_val, val1); \ + val2 = GiSubtract##_func_suffix(one_val, val2); \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ + } \ + _simd_type operator()(const _simd_type& src) const { \ + auto one_val = GiBroadcast##_func_suffix(1.f); \ + auto two_val = GiBroadcast##_func_suffix(2.f); \ + auto val1 = src; \ + val1 = GiMultiply##_func_suffix(two_val, val1); \ + val1 = GiExpPs##_func_suffix(val1); \ + val1 = GiAdd##_func_suffix(one_val, val1); \ + auto rval1 = GiRecpe##_func_suffix(val1); \ + rval1 = GiMultiply##_func_suffix( \ + GiRecpeS##_func_suffix(val1, rval1), rval1); \ + val1 = GiMultiply##_func_suffix(two_val, rval1); \ + val1 = GiSubtract##_func_suffix(one_val, val1); \ + return val1; \ + } \ }; OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) #undef OP diff --git a/dnn/src/fallback/elemwise_helper/kimpl/true_div.h b/dnn/src/fallback/elemwise_helper/kimpl/true_div.h index 71817845..85793143 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/true_div.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/true_div.h @@ -36,18 +36,22 @@ struct TrueDivOp; const _simd_type2& src0, const _simd_type2& src1, \ dst_ctype* dst) const { \ auto vitem = operator()(src0, src1); \ - GiStore##_func_suffix(dst, vitem.val[0]); \ - GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ + GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ + GiStore##_func_suffix( \ + dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ } \ _simd_type2 operator()( \ const _simd_type2& src0, const _simd_type2& src1) const { \ - auto val1 = src0.val[0]; \ - auto val2 = src0.val[1]; \ - auto val3 = src1.val[0]; \ - auto val4 = src1.val[1]; \ + auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ + auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ + auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ + auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ val1 = GiDivide##_func_suffix(val1, val3); \ val2 = GiDivide##_func_suffix(val2, val4); \ - return {{val1, val2}}; \ + _simd_type2 ret; \ + GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ + GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ + return ret; \ } \ void operator()( \ const _simd_type& src0, const _simd_type& src1, \ diff --git a/dnn/src/fallback/elemwise_helper/kimpl/typecvt.h b/dnn/src/fallback/elemwise_helper/kimpl/typecvt.h index 32b9edf8..1d1b32d8 100644 --- a/dnn/src/fallback/elemwise_helper/kimpl/typecvt.h +++ b/dnn/src/fallback/elemwise_helper/kimpl/typecvt.h @@ -21,7 +21,8 @@ struct TypeCvtOp : UnaryOpBase { } void operator()(const GI_INT32_t& vsrc, dt_qint8* dst) const { GiStoreLane0Int32( - reinterpret_cast(dst), (GI_INT32_t)(operator()(vsrc))); + reinterpret_cast(dst), + GiReinterpretInt8AsInt32(operator()(vsrc))); } void operator()(const src_ctype& src, dst_ctype* dst) const { *dst = operator()(src); @@ -32,17 +33,25 @@ struct TypeCvtOp : UnaryOpBase { } GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); - auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); - - return QConverter::convert({{vitem0, vitem1}}); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), + GiFixLenType2GiFloat32Type(this->vscale)); + auto vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), + GiFixLenType2GiFloat32Type(this->vscale)); + + GI_FLOAT32_V2_t tmp; + GiSetSubVectorFloat32V2(tmp, 0, vitem0); + GiSetSubVectorFloat32V2(tmp, 1, vitem1); + return QConverter::convert(tmp); } GI_INT8_t operator()(const GI_INT32_t& src) const { - auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale); + auto vitem0 = GiMultiplyFloat32( + GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale)); return QConverter::convert(vitem0); } GI_INT8_t operator()(const GI_FLOAT32_t& src) const { - auto vitem0 = GiMultiplyFloat32(src, this->vscale); + auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale)); return QConverter::convert(vitem0); } }; diff --git a/dnn/src/fallback/elemwise_helper/op_common.h b/dnn/src/fallback/elemwise_helper/op_common.h index bda1b385..f0ca73eb 100644 --- a/dnn/src/fallback/elemwise_helper/op_common.h +++ b/dnn/src/fallback/elemwise_helper/op_common.h @@ -96,6 +96,82 @@ cb(dt_float32, float, GI_FLOAT32_t, Float32); cb(dt_int32, int32_t, GI_INT32_t, Int32); #undef cb +///////////////////////////////// ParamElemVistor v2/////////////////////////// +template +struct ParamElemVisitorV2; + +//! visitor single elemwise, and dup to vector +template +struct ParamElemVisitorDupV2; + +template +struct ParamElemVisitorBcast101x4V2; + +#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \ + template <> \ + struct ParamElemVisitorV2<_ctype> { \ + _simd_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \ + _simd_type_v2 ret; \ + GiSetSubVector##_fun_suffix##V2(ret, 0, GiLoad##_fun_suffix(src)); \ + GiSetSubVector##_fun_suffix##V2(ret, 1, GiLoad##_fun_suffix(src_1)); \ + return ret; \ + } \ + }; \ + template <> \ + struct ParamElemVisitorDupV2<_ctype> { \ + _simd_type_v2 operator()(const _ctype* src) const { \ + _simd_type_v2 ret; \ + _simd_type tmp = GiBroadcast##_fun_suffix( \ + *reinterpret_cast(src)); \ + GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ + GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ + return ret; \ + } \ + } +cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); +cb(dt_qint8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t); + +cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t); +cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); +cb(dt_int8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t); +#undef cb + +template +struct ParamElemVisitorBcast101x4V2; +#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, rel_suffix, _simd_type_v2) \ + template <> \ + struct ParamElemVisitorBcast101x4V2<_ctype> { \ + _simd_type_v2 operator()(const _ctype* src) const { \ + _simd_type_v2 ret; \ + _simd_type tmp = \ + GiReinter##rel_suffix##To##_fun_suffix(GiBroadcast##rel_suffix( \ + *reinterpret_cast(src))); \ + GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ + GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ + return ret; \ + } \ + } + +cb(dt_qint8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t); +cb(dt_int8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t); +#undef cb +#define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \ + template <> \ + struct ParamElemVisitorBcast101x4V2<_ctype> { \ + _simd_type_v2 operator()(const _ctype* src) const { \ + _simd_type_v2 ret; \ + _simd_type tmp = GiLoad##_fun_suffix(src); \ + GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ + GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ + return ret; \ + } \ + } + +cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); +cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t); +cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); +#undef cb + ///////////////////////////////// OpCaller ///////////////////////////// template struct OpCallerUnary; @@ -106,10 +182,10 @@ struct OpCallerUnary { const typename Op::src_ctype* src, typename Op::dst_ctype* dst, DType src_dtype, DType dst_dtype, size_t nr_elems) { Op op(src_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis(src), vis(src + Op::SIMD_WIDTH)}}, dst); + op(vis(src, src + Op::SIMD_WIDTH), dst); src += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -364,12 +440,12 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitor vis1; + ParamElemVisitorV2 vis0; + ParamElemVisitorV2 vis1; size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), + dst); src0 += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -394,17 +470,16 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitorDup vis1; + ParamElemVisitorV2 vis0; + ParamElemVisitorDupV2 vis1; for (size_t b = 0; b < batch; b++) { const typename Op::src_ctype* src1_ptr = src1; for (size_t c = 0; c < channel; c++) { size_t i = 0; - auto src1_simd = vis1(src1_ptr); + auto src1_simd_v2 = vis1(src1_ptr); for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{src1_simd, src1_simd}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2, dst); src0 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -430,7 +505,7 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; for (size_t b = 0; b < batch; b++) { const typename Op::src_ctype* src1_ptr_base = src1 + b * channel_stride; for (size_t c = 0; c < channel; c++) { @@ -438,11 +513,9 @@ struct OpCallerBinary { auto src1_ptr = src1_ptr_base; for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - auto src0_simd0 = vis(src0); - auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH); - auto src1_simd0 = vis(src1_ptr); - auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH); - op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); + auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH); + auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH); + op(src0_simd01, src1_simd01, dst); src0 += Op::SIMD_WIDTH * 2; src1_ptr += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -469,19 +542,17 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; for (size_t b = 0; b < batch; b++) { for (size_t c = 0; c < channel; c++) { size_t rest = channel_stride; const typename Op::src_ctype* src1_ptr = src1; while (rest >= Op::SIMD_WIDTH * 2) { - auto src0_simd0 = vis(src0); - auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH); - auto src1_simd0 = vis(src1_ptr); - auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH); + auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH); + auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH); src0 += Op::SIMD_WIDTH * 2; src1_ptr += Op::SIMD_WIDTH * 2; - op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); + op(src0_simd01, src1_simd01, dst); dst += Op::SIMD_WIDTH * 2; rest -= Op::SIMD_WIDTH * 2; } @@ -508,19 +579,17 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; for (size_t b = 0; b < batch; b++) { for (size_t c = 0; c < channel; c++) { size_t rest = channel_stride; const typename Op::src_ctype* src0_ptr = src0; while (rest >= Op::SIMD_WIDTH * 2) { - auto src0_simd0 = vis(src0_ptr); - auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); - auto src1_simd0 = vis(src1); - auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); + auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); + auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); src0_ptr += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; - op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); + op(src0_simd01, src1_simd01, dst); dst += Op::SIMD_WIDTH * 2; rest -= Op::SIMD_WIDTH * 2; } @@ -599,13 +668,12 @@ struct OpCallerBinaryBcast101xDVec { auto src0_ptr = src0; for (size_t cb = 0; cb < nr_channel_blocks; cb++) { auto src0_block_ptr = src0_ptr + cb * channel_block_dim; - auto channel_block_vec = vis0(src0_block_ptr); + auto channel_block_vec_v2 = vis0(src0_block_ptr); size_t img_index = 0; auto src1_offset = Op::SIMD_WIDTH / channel_block_dim; for (; img_index + 2 * src1_offset <= channel_stride; img_index += 2 * src1_offset) { - op({{channel_block_vec, channel_block_vec}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); + op(channel_block_vec_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -629,8 +697,8 @@ struct OpCallerBinaryBcast101xXVec { const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst, const Op& op, size_t batch, size_t nr_channel_blocks, size_t channel_stride) { - ParamElemVisitorBcast101x4 vis0; - ParamElemVisitor vis1; + ParamElemVisitorBcast101x4V2 vis0; + ParamElemVisitorV2 vis1; OpCallerBinaryBcast101xDVec::run( src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks, channel_stride); @@ -717,13 +785,12 @@ struct OpCallerBinaryVecBcast101xD { auto src1_ptr = src1; for (size_t cb = 0; cb < nr_channel_blocks; cb++) { auto src1_block_ptr = src1_ptr + cb * channel_block_dim; - auto channel_block_vec = vis1(src1_block_ptr); + auto channel_block_vec_v2 = vis1(src1_block_ptr); size_t img_index = 0; auto src0_offset = Op::SIMD_WIDTH / channel_block_dim; for (; img_index + 2 * src0_offset <= channel_stride; img_index += 2 * src0_offset) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{channel_block_vec, channel_block_vec}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2, dst); src0 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -747,8 +814,8 @@ struct OpCallerBinaryVecBcast101xX { const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst, const Op& op, size_t batch, size_t nr_channel_blocks, size_t channel_stride) { - ParamElemVisitor vis0; - ParamElemVisitorBcast101x4 vis1; + ParamElemVisitorV2 vis0; + ParamElemVisitorBcast101x4V2 vis1; OpCallerBinaryVecBcast101xD::run( src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks, channel_stride); @@ -783,13 +850,12 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitorDup vis1; - auto vis1_simd = vis1(&src1); + ParamElemVisitorV2 vis0; + ParamElemVisitorDupV2 vis1; + auto vis1_simd_v2 = vis1(&src1); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, - dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, dst); src0 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -813,13 +879,12 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitorDup vis0; - ParamElemVisitor vis1; - auto vis0_simd = vis0(&src0); + ParamElemVisitorDupV2 vis0; + ParamElemVisitorV2 vis1; + auto vis0_simd_v2 = vis0(&src0); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0_simd, vis0_simd}}, {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, - dst); + op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -842,17 +907,16 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitorDup vis0; - ParamElemVisitor vis1; + ParamElemVisitorDupV2 vis0; + ParamElemVisitorV2 vis1; for (size_t b = 0; b < batch; b++) { auto src0_ptr = src0; for (size_t c = 0; c < channel; c++) { - auto vis0_simd = vis0(src0_ptr); + auto vis0_simd_v2 = vis0(src0_ptr); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - op({{vis0_simd, vis0_simd}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); + op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -878,7 +942,7 @@ struct OpCallerBinary { typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { Op op(src0_dtype, src1_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; for (size_t b = 0; b < batch; b++) { auto src0_ptr_base = src0 + b * channel_stride; for (size_t c = 0; c < channel; c++) { @@ -886,11 +950,9 @@ struct OpCallerBinary { size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - auto src0_simd0 = vis(src0_ptr); - auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); - auto src1_simd0 = vis(src1); - auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); - op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); + auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); + auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); + op(src0_simd01, src1_simd01, dst); src0_ptr += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -921,14 +983,13 @@ struct OpCallerTernary { DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitor vis1; - ParamElemVisitor vis2; + ParamElemVisitorV2 vis0; + ParamElemVisitorV2 vis1; + ParamElemVisitorV2 vis2; size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, - {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), + vis2(src2, src2 + Op::SIMD_WIDTH), dst); src0 += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; src2 += Op::SIMD_WIDTH * 2; @@ -957,15 +1018,14 @@ struct OpCallerTernary { DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitor vis1; - ParamElemVisitorDup vis2; - auto vis2_simd = vis2(&src2); + ParamElemVisitorV2 vis0; + ParamElemVisitorV2 vis1; + ParamElemVisitorDupV2 vis2; + auto vis2_simd_v2 = vis2(&src2); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, {{vis2_simd, vis2_simd}}, - dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), + vis2_simd_v2, dst); src0 += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -993,22 +1053,21 @@ struct OpCallerTernary { size_t batch_size, size_t channel_size, size_t channel_stride, size_t batch_offset) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis1; - ParamElemVisitorDup vis0; - ParamElemVisitorDup vis2; + ParamElemVisitorV2 vis1; + ParamElemVisitorDupV2 vis0; + ParamElemVisitorDupV2 vis2; for (size_t batch = 0; batch < batch_size; batch++) { auto src0_ptr = src0; auto src2_ptr = src2; auto b_offset = batch_offset; for (size_t channel = 0; channel < channel_size; channel++) { size_t i = 0; - auto src0_simd = vis0(src0_ptr); - auto src2_simd = vis2(src2_ptr); + auto src0_simd_v2 = vis0(src0_ptr); + auto src2_simd_v2 = vis2(src2_ptr); for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - op({{src0_simd, src0_simd}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, - {{src2_simd, src2_simd}}, dst); + op(src0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), src2_simd_v2, + dst); src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; b_offset -= Op::SIMD_WIDTH * 2; @@ -1042,7 +1101,7 @@ struct OpCallerTernary { DType src2_dtype, DType dst_dtype, size_t batch_size, size_t channel_size, size_t channel_stride, size_t batch_offset) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis; + ParamElemVisitorV2 vis; for (size_t batch = 0; batch < batch_size; batch++) { auto b_offset = batch_offset; for (size_t channel = 0; channel < channel_size; channel++) { @@ -1051,14 +1110,10 @@ struct OpCallerTernary { size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - auto src0_simd0 = vis(src0_ptr); - auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); - auto src1_simd0 = vis(src1); - auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); - auto src2_simd0 = vis(src2_ptr); - auto src2_simd1 = vis(src2_ptr + Op::SIMD_WIDTH); - op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, - {{src2_simd0, src2_simd1}}, dst); + auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); + auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); + auto src2_simd01 = vis(src2_ptr, src2_ptr + Op::SIMD_WIDTH); + op(src0_simd01, src1_simd01, src2_simd01, dst); src0_ptr += Op::SIMD_WIDTH * 2; src1 += Op::SIMD_WIDTH * 2; src2_ptr += Op::SIMD_WIDTH * 2; @@ -1125,15 +1180,14 @@ struct OpCallerTernaryBcast101xDVecBcast101xD { for (size_t cb = 0; cb < nr_channel_blocks; cb++) { auto src0_block_ptr = src0_ptr + cb * channel_block_dim; auto src2_block_ptr = src2_ptr + cb * channel_block_dim; - auto channel_block_vec0 = vis0(src0_block_ptr); - auto channel_block_vec2 = vis2(src2_block_ptr); + auto channel_block_vec0_v2 = vis0(src0_block_ptr); + auto channel_block_vec2_v2 = vis2(src2_block_ptr); size_t img_index = 0; auto src1_offset = Op::SIMD_WIDTH / channel_block_dim; for (; img_index + 2 * src1_offset <= channel_stride; img_index += 2 * src1_offset) { - op({{channel_block_vec0, channel_block_vec0}}, - {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, - {{channel_block_vec2, channel_block_vec2}}, dst); + op(channel_block_vec0_v2, vis1(src1, src1 + Op::SIMD_WIDTH), + channel_block_vec2_v2, dst); src1 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } @@ -1159,9 +1213,9 @@ struct OpCallerTernaryBcast101xXVecBcast101xX { const src_ctype* src0, const src_ctype* src1, const src_ctype* src2, typename Op::dst_ctype* dst, const Op& op, size_t batch, size_t nr_channel_blocks, size_t channel_stride) { - ParamElemVisitorBcast101x4 vis0; - ParamElemVisitor vis1; - ParamElemVisitorBcast101x4 vis2; + ParamElemVisitorBcast101x4V2 vis0; + ParamElemVisitorV2 vis1; + ParamElemVisitorBcast101x4V2 vis2; OpCallerTernaryBcast101xDVecBcast101xD::run( src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks, channel_stride); @@ -1201,19 +1255,18 @@ struct OpCallerTernary { DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t batch_size, size_t channel_size, size_t channel_stride) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitorDup vis1; - ParamElemVisitor vis2; + ParamElemVisitorV2 vis0; + ParamElemVisitorDupV2 vis1; + ParamElemVisitorV2 vis2; for (size_t batch = 0; batch < batch_size; batch++) { auto src1_ptr = src1; for (size_t channel = 0; channel < channel_size; channel++) { size_t i = 0; - auto src1_simd = vis1(src1_ptr); + auto src1_simd_v2 = vis1(src1_ptr); for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{src1_simd, src1_simd}}, - {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2, + vis2(src2, src2 + Op::SIMD_WIDTH), dst); src0 += Op::SIMD_WIDTH * 2; src2 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -1244,18 +1297,18 @@ struct OpCallerTernary { DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t batch_size, size_t channel_size, size_t channel_stride) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitor vis1; - ParamElemVisitor vis2; + ParamElemVisitorV2 vis0; + ParamElemVisitorV2 vis1; + ParamElemVisitorV2 vis2; for (size_t batch = 0; batch < batch_size; batch++) { for (size_t channel = 0; channel < channel_size; channel++) { auto src1_ptr = src1; size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{vis1(src1_ptr), vis1(src1_ptr + Op::SIMD_WIDTH)}}, - {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), + vis1(src1_ptr, src1_ptr + Op::SIMD_WIDTH), + vis2(src2, src2 + Op::SIMD_WIDTH), dst); src0 += Op::SIMD_WIDTH * 2; src1_ptr += Op::SIMD_WIDTH * 2; src2 += Op::SIMD_WIDTH * 2; @@ -1316,14 +1369,13 @@ struct OpCallerTernaryVecBcast101xDVec { auto src1_ptr = src1; for (size_t cb = 0; cb < nr_channel_blocks; cb++) { auto src1_block_ptr = src1_ptr + cb * channel_block_dim; - auto channel_block_vec = vis1(src1_block_ptr); + auto channel_block_vec_v2 = vis1(src1_block_ptr); size_t img_index = 0; auto offset = Op::SIMD_WIDTH / channel_block_dim; for (; img_index + 2 * offset <= channel_stride; img_index += 2 * offset) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, - {{channel_block_vec, channel_block_vec}}, - {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2, + vis2(src2, src2 + Op::SIMD_WIDTH), dst); src0 += Op::SIMD_WIDTH * 2; src2 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -1349,9 +1401,9 @@ struct OpCallerTernaryVecBcast101xXVec { const src_ctype* src0, const src_ctype* src1, const src_ctype* src2, typename Op::dst_ctype* dst, const Op& op, size_t batch, size_t nr_channel_blocks, size_t channel_stride) { - ParamElemVisitor vis0; - ParamElemVisitorBcast101x4 vis1; - ParamElemVisitor vis2; + ParamElemVisitorV2 vis0; + ParamElemVisitorBcast101x4V2 vis1; + ParamElemVisitorV2 vis2; OpCallerTernaryVecBcast101xDVec::run( src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks, channel_stride); @@ -1392,14 +1444,14 @@ struct OpCallerTernary { DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitorDup vis1; - ParamElemVisitor vis2; - auto vis1_simd = vis1(&src1); + ParamElemVisitorV2 vis0; + ParamElemVisitorDupV2 vis1; + ParamElemVisitorV2 vis2; + auto vis1_simd_v2 = vis1(&src1); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, - {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, + vis2(src2, src2 + Op::SIMD_WIDTH), dst); src0 += Op::SIMD_WIDTH * 2; src2 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; @@ -1426,15 +1478,14 @@ struct OpCallerTernary { DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t nr_elems) { Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); - ParamElemVisitor vis0; - ParamElemVisitorDup vis1; - ParamElemVisitorDup vis2; - auto vis1_simd = vis1(&src1); - auto vis2_simd = vis2(&src2); + ParamElemVisitorV2 vis0; + ParamElemVisitorDupV2 vis1; + ParamElemVisitorDupV2 vis2; + auto vis1_simd_v2 = vis1(&src1); + auto vis2_simd_v2 = vis2(&src2); size_t i = 0; for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { - op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, - {{vis2_simd, vis2_simd}}, dst); + op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, vis2_simd_v2, dst); src0 += Op::SIMD_WIDTH * 2; dst += Op::SIMD_WIDTH * 2; } diff --git a/dnn/src/fallback/gi_intrinsic_helper.h b/dnn/src/fallback/gi_intrinsic_helper.h index c060d48c..793ec4a8 100644 --- a/dnn/src/fallback/gi_intrinsic_helper.h +++ b/dnn/src/fallback/gi_intrinsic_helper.h @@ -11,8 +11,9 @@ struct LoadHelper { static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); }; -#define WEIGHT_CB(step) \ - src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); +#define WEIGHT_CB(step) \ + src[step] = GiFloat32Type2FixLenType( \ + Func::impl(ptr + base_offset + step * ptr_step, args...)); #define LOAD_HELPER(step) \ template < \ diff --git a/dnn/src/fallback/quantized_converter.h b/dnn/src/fallback/quantized_converter.h index 8bbe37a2..54b8cede 100644 --- a/dnn/src/fallback/quantized_converter.h +++ b/dnn/src/fallback/quantized_converter.h @@ -38,7 +38,13 @@ template <> inline GI_FLOAT32_V2_t QConverter::convert(const GI_INT16_t& vsrc) { GI_INT32_t vhi = GiMoveHighLongInt16(vsrc); GI_INT32_t vlo = GiMoveLowLongInt16(vsrc); - return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}}; + GI_FLOAT32_t fhi = GiCastToFloat32(vhi); + GI_FLOAT32_t flo = GiCastToFloat32(vlo); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, flo); + GiSetSubVectorFloat32V2(ret, 1, fhi); + + return ret; } template <> diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h index e98ddec8..604cd3d8 100644 --- a/dnn/src/fallback/reduce/reducer.h +++ b/dnn/src/fallback/reduce/reducer.h @@ -36,14 +36,14 @@ struct MeanReducer { using ctype = int8_t; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); - GI_INT32_t res[4]; + GI_INT32_FIXLEN_t res[4]; int32_t remain; int32_t cnt; float coef; - GI_FLOAT32_t vcoef; + GI_FLOAT32_FIXLEN_t vcoef; MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { memset(res, 0, sizeof(res)); - vcoef = GiBroadcastFloat32(coef); + vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef)); } MeanReducer() = default; void feed(const int8_t* val) { @@ -56,19 +56,27 @@ struct MeanReducer { const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); - res[0] = GiAddInt32(res[0], vval_low_low); - res[1] = GiAddInt32(res[1], vval_low_high); - res[2] = GiAddInt32(res[2], vval_high_low); - res[3] = GiAddInt32(res[3], vval_high_high); + res[0] = GiInt32Type2FixLenType( + GiAddInt32(GiFixLenType2GiInt32Type(res[0]), vval_low_low)); + res[1] = GiInt32Type2FixLenType( + GiAddInt32(GiFixLenType2GiInt32Type(res[1]), vval_low_high)); + res[2] = GiInt32Type2FixLenType( + GiAddInt32(GiFixLenType2GiInt32Type(res[2]), vval_high_low)); + res[3] = GiInt32Type2FixLenType( + GiAddInt32(GiFixLenType2GiInt32Type(res[3]), vval_high_high)); } void feed_remain(const int8_t* val) { remain += *val; } void post(int8_t* dst) { for (int i = 0; i < 4; i += 2) { - GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); - GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); - GiStoreLowInt8( - dst, (QConverter::convert( - {{vitem0, vitem1}}))); + auto tmp = GiFixLenType2GiFloat32Type(vcoef); + GI_FLOAT32_t vitem0 = GiMultiplyFloat32( + GiCastToFloat32(GiFixLenType2GiInt32Type(res[i])), tmp); + GI_FLOAT32_t vitem1 = GiMultiplyFloat32( + GiCastToFloat32(GiFixLenType2GiInt32Type(res[i + 1])), tmp); + GI_FLOAT32_V2_t ret; + GiSetSubVectorFloat32V2(ret, 0, vitem0); + GiSetSubVectorFloat32V2(ret, 1, vitem1); + GiStoreLowInt8(dst, (QConverter::convert(ret))); dst += 8; } } @@ -83,17 +91,20 @@ struct MeanReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32_t res; + GI_FLOAT32_FIXLEN_t res; float result; float coef; MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { - res = GiBroadcastFloat32(0.0f); + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); } MeanReducer() = default; - void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed(const float* val) { + res = GiFloat32Type2FixLenType( + GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res))); + } void feed_remain(const float* val) { result += *val; } void post(float* dst) { - result += GiReduceAddFloat32(res); + result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res)); *dst = result * coef; } }; @@ -103,18 +114,22 @@ struct MeanReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32_t res; + GI_FLOAT32_FIXLEN_t res; float remain; float coef; MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { - res = GiBroadcastFloat32(0.0f); + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); } MeanReducer() = default; - void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } + void feed(const float* val) { + res = GiFloat32Type2FixLenType( + GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res))); + } void feed_remain(const float* val) { remain += *val; } void post(float* dst) { - res = GiMultiplyScalerFloat32(res, coef); - GiStoreFloat32(dst, res); + res = GiFloat32Type2FixLenType( + GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(res), coef)); + GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); } void post_remain(float* dst) { *dst = remain * coef; } }; @@ -125,23 +140,29 @@ struct maxReducer; template struct minReducer; -#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = float; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32_t res; \ - _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ - _mode##Reducer() = default; \ - void feed(const float* val) { \ - auto vval = GiLoadFloat32(val); \ - res = Gi##_Mode##NanFloat32(res, vval); \ - } \ - void feed_remain(const float* val) { \ - auto vval = GiBroadcastFloat32(*val); \ - res = Gi##_Mode##NanFloat32(vval, res); \ - } \ - void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32_FIXLEN_t res; \ + _mode##Reducer(DType, size_t) { \ + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + auto vval = GiLoadFloat32(val); \ + res = GiFloat32Type2FixLenType( \ + Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \ + } \ + void feed_remain(const float* val) { \ + auto vval = GiBroadcastFloat32(*val); \ + res = GiFloat32Type2FixLenType( \ + Gi##_Mode##NanFloat32(vval, GiFixLenType2GiFloat32Type(res))); \ + } \ + void post(float* dst) { \ + *dst = GiReduce##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res)); \ + } \ } REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits::lowest()); @@ -151,28 +172,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); #define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); #define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); -#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = float; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32_t res; \ - float remain; \ - _mode##Reducer(DType, size_t) { \ - res = GiBroadcastFloat32(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const float* val) { \ - GI_FLOAT32_t vval = GiLoadFloat32(val); \ - res = Gi##_Mode##NanFloat32(res, vval); \ - } \ - void feed_remain(const float* val) { \ - using namespace std; \ - remain = _Mode##_NAN(*val, remain); \ - } \ - void post(float* dst) { GiStoreFloat32(dst, res); } \ - void post_remain(float* dst) { *dst = remain; } \ +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32_FIXLEN_t res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ + res = GiFloat32Type2FixLenType( \ + Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + remain = _Mode##_NAN(*val, remain); \ + } \ + void post(float* dst) { \ + GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ + } \ + void post_remain(float* dst) { *dst = remain; } \ } REDUCER_MAX_MIN_C(max, Max, std::numeric_limits::lowest()); @@ -181,51 +205,58 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits::max()); #undef Max_NAN #undef Min_NAN -#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = int8_t; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ - GI_INT8_t res; \ - _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ - _mode##Reducer() = default; \ - void feed(const int8_t* val) { \ - GI_INT8_t vval = GiLoadInt8(val); \ - res = Gi##_Mode##imumInt8(vval, res); \ - } \ - void feed_remain(const int8_t* val) { \ - GI_INT8_t vval = GiBroadcastInt8(*val); \ - res = Gi##_Mode##imumInt8(res, vval); \ - } \ - void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ +#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8_FIXLEN_t res; \ + _mode##Reducer(DType, size_t) { \ + res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \ + } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8_t vval = GiLoadInt8(val); \ + res = GiInt8Type2FixLenType( \ + Gi##_Mode##imumInt8(vval, GiFixLenType2GiInt8Type(res))); \ + } \ + void feed_remain(const int8_t* val) { \ + GI_INT8_t vval = GiBroadcastInt8(*val); \ + res = GiInt8Type2FixLenType( \ + Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \ + } \ + void post(int8_t* dst) { \ + *dst = GiReduce##_Mode##Int8(GiFixLenType2GiInt8Type(res)); \ + } \ } REDUCER_MAX_MIN_C1(max, Max, -128); REDUCER_MAX_MIN_C1(min, Min, 127); #undef REDUCER_MAX_MIN_C1 -#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = int8_t; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ - GI_INT8_t res; \ - int8_t remain; \ - _mode##Reducer(DType, size_t) { \ - res = GiBroadcastInt8(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const int8_t* val) { \ - GI_INT8_t vval = GiLoadInt8(val); \ - res = Gi##_Mode##imumInt8(res, vval); \ - } \ - void feed_remain(const int8_t* val) { \ - using namespace std; \ - remain = _mode(*val, remain); \ - } \ - void post(int8_t* dst) { GiStoreInt8(dst, res); } \ - void post_remain(int8_t* dst) { *dst = remain; } \ +#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = int8_t; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ + GI_INT8_FIXLEN_t res; \ + int8_t remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const int8_t* val) { \ + GI_INT8_t vval = GiLoadInt8(val); \ + res = GiInt8Type2FixLenType( \ + Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \ + } \ + void feed_remain(const int8_t* val) { \ + using namespace std; \ + remain = _mode(*val, remain); \ + } \ + void post(int8_t* dst) { GiStoreInt8(dst, GiFixLenType2GiInt8Type(res)); } \ + void post_remain(int8_t* dst) { *dst = remain; } \ } REDUCER_MAX_MIN_C(max, Max, -128); @@ -238,61 +269,67 @@ struct SumReducer; template struct ProductReducer; -#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = float; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32_t res; \ - float remain; \ - _mode##Reducer(DType, size_t) { \ - res = GiBroadcastFloat32(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const float* val) { \ - GI_FLOAT32_t vval = GiLoadFloat32(val); \ - res = Gi##_Mode##Float32(vval, res); \ - } \ - void feed_remain(const float* val) { \ - using namespace std; \ - auto op = _op(); \ - remain = op(remain, *val); \ - } \ - void post(float* dst) { \ - using namespace std; \ - auto op = _op(); \ - *dst = op(remain, GiReduce##_Mode##Float32(res)); \ - } \ +#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32_FIXLEN_t res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ + res = GiFloat32Type2FixLenType( \ + Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, *val); \ + } \ + void post(float* dst) { \ + using namespace std; \ + auto op = _op(); \ + *dst = \ + op(remain, \ + GiReduce##_Mode##Float32(GiFixLenType2GiFloat32Type(res))); \ + } \ } REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f); REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); #undef REDUCER_SUM_PRODUCT_C1 -#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ - template <> \ - struct _mode##Reducer { \ - using ctype = float; \ - static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32_t res; \ - float remain; \ - _mode##Reducer(DType, size_t) { \ - res = GiBroadcastFloat32(_init); \ - remain = _init; \ - } \ - _mode##Reducer() = default; \ - void feed(const float* val) { \ - GI_FLOAT32_t vval = GiLoadFloat32(val); \ - res = Gi##_Mode##Float32(vval, res); \ - } \ - void feed_remain(const float* val) { \ - using namespace std; \ - auto op = _op(); \ - remain = op(remain, (*val)); \ - } \ - void post(float* dst) { GiStoreFloat32(dst, res); } \ - void post_remain(float* dst) { *dst = remain; } \ +#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ + template <> \ + struct _mode##Reducer { \ + using ctype = float; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + GI_FLOAT32_FIXLEN_t res; \ + float remain; \ + _mode##Reducer(DType, size_t) { \ + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ + remain = _init; \ + } \ + _mode##Reducer() = default; \ + void feed(const float* val) { \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ + res = GiFloat32Type2FixLenType( \ + Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \ + } \ + void feed_remain(const float* val) { \ + using namespace std; \ + auto op = _op(); \ + remain = op(remain, (*val)); \ + } \ + void post(float* dst) { \ + GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ + } \ + void post_remain(float* dst) { *dst = remain; } \ } REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f); @@ -308,23 +345,24 @@ struct SumSqrReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32_t res; + GI_FLOAT32_FIXLEN_t res; float result; SumSqrReducer(DType, size_t cnt) : result(0.0f) { MEGDNN_MARK_USED_VAR(cnt); - res = GiBroadcastFloat32(0.0f); + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); } SumSqrReducer() = default; void feed(const float* val) { GI_FLOAT32_t vval = GiLoadFloat32(val); - res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + res = GiFloat32Type2FixLenType(GiAddFloat32( + GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res))); } void feed_remain(const float* val) { float vval = *val; result += vval * vval; } void post(float* dst) { - result += GiReduceAddFloat32(res); + result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res)); *dst = result; } }; @@ -333,19 +371,20 @@ struct SumSqrReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32_t res; + GI_FLOAT32_FIXLEN_t res; float remain; SumSqrReducer(DType, size_t cnt) : remain(0.0f) { MEGDNN_MARK_USED_VAR(cnt); - res = GiBroadcastFloat32(0.0f); + res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); } SumSqrReducer() = default; void feed(const float* val) { GI_FLOAT32_t vval = GiLoadFloat32(val); - res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); + res = GiFloat32Type2FixLenType(GiAddFloat32( + GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res))); } void feed_remain(const float* val) { remain += (*val) * (*val); } - void post(float* dst) { GiStoreFloat32(dst, res); } + void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); } void post_remain(float* dst) { *dst = remain; } }; /**************************************do reduce*************************/ diff --git a/dnn/src/fallback/type_cvt/typecvt_helper.h b/dnn/src/fallback/type_cvt/typecvt_helper.h index a9d13c29..1cd69ba9 100644 --- a/dnn/src/fallback/type_cvt/typecvt_helper.h +++ b/dnn/src/fallback/type_cvt/typecvt_helper.h @@ -18,22 +18,26 @@ struct QuantizedTypeCvter { static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t) * 2; static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(int32_t); float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { float src_scale = src_dtype.param().scale; float dst_scale = dst_dtype.param().scale; scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } void cvt(const int32_t* src, int8_t* dst) { - GI_FLOAT32_t vitem0 = - GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale); - GI_FLOAT32_t vitem1 = GiMultiplyFloat32( - GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), vscale); - - auto vres = QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); + GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t); + GI_FLOAT32_t vitem1 = + GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), t); + + GI_FLOAT32_V2_t v2; + GiSetSubVectorFloat32V2(v2, 0, vitem0); + GiSetSubVectorFloat32V2(v2, 1, vitem1); + auto vres = QConverter::convert(v2); GiStoreLowInt8(dst, vres); } @@ -48,27 +52,29 @@ struct QuantizedTypeCvter { using dst_type = int32_t; static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { float src_scale = src_dtype.param().scale; float dst_scale = dst_dtype.param().scale; scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } void cvt(const int8_t* src, int32_t* dst) { + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); GI_INT8_t data = GiLoadInt8(src); GI_INT16_t vitem0 = GiMoveLowLongInt8(data); GI_INT16_t vitem1 = GiMoveHighLongInt8(data); auto vret0 = QConverter::round( - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale)); - auto vret1 = QConverter::round(GiMultiplyFloat32( - GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale)); + GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t)); + auto vret1 = QConverter::round( + GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t)); auto vret2 = QConverter::round( - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale)); - auto vret3 = QConverter::round(GiMultiplyFloat32( - GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale)); + GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t)); + auto vret3 = QConverter::round( + GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t)); constexpr size_t step = GI_SIMD_LEN_BYTE / sizeof(int32_t); GiStoreInt32(dst, vret0); @@ -90,21 +96,26 @@ struct QuantizedTypeCvter { static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float) * 2; static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float); float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { MEGDNN_MARK_USED_VAR(src_dtype); float src_scale = 1; float dst_scale = dst_dtype.param().scale; scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } void cvt(const float* src, int8_t* dst) { - GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), vscale); - GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), vscale); - - auto vres = QConverter::convert({{vitem0, vitem1}}); + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); + GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), t); + GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), t); + + GI_FLOAT32_V2_t v2; + GiSetSubVectorFloat32V2(v2, 0, vitem0); + GiSetSubVectorFloat32V2(v2, 1, vitem1); + auto vres = QConverter::convert(v2); GiStoreLowInt8(dst, vres); } @@ -119,18 +130,19 @@ struct QuantizedTypeCvter { using dst_type = int32_t; static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t); float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { float src_scale = src_dtype.param().scale; float dst_scale = dst_dtype.param().scale; scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } void cvt(const int32_t* src, int32_t* dst) { - GI_FLOAT32_t vitem = - GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale); + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); + GI_FLOAT32_t vitem = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t); auto vres = QConverter::round(vitem); GiStoreInt32(dst, vres); @@ -148,30 +160,32 @@ struct QuantizedTypeCvter { using dst_type = int8_t; static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); float scale; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { float src_scale = src_dtype.param().scale; float dst_scale = dst_dtype.param().scale; scale = src_scale / dst_scale; - vscale = GiBroadcastFloat32(scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); } void cvt(const int8_t* src, int8_t* dst) { + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); GI_INT8_t data = GiLoadInt8(src); GI_INT16_t vitem0 = GiMoveLowLongInt8(data); GI_INT16_t vitem1 = GiMoveHighLongInt8(data); - auto vret0 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale); - auto vret1 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale); - auto vret2 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale); - auto vret3 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale); - - auto vres = QConverter::convert( - {{vret0, vret1, vret2, vret3}}); + auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t); + auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t); + auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t); + auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t); + + GI_FLOAT32_V4_t v4; + GiSetSubVectorFloat32V4(v4, 0, vret0); + GiSetSubVectorFloat32V4(v4, 1, vret1); + GiSetSubVectorFloat32V4(v4, 2, vret2); + GiSetSubVectorFloat32V4(v4, 3, vret3); + auto vres = QConverter::convert(v4); GiStoreInt8(dst, vres); } @@ -245,26 +259,24 @@ struct Quan2FloatTypeCvter { static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float); float _scale = 0.0f; - GI_FLOAT32_t vscale; + GI_FLOAT32_FIXLEN_t vscale; Quan2FloatTypeCvter(DType src_dtype, DType dst_dtype) { _scale = src_dtype.param().scale; - vscale = GiBroadcastFloat32(_scale); + vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(_scale)); MEGDNN_MARK_USED_VAR(dst_dtype); } void cvt(const int8_t* src, float* dst) { + GI_FLOAT32_t t; + t = GiFixLenType2GiFloat32Type(vscale); GI_INT8_t data = GiLoadInt8(src); GI_INT16_t vitem0 = GiMoveLowLongInt8(data); GI_INT16_t vitem1 = GiMoveHighLongInt8(data); - auto vret0 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale); - auto vret1 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale); - auto vret2 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale); - auto vret3 = - GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale); + auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t); + auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t); + auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t); + auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t); GiStoreFloat32(dst, vret0); GiStoreFloat32(dst + SIMD_STEP, vret1); -- GitLab