提交 8546c15d 编写于 作者: M Megvii Engine Team

feat(gi): make elemwise apply gi class type

GitOrigin-RevId: 6ff1a8a55ce5e01a93b4c619833dcd70ebe2f735
上级 74fb63db
......@@ -12,7 +12,7 @@ using BcastType = megdnn::elemwise::BcastType;
///////////////////////////////// ParamElemVisitor ///////////////////////////
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix) \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitor<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
......@@ -24,29 +24,61 @@ using BcastType = megdnn::elemwise::BcastType;
_neon_type operator()(const _ctype* src) const { \
return vdupq_n_##_fun_suffix(*reinterpret_cast<const _inner_ctype*>(src)); \
} \
}; \
template <> \
struct ParamElemVisitorV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src_1)); \
return ret; \
} \
}; \
template <> \
struct ParamElemVisitorDupV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = vdupq_n_##_fun_suffix( \
*reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb(dt_quint8, uint8_t, uint8x16_t, u8);
cb(dt_quint8, uint8_t, uint8x16_t, u8, uint8x16x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, __fp16, float16x8_t, f16);
cb(__fp16, __fp16, float16x8_t, f16, float16x8x2_t);
#endif
cb(dt_int16, int16_t, int16x8_t, s16);
cb(dt_int16, int16_t, int16x8_t, s16, int16x8x2_t);
#undef cb
template <typename ctype>
struct ParamElemVisitorBcast101x4;
// cb(...) generates the ParamElemVisitorBcast101x4<_ctype> specialisation and,
// in its six-parameter form, ParamElemVisitorBcast101x4V2<_ctype> as well.
//
// Both visitors read ONE _inner_ctype element from src with
// vld1q_dup_<rel_suffix> (load a single element and replicate it across every
// lane) and reinterpret the register as _neon_type through
// vreinterpretq_<_fun_suffix>_<rel_suffix>. In the invocations below
// _inner_ctype is a wider integer than _ctype (uint32_t for dt_quint8,
// int64_t for dt_int16, uint64_t for __fp16), so what is replicated is a short
// run of consecutive _ctype values -- presumably four; TODO confirm against
// the element sizes.
//
// The V2 visitor returns a _neon_type_v2 whose val[0] and val[1] both hold
// that same broadcast vector.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers.
// The five-parameter #define appears to be the pre-change macro and the
// six-parameter #define its replacement; the cb(...) invocations are paired
// the same way. Confirm against the repository before relying on either form.
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
}; \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb(dt_quint8, uint32_t, uint8x16_t, u8, u32);
cb(dt_int16, int64_t, int16x8_t, s16, s64);
cb(dt_quint8, uint32_t, uint8x16_t, u8, u32, uint8x16x2_t);
cb(dt_int16, int64_t, int16x8_t, s16, s64, int16x8x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, uint64_t, float16x8_t, f16, u64);
cb(__fp16, uint64_t, float16x8_t, f16, u64, float16x8x2_t);
#endif
#undef cb
......
......@@ -283,7 +283,7 @@ v4sf GiCosPsFloat32(v4sf x) {
// Returns the quotient ysin / ycos, where both operands are filled in by a
// single GiSinCosPsFloat32(x, &ysin, &ycos) call (presumably the sine and
// cosine of x, which makes the result the tangent -- TODO confirm).
//
// NOTE(review): the two consecutive return statements look like the removed
// and added lines of a rendered diff (operator/ replaced by GiDivFloat32).
// Read literally, only the first return would ever execute; confirm which
// form the repository actually contains.
v4sf GiTanPsFloat32(v4sf x) {
v4sf ysin, ycos;
GiSinCosPsFloat32(x, &ysin, &ycos);
return ysin / ycos;
return GiDivFloat32(ysin, ycos);
}
#undef c_exp_hi
......
......@@ -20,22 +20,28 @@ struct AbsOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct AbsOp;
// OP(...) generates the AbsOp<_ctype> specialisation working on _gi_type, a
// value that carries two sub-vectors.
//
//   _ctype       element type of the destination buffer
//   _gi_type     two-sub-vector type (GI_FLOAT32_V2_t / GI_INT32_V2_t below)
//   _func_suffix suffix pasted onto GiAbs / GiStore / GiGetSubVector...V2 /
//                GiSetSubVector...V2
//   _simd_width  value of SIMD_WIDTH; the invocations pass
//                GI_SIMD_LEN_BYTE / sizeof(element)
//
// The value-returning operator() applies GiAbs<_func_suffix> to sub-vector 0
// and sub-vector 1 separately and returns them as a new _gi_type. The storing
// overload calls it, then writes sub-vector 0 at dst and sub-vector 1 at
// dst + SIMD_WIDTH.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers.
// The first #define (direct `.val[i]` access, brace-initialised return)
// appears to be the pre-change macro; the second #define
// (GiGetSubVector...V2 / GiSetSubVector...V2 accessors) appears to be its
// replacement. Confirm against the repository.
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = GiAbs##_func_suffix(src.val[0]); \
auto vitem1 = GiAbs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_gi_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(dt_float32))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(dt_int32))
......@@ -64,11 +70,18 @@ struct AbsOp<dt_qint8, dt_qint8> : AbsOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK;
}
// Absolute value for quantised data held as a pair of int32 sub-vectors.
// Each sub-vector of vsrc is converted to float (GiCastToFloat32), multiplied
// by this->vscale, and passed through GiAbsFloat32; the two float results are
// then handed to QConverter::convert, which yields the GI_INT8_t return value.
//
// NOTE(review): vitem0 / vitem1 are declared twice and there are two return
// statements. This listing looks like a rendered diff without +/- markers:
// the `.val[i]` / `{{vitem0, vitem1}}` lines appear to be the removed form of
// the GiGetSubVectorInt32V2 / GiSetSubVectorFloat32V2 lines next to them.
// Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));
// The absolute value is taken on the scaled float values.
vitem0 = GiAbsFloat32(vitem0);
vitem1 = GiAbsFloat32(vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -33,13 +33,21 @@ struct AddOp;
void operator()( \
const _gi_type2& src0, const _gi_type2& src1, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type2 operator()(const _gi_type2& src0, const _gi_type2& src1) const { \
auto vitem0 = GiAdd##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiAdd##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_gi_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _gi_type& src0, const _gi_type& src1, dst_ctype* dst) const { \
......@@ -82,13 +90,24 @@ struct AddOp<dt_qint8, dt_qint8> : AddOpBase<dt_qint8, dt_qint8> {
// Quantised addition of two operands, each held as a pair of int32
// sub-vectors. For sub-vector index 0 and 1: vsrc0 is converted to float and
// multiplied by this->vscale0, vsrc1 is converted to float and multiplied by
// this->vscale1, and the two products are added with GiAddFloat32. The two
// float sums are then handed to QConverter::convert, which yields the
// GI_INT8_t return value.
//
// NOTE(review): each GiAddFloat32 call is followed by two argument lists and
// the function has two return statements. This listing looks like a rendered
// diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}` lines appear
// to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
......@@ -119,12 +138,24 @@ struct AddOp<dt_qint32, dt_qint8> : AddOpBase<dt_qint32, dt_qint8> {
// Addition for the dt_qint32 -> dt_qint8 specialisation (per the enclosing
// hunk header). For sub-vector index 0 and 1: vsrc0 is converted to float and
// multiplied by this->vscale0, vsrc1 is converted to float and multiplied by
// this->vscale1, and the products are added with GiAddFloat32. The two float
// sums are then handed to QConverter::convert, which yields the GI_INT8_t
// return value.
//
// NOTE(review): each GiAddFloat32 call is followed by two argument lists and
// the function has two return statements. This listing looks like a rendered
// diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}` lines appear
// to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
......
......@@ -23,22 +23,28 @@ struct ExpOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct ExpOp;
// OP(...) generates the ExpOp<_ctype> specialisation working on _simd_type, a
// value that carries two sub-vectors.
//
//   _ctype       element type of the destination buffer
//   _simd_type   two-sub-vector type (GI_FLOAT32_V2_t in the invocation below)
//   _func_suffix suffix pasted onto GiExpPs / GiStore / GiGetSubVector...V2 /
//                GiSetSubVector...V2
//   _simd_width  value of SIMD_WIDTH; the invocation passes
//                GI_SIMD_LEN_BYTE / sizeof(float)
//
// The value-returning operator() applies GiExpPs<_func_suffix> to sub-vector
// 0 and sub-vector 1 separately and returns them as a new _simd_type. The
// storing overload calls it, then writes sub-vector 0 at dst and sub-vector 1
// at dst + SIMD_WIDTH.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers.
// The first #define (direct `.val[i]` access, brace-initialised return)
// appears to be the pre-change macro; the second #define
// (GiGetSubVector...V2 / GiSetSubVector...V2 accessors) appears to be its
// replacement. Confirm against the repository.
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = GiExpPs##_func_suffix(src.val[0]); \
auto vitem1 = GiExpPs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP
......
......@@ -32,14 +32,15 @@ struct FastTanhOp;
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_27 = GiBroadcast##_func_suffix(27.f); \
auto val_9 = GiBroadcast##_func_suffix(9.f); \
auto valx = src.val[0]; \
auto valx1 = src.val[1]; \
auto valx = GiGetSubVector##_func_suffix##V2(src, 0); \
auto valx1 = GiGetSubVector##_func_suffix##V2(src, 1); \
auto valxp2 = GiMultiply##_fix_func_suffix(valx, valx); \
auto valx1p2 = GiMultiply##_fix_func_suffix(valx1, valx1); \
auto denominator = GiAdd##_fix_func_suffix(valxp2, val_27); \
......@@ -58,7 +59,10 @@ struct FastTanhOp;
r_denominator1); \
valx = GiMultiply##_fix_func_suffix(valx, r_denominator); \
valx1 = GiMultiply##_fix_func_suffix(valx1, r_denominator1); \
return {{valx, valx1}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, valx); \
GiSetSubVector##_func_suffix##V2(ret, 1, valx1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
......
......@@ -36,19 +36,23 @@ struct FuseAddHSwishOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -98,15 +102,28 @@ struct FuseAddHSwishOp<dt_qint32, dt_qint8> : FuseAddHSwishOpBase<dt_qint32, dt_
GI_FLOAT32_t vitem0, vitem1;
vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
......
......@@ -35,17 +35,21 @@ struct FuseAddReluOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
FUSE_ADD_RELU_SIMD_PACK2_FALLBACK(val1, val2, val3, val4, _func_suffix); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -105,15 +109,26 @@ struct FuseAddReluOp<dt_qint8, dt_qint8> : FuseAddReluOpBase<dt_qint8, dt_qint8>
// Fused add + ReLU for quantised operands, each held as a pair of int32
// sub-vectors. For sub-vector index 0 and 1: vsrc0 is converted to float and
// multiplied by this->vscale0, vsrc1 is converted to float and multiplied by
// this->vscale1, and the products are added with GiAddFloat32. Each sum is
// then clamped from below with GiMaximumFloat32(sum, this->vzero()), and the
// two results are handed to QConverter::convert, which yields the GI_INT8_t
// return value.
//
// NOTE(review): each GiAddFloat32 call is followed by two argument lists and
// the function has two return statements. This listing looks like a rendered
// diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}` lines appear
// to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// ReLU step: element-wise maximum with the vector returned by vzero().
vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
......@@ -144,15 +159,26 @@ struct FuseAddReluOp<dt_qint32, dt_qint8> : FuseAddReluOpBase<dt_qint32, dt_qint
// Fused add + ReLU for the dt_qint32 -> dt_qint8 specialisation (per the
// enclosing hunk header). For sub-vector index 0 and 1: vsrc0 is converted to
// float and multiplied by this->vscale0, vsrc1 is converted to float and
// multiplied by this->vscale1, and the products are added with GiAddFloat32.
// Each sum is then clamped from below with
// GiMaximumFloat32(sum, this->vzero()), and the two results are handed to
// QConverter::convert, which yields the GI_INT8_t return value.
//
// NOTE(review): each GiAddFloat32 call is followed by two argument lists and
// the function has two return statements. This listing looks like a rendered
// diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}` lines appear
// to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// ReLU step: element-wise maximum with the vector returned by vzero().
vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
......
......@@ -36,19 +36,23 @@ struct FuseAddSigmoidOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
val1 = GiSigmoidPs##_func_suffix(val1); \
val2 = GiSigmoidPs##_func_suffix(val2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
......
......@@ -35,14 +35,15 @@ struct FuseAddTanhOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
auto exp1 = GiExpPs##_func_suffix(val1); \
......@@ -65,7 +66,10 @@ struct FuseAddTanhOp;
GiRecpeS##_func_suffix(exp2, rexp2), rexp2); \
val1 = GiMultiply##_func_suffix(val1, rexp1); \
val2 = GiMultiply##_func_suffix(val2, rexp2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
......
......@@ -26,28 +26,36 @@ struct FuseMulAdd3OpBase : TernaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct FuseMulAdd3Op;
// OP(...) generates the FuseMulAdd3Op<_ctype> specialisation, a ternary
// operator working on _simd_type values that each carry two sub-vectors.
//
//   _ctype       element type of the destination buffer
//   _simd_type   two-sub-vector type (GI_FLOAT32_V2_t / GI_INT32_V2_t below)
//   _func_suffix suffix pasted onto GiMultiplyAdd / GiStore /
//                GiGetSubVector...V2 / GiSetSubVector...V2
//   _simd_width  value of SIMD_WIDTH; the invocations pass
//                GI_SIMD_LEN_BYTE / sizeof(element)
//
// The value-returning operator() calls
// GiMultiplyAdd<_func_suffix>(src2[i], src0[i], src1[i]) for sub-vector
// i = 0 and i = 1 -- presumably src2 + src0 * src1; TODO confirm the argument
// convention of GiMultiplyAdd -- and returns both results as a new
// _simd_type. The storing overload calls it, then writes sub-vector 0 at dst
// and sub-vector 1 at dst + SIMD_WIDTH.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers.
// The first #define (direct `.val[i]` access, brace-initialised return)
// appears to be the pre-change macro; the second #define
// (GiGetSubVector...V2 / GiSetSubVector...V2 accessors) appears to be its
// replacement. Confirm against the repository.
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
src2.val[0], src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
src2.val[1], src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 0), \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 1), \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t))
......
......@@ -26,39 +26,43 @@ struct HSwishOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct HSwishOp;
// OP(...) generates the HSwishOp<_ctype> specialisation with overloads for a
// single vector (_simd_type) and for a two-sub-vector value (_simd_type2).
//
//   _ctype       element type of the destination buffer
//   _simd_type   single-vector type (GI_FLOAT32_t in the invocation below)
//   _simd_type2  two-sub-vector type (GI_FLOAT32_V2_t in the invocation below)
//   _func_suffix suffix pasted onto the Gi* helpers
//   _simd_width  value of SIMD_WIDTH; the invocation passes
//                GI_SIMD_LEN_BYTE / sizeof(float)
//
// Single-vector operator(): computes
//     src * max(min(src + 3, 6), 0) * (1 / 6)
// from broadcast constants 0, 6, 3 and 1/6.
// Two-sub-vector operator(): extracts both sub-vectors, runs
// H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2) on them (macro defined
// elsewhere; presumably the same formula applied in place -- TODO confirm),
// and returns them as a new _simd_type2.
// The storing overloads call the matching value-returning overload and write
// the result at dst (and, for the pair form, the second sub-vector at
// dst + SIMD_WIDTH).
//
// NOTE(review): this listing looks like a rendered diff without +/- markers.
// The first #define (direct `.val[i]` access, brace-initialised return)
// appears to be the pre-change macro; the second #define
// (GiGetSubVector...V2 / GiSetSubVector...V2 accessors) appears to be its
// replacement. Confirm against the repository.
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
......@@ -90,14 +94,23 @@ struct HSwishOp<dt_qint32, dt_qint8> : HSwishOpBase<dt_qint32, dt_qint8> {
}
// H-swish for the dt_qint32 -> dt_qint8 specialisation (per the enclosing
// hunk header). Each int32 sub-vector of vsrc is converted to float and
// multiplied by this->vscale_src; H_SWISH_KERN_FALLBACK(Float32, ...) is then
// applied to both halves (macro defined elsewhere); each half is multiplied
// by this->vscale_dst; and the two results are handed to QConverter::convert,
// which yields the GI_INT8_t return value.
//
// NOTE(review): vitem0 / vitem1 are declared twice, the vscale_dst multiply
// appears twice, and there are two return statements. This listing looks like
// a rendered diff without +/- markers: the `.val[i]` / bare `this->vscale_*`
// / `{{vitem0, vitem1}}` lines appear to be the removed form of the
// GiGetSubVectorInt32V2 / GiFixLenType2GiFloat32Type /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src));
H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
// Apply the destination scale before conversion.
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -32,14 +32,22 @@ struct MaxOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMaximum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMaximum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -87,12 +95,23 @@ struct MaxOp<dt_qint8, dt_qint8> : MaxOpBase<dt_qint8, dt_qint8> {
// Element-wise maximum for quantised operands, each held as a pair of int32
// sub-vectors. For sub-vector index 0 and 1: vsrc0 is converted to float and
// multiplied by this->vscale0, vsrc1 is converted to float and multiplied by
// this->vscale1, and GiMaximumFloat32 picks the larger of the two products.
// The two float results are then handed to QConverter::convert, which yields
// the GI_INT8_t return value.
//
// NOTE(review): each GiMaximumFloat32 call is followed by two argument lists
// and the function has two return statements. This listing looks like a
// rendered diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}`
// lines appear to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -33,14 +33,22 @@ struct MinOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMinimum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMinimum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -84,12 +92,23 @@ struct MinOp<dt_qint8, dt_qint8> : MinOpBase<dt_qint8, dt_qint8> {
// Element-wise minimum for quantised operands, each held as a pair of int32
// sub-vectors. For sub-vector index 0 and 1: vsrc0 is converted to float and
// multiplied by this->vscale0, vsrc1 is converted to float and multiplied by
// this->vscale1, and GiMinimumFloat32 picks the smaller of the two products.
// The two float results are then handed to QConverter::convert, which yields
// the GI_INT8_t return value.
//
// NOTE(review): each GiMinimumFloat32 call is followed by two argument lists
// and the function has two return statements. This listing looks like a
// rendered diff without +/- markers: the `.val[i]` / `{{vitem0, vitem1}}`
// lines appear to be the removed form of the GiGetSubVectorInt32V2 /
// GiSetSubVectorFloat32V2 lines next to them. Confirm against the repository.
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
// Pack both halves into one GI_FLOAT32_V2_t for the converter.
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -33,14 +33,22 @@ struct MulOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMultiply##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiply##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -83,13 +91,24 @@ struct MulOp<dt_qint8, dt_qint8> : MulOpBase<dt_qint8, dt_qint8> {
/**
 * Vectorized product for the quantized int8 path, taking two int32 vector
 * pairs.
 *
 * For sub-vector index 0 and 1 the int32 lanes of vsrc0 are cast to float32
 * and multiplied by vscale_src0, the lanes of vsrc1 are cast and multiplied
 * by vscale1, and the two scaled vectors are multiplied lane-wise. Both
 * float32 results are packed into a GI_FLOAT32_V2_t and passed to
 * QConverter::convert, whose GI_INT8_t result is returned.
 *
 * NOTE(review): the first operand uses vscale_src0 (not vscale0) while the
 * second uses vscale1; the definitions of both live outside this view --
 * confirm against MulOpBase.
 */
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
    auto vitem0 = GiMultiplyFloat32(
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
                    GiFixLenType2GiFloat32Type(this->vscale_src0)),
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
                    GiFixLenType2GiFloat32Type(this->vscale1)));
    auto vitem1 = GiMultiplyFloat32(
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
                    GiFixLenType2GiFloat32Type(this->vscale_src0)),
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
                    GiFixLenType2GiFloat32Type(this->vscale1)));
    // Pack both halves through the setter before the float32 -> int8 step.
    GI_FLOAT32_V2_t tmp;
    GiSetSubVectorFloat32V2(tmp, 0, vitem0);
    GiSetSubVectorFloat32V2(tmp, 1, vitem1);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -16,23 +16,24 @@ struct NoneOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct NoneOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, src.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(src, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(src, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
......@@ -61,8 +62,8 @@ struct NoneOp<dt_qint32, dt_qint8> : NoneOpBase<dt_qint32, dt_qint8> {
constexpr static size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t);
/**
 * Stores both int32 sub-vectors of vsrc to dst without modifying them.
 *
 * Sub-vector 0 is written at dst and sub-vector 1 at dst + 16, i.e. 16
 * dt_qint8 elements further on.
 *
 * NOTE(review): the offset 16 is a literal rather than being derived from
 * GI_SIMD_LEN_BYTE -- confirm it is correct for every supported vector
 * length.
 */
void operator()(const GI_INT32_V2_t& vsrc, dt_qint8* dst) const {
    GiStoreInt32(dst, GiGetSubVectorInt32V2(vsrc, 0));
    GiStoreInt32(dst + 16, GiGetSubVectorInt32V2(vsrc, 1));
}
void operator()(const GI_INT32_t& src, dt_qint8* dst) const {
GiStoreInt32(dst, src);
......
......@@ -20,37 +20,43 @@ struct ReluOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct ReluOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto vitem0 = GiMaximum##_func_suffix(src.val[0], zero); \
auto vitem1 = GiMaximum##_func_suffix(src.val[1], zero); \
return {{vitem0, vitem1}}; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiMaximum##_func_suffix(src, zero); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero_num) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 0), zero); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 1), zero); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
return GiMaximum##_func_suffix(src, zero); \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float),
vfzero)
OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t),
vzero)
OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t),
vzero_int8)
0.0f)
OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), 0)
OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), 0)
#undef OP
template <>
......@@ -76,11 +82,19 @@ struct ReluOp<dt_qint8, dt_qint8> : ReluOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK;
}
/**
 * Quantized ReLU over an int32 vector pair, producing one int8 vector.
 *
 * Each sub-vector of vsrc is cast to float32 and multiplied by vscale, then
 * clamped from below with a lane-wise maximum against a zero vector. The
 * two clamped vectors are packed into a GI_FLOAT32_V2_t and passed to
 * QConverter::convert, whose GI_INT8_t result is returned.
 */
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
    // Zero vector is created locally for this call.
    GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
    auto vitem0 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
            GiFixLenType2GiFloat32Type(this->vscale));
    auto vitem1 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
            GiFixLenType2GiFloat32Type(this->vscale));
    vitem0 = GiMaximumFloat32(vitem0, vfzero);
    vitem1 = GiMaximumFloat32(vitem1, vfzero);
    GI_FLOAT32_V2_t tmp;
    GiSetSubVectorFloat32V2(tmp, 0, vitem0);
    GiSetSubVectorFloat32V2(tmp, 1, vitem1);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......@@ -104,6 +118,8 @@ template <>
struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase {
using ReluOpBase::operator();
constexpr static size_t SIMD_WIDTH = 4;
GI_INT32_t vzero = GiBroadcastInt32(0);
GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
// Forwards both dtypes to ReluOpBase and constructs FixupBase from `scale`.
// NOTE(review): `scale` is not declared in this view; presumably a member
// inherited from ReluOpBase that is already set when FixupBase is built --
// confirm the base-class initialization order.
ReluOp(DType src_dtype, DType dst_dtype)
: ReluOpBase(src_dtype, dst_dtype), FixupBase(scale) {}
......@@ -115,8 +131,8 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase
vst1_s8(reinterpret_cast<int8_t*>(dst), vget_low_s8(operator()(vsrc)));
}
int8x16_t operator()(const int32x4x2_t& vsrc) const {
int32x4_t vitem0 = vqrdmulhq_s32(vsrc.val[0], vmultiplier);
int32x4_t vitem1 = vqrdmulhq_s32(vsrc.val[1], vmultiplier);
int32x4_t vitem0 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 0), vmultiplier);
int32x4_t vitem1 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 1), vmultiplier);
vitem0 = vmaxq_s32(vitem0, vzero);
vitem1 = vmaxq_s32(vitem1, vzero);
auto tmp = vqmovn_s16(vcombine_s16(
......@@ -158,24 +174,36 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> {
}
/**
 * Applies the single-vector ReLU to src and stores lane 0 of the result.
 *
 * operator()(src) yields a GI_INT8_t; it is reinterpreted as an int32
 * vector with GiReinterpretInt8AsInt32 (instead of a C-style cast) and
 * GiStoreLane0Int32 writes its first int32 lane to dst, i.e. four int8
 * result bytes.
 */
void operator()(const GI_INT32_t& src, dt_qint8* dst) const {
    GiStoreLane0Int32(
            reinterpret_cast<int32_t*>(dst),
            GiReinterpretInt8AsInt32(operator()(src)));
}
/**
 * Quantized ReLU (qint32 -> qint8) over an int32 vector pair.
 *
 * Each sub-vector of vsrc is cast to float32 and multiplied by vscale; the
 * products are clamped from below with a lane-wise maximum against zero,
 * packed into a GI_FLOAT32_V2_t, and converted to GI_INT8_t by
 * QConverter::convert.
 */
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
    auto vitem0 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
            GiFixLenType2GiFloat32Type(this->vscale));
    auto vitem1 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
            GiFixLenType2GiFloat32Type(this->vscale));
    // Zero vector is created locally for this call.
    GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
    vitem0 = GiMaximumFloat32(vitem0, vfzero);
    vitem1 = GiMaximumFloat32(vitem1, vfzero);
    GI_FLOAT32_V2_t tmp;
    GiSetSubVectorFloat32V2(tmp, 0, vitem0);
    GiSetSubVectorFloat32V2(tmp, 1, vitem1);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
/**
 * Quantized ReLU for a single int32 vector.
 *
 * src is cast to float32, multiplied by vscale, clamped from below with a
 * lane-wise maximum against zero, and converted to GI_INT8_t by
 * QConverter::convert.
 */
GI_INT8_t operator()(const GI_INT32_t& src) const {
    auto vitem0 = GiMultiplyFloat32(
            GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale));
    GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
    vitem0 = GiMaximumFloat32(vitem0, vfzero);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
/**
 * Quantized ReLU for an input that is already a float32 vector.
 *
 * src is multiplied by vscale, clamped from below with a lane-wise maximum
 * against zero, and converted to GI_INT8_t by QConverter::convert.
 */
GI_INT8_t operator()(const GI_FLOAT32_t& src) const {
    auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale));
    GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
    vitem0 = GiMaximumFloat32(vitem0, vfzero);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
......
......@@ -25,27 +25,33 @@ struct SigmoidOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct SigmoidOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2( \
ret, 0, operator()(GiGetSubVector##_func_suffix##V2(src, 0))); \
GiSetSubVector##_func_suffix##V2( \
ret, 1, operator()(GiGetSubVector##_func_suffix##V2(src, 1))); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP
......
......@@ -33,14 +33,22 @@ struct SubOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiSubtract##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiSubtract##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......@@ -82,12 +90,23 @@ struct SubOp<dt_qint8, dt_qint8> : SubOpBase<dt_qint8, dt_qint8> {
}
/**
 * Vectorized subtraction for the quantized int8 path, taking two int32
 * vector pairs.
 *
 * For sub-vector index 0 and 1 the int32 lanes of vsrc0 / vsrc1 are cast to
 * float32 and multiplied by vscale0 / vscale1 respectively; the scaled
 * vsrc1 value is then subtracted from the scaled vsrc0 value. Both float32
 * results are packed into a GI_FLOAT32_V2_t and passed to
 * QConverter::convert, whose GI_INT8_t result is returned.
 */
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
    auto vitem0 = GiSubtractFloat32(
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
                    GiFixLenType2GiFloat32Type(this->vscale0)),
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
                    GiFixLenType2GiFloat32Type(this->vscale1)));
    auto vitem1 = GiSubtractFloat32(
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
                    GiFixLenType2GiFloat32Type(this->vscale0)),
            GiMultiplyFloat32(
                    GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
                    GiFixLenType2GiFloat32Type(this->vscale1)));
    // Pack both halves through the setter before the float32 -> int8 step.
    GI_FLOAT32_V2_t tmp;
    GiSetSubVectorFloat32V2(tmp, 0, vitem0);
    GiSetSubVectorFloat32V2(tmp, 1, vitem1);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
......
......@@ -23,54 +23,58 @@ struct TanhOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct TanhOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP
......
......@@ -36,18 +36,22 @@ struct TrueDivOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiDivide##_func_suffix(val1, val3); \
val2 = GiDivide##_func_suffix(val2, val4); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
......
......@@ -21,7 +21,8 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}
/**
 * Converts a single int32 vector and stores lane 0 of the result.
 *
 * operator()(vsrc) yields a GI_INT8_t; it is reinterpreted as an int32
 * vector with GiReinterpretInt8AsInt32 (instead of a C-style cast) and
 * GiStoreLane0Int32 writes its first int32 lane to dst, i.e. four int8
 * result bytes.
 */
void operator()(const GI_INT32_t& vsrc, dt_qint8* dst) const {
    GiStoreLane0Int32(
            reinterpret_cast<int32_t*>(dst),
            GiReinterpretInt8AsInt32(operator()(vsrc)));
}
void operator()(const src_ctype& src, dst_ctype* dst) const {
*dst = operator()(src);
......@@ -32,17 +33,25 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}
/**
 * Converts an int32 vector pair to one int8 vector.
 *
 * Each sub-vector of vsrc is cast to float32 and multiplied by vscale; the
 * two products are packed into a GI_FLOAT32_V2_t and converted to
 * GI_INT8_t by QConverter::convert.
 */
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
    auto vitem0 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
            GiFixLenType2GiFloat32Type(this->vscale));
    auto vitem1 = GiMultiplyFloat32(
            GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
            GiFixLenType2GiFloat32Type(this->vscale));
    GI_FLOAT32_V2_t tmp;
    GiSetSubVectorFloat32V2(tmp, 0, vitem0);
    GiSetSubVectorFloat32V2(tmp, 1, vitem1);
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
/**
 * Converts a single int32 vector: cast to float32, multiply by vscale, then
 * convert to GI_INT8_t through QConverter::convert.
 */
GI_INT8_t operator()(const GI_INT32_t& src) const {
    auto vitem0 = GiMultiplyFloat32(
            GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale));
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
/**
 * Converts a float32 vector: multiply by vscale, then convert to GI_INT8_t
 * through QConverter::convert.
 */
GI_INT8_t operator()(const GI_FLOAT32_t& src) const {
    auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale));
    return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0);
}
};
......
......@@ -11,8 +11,9 @@ struct LoadHelper {
static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
/*
 * Loads one entry: src[step] receives the value produced by Func::impl for
 * the address ptr + base_offset + step * ptr_step, wrapped with
 * GiFloat32Type2FixLenType before the assignment. src, ptr, base_offset,
 * ptr_step, Func and args are taken from the expansion site.
 */
#define WEIGHT_CB(step)                        \
    src[step] = GiFloat32Type2FixLenType(      \
            Func::impl(ptr + base_offset + step * ptr_step, args...));
#define LOAD_HELPER(step) \
template < \
......
......@@ -38,7 +38,13 @@ template <>
/**
 * Converts one int16 vector into a pair of float32 vectors.
 *
 * vsrc is split with GiMoveLowLongInt16 / GiMoveHighLongInt16 into two
 * int32 vectors, each of which is cast to float32. The value derived from
 * the "low" call is stored as sub-vector 0 of the result and the value
 * derived from the "high" call as sub-vector 1.
 */
inline GI_FLOAT32_V2_t QConverter::convert(const GI_INT16_t& vsrc) {
    GI_INT32_t vhi = GiMoveHighLongInt16(vsrc);
    GI_INT32_t vlo = GiMoveLowLongInt16(vsrc);
    GI_FLOAT32_t fhi = GiCastToFloat32(vhi);
    GI_FLOAT32_t flo = GiCastToFloat32(vlo);
    GI_FLOAT32_V2_t ret;
    GiSetSubVectorFloat32V2(ret, 0, flo);
    GiSetSubVectorFloat32V2(ret, 1, fhi);
    return ret;
}
template <>
......
此差异已折叠。
......@@ -18,22 +18,26 @@ struct QuantizedTypeCvter<int32_t, int8_t> {
// Number of int32 elements consumed by one cvt() call (two vectors).
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t) * 2;
// Number of int32 elements in one vector; offset of the second load in cvt().
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(int32_t);
// Scalar rescale factor: src_scale / dst_scale.
float scale;
// `scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * Reads the QuantizedS32 scale of src_dtype and the QuantizedS8 scale of
 * dst_dtype, stores their ratio in `scale`, and caches the broadcast ratio
 * in `vscale`.
 */
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS32>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
/**
 * Converts two vectors of int32 values to int8.
 *
 * Loads one int32 vector at src and one at src + SIMD_STEP, casts each to
 * float32 and multiplies by the cached scale, packs both into a
 * GI_FLOAT32_V2_t, converts the pair with QConverter::convert, and writes
 * the result to dst with GiStoreLowInt8.
 */
void cvt(const int32_t* src, int8_t* dst) {
    // Unwrap the fixed-length member once and reuse it for both products.
    GI_FLOAT32_t t;
    t = GiFixLenType2GiFloat32Type(vscale);
    GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t);
    GI_FLOAT32_t vitem1 =
            GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), t);
    GI_FLOAT32_V2_t v2;
    GiSetSubVectorFloat32V2(v2, 0, vitem0);
    GiSetSubVectorFloat32V2(v2, 1, vitem1);
    auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2);
    GiStoreLowInt8(dst, vres);
}
......@@ -48,27 +52,29 @@ struct QuantizedTypeCvter<int8_t, int32_t> {
using dst_type = int32_t;
// Number of int8 elements in one vector.
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
// Scalar rescale factor: src_scale / dst_scale.
float scale;
// `scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * Reads the QuantizedS8 scale of src_dtype and the QuantizedS32 scale of
 * dst_dtype, stores their ratio in `scale`, and caches the broadcast ratio
 * in `vscale`.
 */
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS8>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
void cvt(const int8_t* src, int32_t* dst) {
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_INT8_t data = GiLoadInt8(src);
GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
auto vret0 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale));
auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32(
GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale));
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t));
auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t));
auto vret2 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale));
auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32(
GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale));
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t));
auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t));
constexpr size_t step = GI_SIMD_LEN_BYTE / sizeof(int32_t);
GiStoreInt32(dst, vret0);
......@@ -90,21 +96,26 @@ struct QuantizedTypeCvter<float, int8_t> {
// Number of float elements consumed by one cvt() call (two vectors).
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float) * 2;
// Number of float elements in one vector; offset of the second load in cvt().
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float);
// Scalar factor: 1 / dst_scale.
float scale;
// `scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * The source scale is fixed at 1 (src_dtype is only marked as used); the
 * QuantizedS8 scale of dst_dtype is read, `scale` becomes 1 / dst_scale,
 * and the broadcast value is cached in `vscale`.
 */
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    MEGDNN_MARK_USED_VAR(src_dtype);
    float src_scale = 1;
    float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
/**
 * Converts two vectors of float values to int8.
 *
 * Loads one float32 vector at src and one at src + SIMD_STEP, multiplies
 * each by the cached scale, packs both into a GI_FLOAT32_V2_t, converts the
 * pair with QConverter::convert, and writes the result to dst with
 * GiStoreLowInt8.
 */
void cvt(const float* src, int8_t* dst) {
    // Unwrap the fixed-length member once and reuse it for both products.
    GI_FLOAT32_t t;
    t = GiFixLenType2GiFloat32Type(vscale);
    GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), t);
    GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), t);
    GI_FLOAT32_V2_t v2;
    GiSetSubVectorFloat32V2(v2, 0, vitem0);
    GiSetSubVectorFloat32V2(v2, 1, vitem1);
    auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2);
    GiStoreLowInt8(dst, vres);
}
......@@ -119,18 +130,19 @@ struct QuantizedTypeCvter<int32_t, int32_t> {
using dst_type = int32_t;
// Number of int32 elements in one vector.
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t);
// Scalar rescale factor: src_scale / dst_scale.
float scale;
// `scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * Reads the QuantizedS32 scales of src_dtype and dst_dtype, stores their
 * ratio in `scale`, and caches the broadcast ratio in `vscale`.
 */
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS32>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
void cvt(const int32_t* src, int32_t* dst) {
GI_FLOAT32_t vitem =
GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale);
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_FLOAT32_t vitem = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t);
auto vres = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(vitem);
GiStoreInt32(dst, vres);
......@@ -148,30 +160,32 @@ struct QuantizedTypeCvter<int8_t, int8_t> {
using dst_type = int8_t;
// Number of int8 elements in one vector.
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
// Scalar rescale factor: src_scale / dst_scale.
float scale;
// `scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * Reads the QuantizedS8 scales of src_dtype and dst_dtype, stores their
 * ratio in `scale`, and caches the broadcast ratio in `vscale`.
 */
QuantizedTypeCvter(DType src_dtype, DType dst_dtype) {
    float src_scale = src_dtype.param<dtype::QuantizedS8>().scale;
    float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale;
    scale = src_scale / dst_scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
/**
 * Rescales one vector of int8 values to int8.
 *
 * The loaded int8 vector is widened in two stages (GiMove{Low,High}LongInt8
 * to int16, then GiMove{Low,High}LongInt16 to int32), giving four int32
 * vectors. Each is cast to float32 and multiplied by the cached scale. The
 * four products are packed into a GI_FLOAT32_V4_t in the order
 * low/low, low/high, high/low, high/high, converted with
 * QConverter::convert, and stored to dst with GiStoreInt8.
 */
void cvt(const int8_t* src, int8_t* dst) {
    // Unwrap the fixed-length member once and reuse it for all four products.
    GI_FLOAT32_t t;
    t = GiFixLenType2GiFloat32Type(vscale);
    GI_INT8_t data = GiLoadInt8(src);
    GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
    GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
    auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t);
    auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t);
    auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t);
    auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t);
    GI_FLOAT32_V4_t v4;
    GiSetSubVectorFloat32V4(v4, 0, vret0);
    GiSetSubVectorFloat32V4(v4, 1, vret1);
    GiSetSubVectorFloat32V4(v4, 2, vret2);
    GiSetSubVectorFloat32V4(v4, 3, vret3);
    auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V4_t>(v4);
    GiStoreInt8(dst, vres);
}
......@@ -245,26 +259,24 @@ struct Quan2FloatTypeCvter<int8_t, float> {
// Number of int8 elements in one vector.
static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
// Number of float elements in one vector.
static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float);
// Scalar dequantization factor taken from the source dtype.
float _scale = 0.0f;
// `_scale` broadcast to a vector, kept in the fixed-length wrapper type.
GI_FLOAT32_FIXLEN_t vscale;

/**
 * Reads the QuantizedS8 scale of src_dtype into `_scale` and caches the
 * broadcast value in `vscale`. dst_dtype is only marked as used.
 */
Quan2FloatTypeCvter(DType src_dtype, DType dst_dtype) {
    _scale = src_dtype.param<dtype::QuantizedS8>().scale;
    vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(_scale));
    MEGDNN_MARK_USED_VAR(dst_dtype);
}
void cvt(const int8_t* src, float* dst) {
GI_FLOAT32_t t;
t = GiFixLenType2GiFloat32Type(vscale);
GI_INT8_t data = GiLoadInt8(src);
GI_INT16_t vitem0 = GiMoveLowLongInt8(data);
GI_INT16_t vitem1 = GiMoveHighLongInt8(data);
auto vret0 =
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale);
auto vret1 =
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale);
auto vret2 =
GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale);
auto vret3 =
GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale);
auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t);
auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t);
auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t);
auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t);
GiStoreFloat32(dst, vret0);
GiStoreFloat32(dst + SIMD_STEP, vret1);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册