提交 58ba080d 编写于 作者: M Megvii Engine Team

feat(x86/rvv): make gi conv algo adapt to vv and vf model

GitOrigin-RevId: f29593be4df167f63029893bd9cf0fb667861622
上级 bd50e457
......@@ -24,21 +24,27 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
//! cb2(step, lane, ow_block): accumulate one term into both c[0][step] and
//! c[1][step] by calling GiSimdFmaLane directly. The weights come from
//! weight[0][lane] / weight[1][lane], and the source vector is picked with the
//! rotating index (step + src_idx) % ow_block. `c`, `weight`, `src` and
//! `src_idx` are not macro parameters; they must be visible at the expansion
//! site.
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
//! cb(step, lane, ow_block): same as cb2 but only updates c[0][step].
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The GiSimdFmaLane API is slow on x86 and rvv; as an alternative, use
//! GiMultiplyAddScalarFloat32.
//! MLA(a, b, c, d): a and b are fix-length vectors converted with
//! GiFixLenType2GiFloat32Type; c is used as a pointer and the scalar *(c + d)
//! is passed as the third argument.
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
//! MLA(a, b, c, d): a, b and c are all fix-length vectors converted with
//! GiFixLenType2GiFloat32Type; d is forwarded unchanged as the last argument
//! of GiSimdFmaLane (cb/cb2 pass their `lane` parameter here).
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
//! cb2(step, lane, ow_block): accumulate one term into both c[0][step] and
//! c[1][step] through MLA, using weight[0][lane] / weight[1][lane] and the
//! source slot (step + src_idx) % ow_block. `c`, `weight`, `src` and `src_idx`
//! must be visible at the expansion site. Because MLA has two forms, `src`
//! holds pointers in the first branch above and fix-length vectors in the
//! second.
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane));
//! cb(step, lane, ow_block): same as cb2 but only updates c[0][step].
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane));
#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template < \
......@@ -81,6 +87,7 @@ SHIFT_CAL_HELPER(4, 4);
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template <
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T,
......@@ -145,14 +152,23 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + (ow_block)*ic_step;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block)*ic_step));
#endif
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -188,19 +204,32 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + (ow_block)*ic_step;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block)*ic_step));
#endif
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr + (ow_block + 1) * ic_step;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step));
#endif
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -235,33 +264,54 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + (ow_block)*ic_step;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block)*ic_step));
#endif
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr + (ow_block + 1) * ic_step;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step));
#endif
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[2] = src_ptr + (ow_block + 2) * ic_step;
#else
src[2] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step));
#endif
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[3] = src_ptr + (ow_block + 3) * ic_step;
#else
src[3] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step));
#endif
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -297,45 +347,74 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
GI_FLOAT32_FIXLEN_t weight[c_dim][ic_step];
load_helper<ic_step, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + (ow_block)*ic_step;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block)*ic_step));
#endif
load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr + (ow_block + 1) * ic_step;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 1) * ic_step));
#endif
load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[2] = src_ptr + (ow_block + 2) * ic_step;
#else
src[2] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 2) * ic_step));
#endif
load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[3] = src_ptr + (ow_block + 3) * ic_step;
#else
src[3] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 3) * ic_step));
#endif
load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[4] = src_ptr + (ow_block + 4) * ic_step;
#else
src[4] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 4) * ic_step));
#endif
load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[5] = src_ptr + (ow_block + 5) * ic_step;
#else
src[5] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 5) * ic_step));
#endif
load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight);
......
......@@ -24,21 +24,28 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
};
//! cb2(step, lane, ow_block): accumulate one term into both c[0][step] and
//! c[1][step] by calling GiSimdFmaLane directly. The weights come from
//! weight[0][lane] / weight[1][lane], and the source vector is picked with the
//! rotating index (step + src_idx) % ow_block. `c`, `weight`, `src` and
//! `src_idx` are not macro parameters; they must be visible at the expansion
//! site.
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
//! cb(step, lane, ow_block): same as cb2 but only updates c[0][step].
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The GiSimdFmaLane API is slow on x86 and rvv; as an alternative, use
//! GiMultiplyAddScalarFloat32.
//! MLA(a, b, c, d): a and b are fix-length vectors converted with
//! GiFixLenType2GiFloat32Type; c is used as a pointer and the scalar *(c + d)
//! is passed as the third argument.
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
//! MLA(a, b, c, d): a, b and c are all fix-length vectors converted with
//! GiFixLenType2GiFloat32Type; d is forwarded unchanged as the last argument
//! of GiSimdFmaLane (cb/cb2 pass their `lane` parameter here).
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
//! cb2(step, lane, ow_block): accumulate one term into both c[0][step] and
//! c[1][step] through MLA, using weight[0][lane] / weight[1][lane] and the
//! source slot (step + src_idx) % ow_block. `c`, `weight`, `src` and `src_idx`
//! must be visible at the expansion site. Because MLA has two forms, `src`
//! holds pointers in the first branch above and fix-length vectors in the
//! second.
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane));
//! cb(step, lane, ow_block): same as cb2 but only updates c[0][step].
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane));
#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template < \
......@@ -81,6 +88,7 @@ SHIFT_CAL_HELPER(4, 4);
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template <
int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w, typename T,
......@@ -146,15 +154,24 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][4];
/////////row 0/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
/////////row 0/////////////
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -162,12 +179,20 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
/////////row 1/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -203,21 +228,34 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][4];
/////////row 0/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
/////////row 0/////////////
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + ow_block * simd_len));
#endif
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -225,17 +263,29 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
/////////row 1/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + ow_block * simd_len));
#endif
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -243,18 +293,30 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
src_ptr_odd += ld_src_iw;
weight_ptr += ld_weight_fh;
//////////row 2/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + ow_block * simd_len));
#endif
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -292,30 +354,51 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][4];
// even element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
// even element
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + ow_block * simd_len));
#endif
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr + (ow_block + 1) * simd_len;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len));
#endif
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
// odd element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr_odd + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr_odd + ow_block * simd_len));
#endif
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
......@@ -360,40 +443,69 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
for (int fh_idx = 0; fh_idx < filter_size; ++fh_idx) {
GI_FLOAT32_FIXLEN_t src[ow_block];
GI_FLOAT32_FIXLEN_t weight[c_dim][4];
// even element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const float* src[ow_block];
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr, 0);
#else
GI_FLOAT32_FIXLEN_t src[ow_block];
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr, 0);
#endif
// even element
load_helper<4, 0, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + ow_block * simd_len));
#endif
load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr + (ow_block + 1) * simd_len;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 1) * simd_len));
#endif
load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[2] = src_ptr + (ow_block + 2) * simd_len;
#else
src[2] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr + (ow_block + 2) * simd_len));
#endif
load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
// odd element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper<ow_block, 0, simd_len, 0>(src, src_ptr_odd, 0);
#else
load_helper<ow_block, 0, simd_len, 0, Vld1qF32S>(src, src_ptr_odd, 0);
#endif
load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[0] = src_ptr_odd + ow_block * simd_len;
#else
src[0] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr_odd + ow_block * simd_len));
#endif
load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src[1] = src_ptr_odd + (ow_block + 1) * simd_len;
#else
src[1] = GiFloat32Type2FixLenType(
GiLoadFloat32(src_ptr_odd + (ow_block + 1) * simd_len));
#endif
load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1qF32S>(
weight, weight_ptr, ld_weight_oc);
cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
......
......@@ -40,44 +40,29 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The GiSimdFmaLane API is slow on x86 and rvv; as an alternative, use
//! GiMultiplyAddScalarFloat32.
#define MLA GiMultiplyAddScalarFloat32
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \
c[1][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#undef MLA
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
//! cb(step): accumulate one term into both c[0][step] and c[1][step] through
//! MLA, using weight[0][weight_idx] / weight[1][weight_idx]. The flat source
//! position (step * stride + src_idx) is split into a slot index (/ 4) into
//! `src` and an offset within that slot (% 4), which is handed to MLA as its
//! last argument. `c`, `weight`, `src`, `stride`, `src_idx` and `weight_idx`
//! must be visible at the expansion site.
//! NOTE: in this file cb is the two-accumulator form and cb2 the
//! single-accumulator form.
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4));
//! cb2(step): same as cb but only updates c[0][step].
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4));
#define SHIFT_CAL_HELPER(ow_remain) \
template < \
int src_idx, int weight_idx, int stride, typename T, typename T2, \
......@@ -108,6 +93,7 @@ SHIFT_CAL_HELPER(8)
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template <
int src_idx, int weight_idx, int c_dim, int stride, int remain_w, typename T,
......
......@@ -15,6 +15,30 @@ using namespace conv_stride2;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
//! PREFER_VF is switched on for builds that define GI_RVV_INTRINSICS.
//! NOTE(review): "VF" presumably refers to the vector-scalar form of
//! multiply-add -- confirm against the GI headers.
#if defined(GI_RVV_INTRINSICS)
#define PREFER_VF
#endif
#if defined(PREFER_VF)
//! MLA(a, b, c, d): c is used as a pointer; the scalar *(c + d) is passed to
//! GiMultiplyAddScalarFloat32 together with a and b.
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d))
namespace {
//! Build one vector's worth of floats in \p ret from two plain float buffers.
//! With lanes = GI_SIMD_LEN_BYTE / sizeof(float), the result is
//!     a[n], a[n + 1], ..., a[lanes - 1], b[0], ..., b[n - 1]
//! i.e. the tail of \p a starting at element \p n followed by the first \p n
//! elements of \p b.
//! \param a   buffer read at indices [n, lanes)
//! \param b   buffer read at indices [0, n)
//! \param n   number of leading elements of \p a to drop
//! \param ret output buffer, written at indices [0, lanes)
GI_FORCEINLINE void ext_float32_ptr(
        const float* a, const float* b, const int n, float* ret) {
    int lanes = GI_SIMD_LEN_BYTE / sizeof(float);
    int from_a = lanes - n;

    //! head of the result: the part of a that survives the shift
    int out = 0;
    while (out < from_a) {
        ret[out] = a[out + n];
        ++out;
    }

    //! tail of the result: filled from the front of b
    for (int taken = 0; taken < n; ++taken) {
        ret[from_a + taken] = b[taken];
    }
}
}; // namespace
#else
//! MLA(a, b, c, d): forwards all four arguments unchanged to GiSimdFmaLane.
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#endif
void conv_stride2::do_conv_2x2_stride2(
const float* src, const float* filter, float* dst, size_t IH, size_t IW,
size_t OH, size_t OW, size_t IC) {
......@@ -29,7 +53,11 @@ void conv_stride2::do_conv_2x2_stride2(
const float* k0 = filter;
#if defined(PREFER_VF)
const float* _k0123 = k0;
#else
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
#endif
rep(h, OH) {
int nn = OW >> 2;
......@@ -41,16 +69,16 @@ void conv_stride2::do_conv_2x2_stride2(
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0);
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
_outp = MLA(_outp, _r00, _k0123, 0);
_outp = MLA(_outp, _r01, _k0123, 1);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
_outp = GiSimdFmaLane(_outp, _r10, _k0123, 2);
_outp = GiSimdFmaLane(_outp, _r11, _k0123, 3);
_outp = MLA(_outp, _r10, _k0123, 2);
_outp = MLA(_outp, _r11, _k0123, 3);
GiStoreFloat32(outptr, _outp);
......@@ -84,10 +112,18 @@ void conv_stride2::do_conv_3x3_stride2(
const float* k1 = filter + 3;
const float* k2 = filter + 5;
#if defined(PREFER_VF)
const float* _k0123 = k0;
const float* _k3456 = k1;
const float* _k5678 = k2;
float _k6789[GI_SIMD_LEN_BYTE / sizeof(float)];
ext_float32_ptr(_k5678, _k5678, 1, _k6789);
#else
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
GI_FLOAT32_t _k3456 = GiLoadFloat32(k1);
GI_FLOAT32_t _k5678 = GiLoadFloat32(k2);
GI_FLOAT32_t _k6789 = GiExtqFloat32(_k5678, _k5678, 1);
#endif
rep(h, OH) {
int nn = OW >> 2;
......@@ -102,9 +138,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t _r02 = GiExtqFloat32(
_r00, GiGetSubVectorFloat32V2(_r0n, 0), 1); // 2 4 6 8
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0);
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
_outp = GiSimdFmaLane(_outp, _r02, _k0123, 2);
_outp = MLA(_outp, _r00, _k0123, 0);
_outp = MLA(_outp, _r01, _k0123, 1);
_outp = MLA(_outp, _r02, _k0123, 2);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8);
......@@ -114,9 +150,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t _r12 =
GiExtqFloat32(_r10, GiGetSubVectorFloat32V2(_r1n, 0), 1);
_outp = GiSimdFmaLane(_outp, _r10, _k3456, 0);
_outp = GiSimdFmaLane(_outp, _r11, _k3456, 1);
_outp = GiSimdFmaLane(_outp, _r12, _k3456, 2);
_outp = MLA(_outp, _r10, _k3456, 0);
_outp = MLA(_outp, _r11, _k3456, 1);
_outp = MLA(_outp, _r12, _k3456, 2);
GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8);
......@@ -126,9 +162,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t _r22 =
GiExtqFloat32(_r20, GiGetSubVectorFloat32V2(_r2n, 0), 1);
_outp = GiSimdFmaLane(_outp, _r20, _k6789, 0);
_outp = GiSimdFmaLane(_outp, _r21, _k6789, 1);
_outp = GiSimdFmaLane(_outp, _r22, _k6789, 2);
_outp = MLA(_outp, _r20, _k6789, 0);
_outp = MLA(_outp, _r21, _k6789, 1);
_outp = MLA(_outp, _r22, _k6789, 2);
GiStoreFloat32(outptr, _outp);
......@@ -162,6 +198,15 @@ void conv_stride2::do_conv_5x5_stride2(
const float* r3 = src_ptr + IW * 3;
const float* r4 = src_ptr + IW * 4;
#if defined(PREFER_VF)
const float* _k0123 = filter;
const float* _k4567 = filter + 4;
const float* _k891011 = filter + 8;
const float* _k12131415 = filter + 12;
const float* _k16171819 = filter + 16;
const float* _k20212223 = filter + 20;
const float* _k24242424 = filter + 24;
#else
GI_FLOAT32_t _k0123 = GiLoadFloat32(filter);
GI_FLOAT32_t _k4567 = GiLoadFloat32(filter + 4);
GI_FLOAT32_t _k891011 = GiLoadFloat32(filter + 8);
......@@ -169,6 +214,7 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _k16171819 = GiLoadFloat32(filter + 16);
GI_FLOAT32_t _k20212223 = GiLoadFloat32(filter + 20);
GI_FLOAT32_t _k24242424 = GiBroadcastFloat32(filter[24]);
#endif
for (size_t i = 0; i < OH; i++) {
int nn = OW >> 2;
......@@ -230,35 +276,35 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r43 = GiExtqFloat32(_r41, _r4_9111315, 1);
GI_FLOAT32_t _r44 = GiExtqFloat32(_r40, _r4_8101214, 2);
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0);
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1);
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2);
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3);
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0);
_sum = GiSimdFmaLane(_sum, _r10, _k4567, 1);
_sum = GiSimdFmaLane(_sum, _r11, _k4567, 2);
_sum = GiSimdFmaLane(_sum, _r12, _k4567, 3);
_sum = GiSimdFmaLane(_sum, _r13, _k891011, 0);
_sum = GiSimdFmaLane(_sum, _r14, _k891011, 1);
_sum = GiSimdFmaLane(_sum, _r20, _k891011, 2);
_sum = GiSimdFmaLane(_sum, _r21, _k891011, 3);
_sum = GiSimdFmaLane(_sum, _r22, _k12131415, 0);
_sum = GiSimdFmaLane(_sum, _r23, _k12131415, 1);
_sum = GiSimdFmaLane(_sum, _r24, _k12131415, 2);
_sum = GiSimdFmaLane(_sum, _r30, _k12131415, 3);
_sum = GiSimdFmaLane(_sum, _r31, _k16171819, 0);
_sum = GiSimdFmaLane(_sum, _r32, _k16171819, 1);
_sum = GiSimdFmaLane(_sum, _r33, _k16171819, 2);
_sum = GiSimdFmaLane(_sum, _r34, _k16171819, 3);
_sum = GiSimdFmaLane(_sum, _r40, _k20212223, 0);
_sum = GiSimdFmaLane(_sum, _r41, _k20212223, 1);
_sum = GiSimdFmaLane(_sum, _r42, _k20212223, 2);
_sum = GiSimdFmaLane(_sum, _r43, _k20212223, 3);
_sum = GiSimdFmaLane(_sum, _r44, _k24242424, 0);
_sum = MLA(_sum, _r00, _k0123, 0);
_sum = MLA(_sum, _r01, _k0123, 1);
_sum = MLA(_sum, _r02, _k0123, 2);
_sum = MLA(_sum, _r03, _k0123, 3);
_sum = MLA(_sum, _r04, _k4567, 0);
_sum = MLA(_sum, _r10, _k4567, 1);
_sum = MLA(_sum, _r11, _k4567, 2);
_sum = MLA(_sum, _r12, _k4567, 3);
_sum = MLA(_sum, _r13, _k891011, 0);
_sum = MLA(_sum, _r14, _k891011, 1);
_sum = MLA(_sum, _r20, _k891011, 2);
_sum = MLA(_sum, _r21, _k891011, 3);
_sum = MLA(_sum, _r22, _k12131415, 0);
_sum = MLA(_sum, _r23, _k12131415, 1);
_sum = MLA(_sum, _r24, _k12131415, 2);
_sum = MLA(_sum, _r30, _k12131415, 3);
_sum = MLA(_sum, _r31, _k16171819, 0);
_sum = MLA(_sum, _r32, _k16171819, 1);
_sum = MLA(_sum, _r33, _k16171819, 2);
_sum = MLA(_sum, _r34, _k16171819, 3);
_sum = MLA(_sum, _r40, _k20212223, 0);
_sum = MLA(_sum, _r41, _k20212223, 1);
_sum = MLA(_sum, _r42, _k20212223, 2);
_sum = MLA(_sum, _r43, _k20212223, 3);
_sum = MLA(_sum, _r44, _k24242424, 0);
GiStoreFloat32(outptr, _sum);
......@@ -312,8 +358,13 @@ void conv_stride2::do_conv_7x7_stride2(
rep(i, nn) {
GI_FLOAT32_t _sum = GiLoadFloat32(outptr);
#if defined(PREFER_VF)
const float* _k0123 = k0;
const float* _k4567 = k0 + 4;
#else
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4);
#endif
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
......@@ -331,16 +382,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r05 = GiExtqFloat32(_r01, _r0_9111315, 2); // 5 7 9 11
GI_FLOAT32_t _r06 = GiExtqFloat32(_r00, _r0_8101214, 3); // 6 8 10 12
_sum = GiSimdFmaLane(_sum, _r00, _k0123, 0);
_sum = GiSimdFmaLane(_sum, _r01, _k0123, 1);
_sum = GiSimdFmaLane(_sum, _r02, _k0123, 2);
_sum = GiSimdFmaLane(_sum, _r03, _k0123, 3);
_sum = GiSimdFmaLane(_sum, _r04, _k4567, 0);
_sum = GiSimdFmaLane(_sum, _r05, _k4567, 1);
_sum = GiSimdFmaLane(_sum, _r06, _k4567, 2);
_sum = MLA(_sum, _r00, _k0123, 0);
_sum = MLA(_sum, _r01, _k0123, 1);
_sum = MLA(_sum, _r02, _k0123, 2);
_sum = MLA(_sum, _r03, _k0123, 3);
_sum = MLA(_sum, _r04, _k4567, 0);
_sum = MLA(_sum, _r05, _k4567, 1);
_sum = MLA(_sum, _r06, _k4567, 2);
#if defined(PREFER_VF)
const float* _k78910 = k1;
const float* _k11121314 = k1 + 4;
#else
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1);
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4);
#endif
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
......@@ -354,16 +410,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r15 = GiExtqFloat32(_r11, _r1_9111315, 2);
GI_FLOAT32_t _r16 = GiExtqFloat32(_r10, _r1_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r10, _k78910, 0);
_sum = GiSimdFmaLane(_sum, _r11, _k78910, 1);
_sum = GiSimdFmaLane(_sum, _r12, _k78910, 2);
_sum = GiSimdFmaLane(_sum, _r13, _k78910, 3);
_sum = GiSimdFmaLane(_sum, _r14, _k11121314, 0);
_sum = GiSimdFmaLane(_sum, _r15, _k11121314, 1);
_sum = GiSimdFmaLane(_sum, _r16, _k11121314, 2);
_sum = MLA(_sum, _r10, _k78910, 0);
_sum = MLA(_sum, _r11, _k78910, 1);
_sum = MLA(_sum, _r12, _k78910, 2);
_sum = MLA(_sum, _r13, _k78910, 3);
_sum = MLA(_sum, _r14, _k11121314, 0);
_sum = MLA(_sum, _r15, _k11121314, 1);
_sum = MLA(_sum, _r16, _k11121314, 2);
#if defined(PREFER_VF)
const float* _k14151617 = k2;
const float* _k18192021 = k2 + 4;
#else
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2);
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4);
#endif
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
......@@ -377,16 +438,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r25 = GiExtqFloat32(_r21, _r2_9111315, 2);
GI_FLOAT32_t _r26 = GiExtqFloat32(_r20, _r2_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r20, _k14151617, 0);
_sum = GiSimdFmaLane(_sum, _r21, _k14151617, 1);
_sum = GiSimdFmaLane(_sum, _r22, _k14151617, 2);
_sum = GiSimdFmaLane(_sum, _r23, _k14151617, 3);
_sum = GiSimdFmaLane(_sum, _r24, _k18192021, 0);
_sum = GiSimdFmaLane(_sum, _r25, _k18192021, 1);
_sum = GiSimdFmaLane(_sum, _r26, _k18192021, 2);
_sum = MLA(_sum, _r20, _k14151617, 0);
_sum = MLA(_sum, _r21, _k14151617, 1);
_sum = MLA(_sum, _r22, _k14151617, 2);
_sum = MLA(_sum, _r23, _k14151617, 3);
_sum = MLA(_sum, _r24, _k18192021, 0);
_sum = MLA(_sum, _r25, _k18192021, 1);
_sum = MLA(_sum, _r26, _k18192021, 2);
#if defined(PREFER_VF)
const float* _k21222324 = k3;
const float* _k25262728 = k3 + 4;
#else
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3);
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4);
#endif
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
......@@ -400,16 +466,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r35 = GiExtqFloat32(_r31, _r3_9111315, 2);
GI_FLOAT32_t _r36 = GiExtqFloat32(_r30, _r3_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r30, _k21222324, 0);
_sum = GiSimdFmaLane(_sum, _r31, _k21222324, 1);
_sum = GiSimdFmaLane(_sum, _r32, _k21222324, 2);
_sum = GiSimdFmaLane(_sum, _r33, _k21222324, 3);
_sum = GiSimdFmaLane(_sum, _r34, _k25262728, 0);
_sum = GiSimdFmaLane(_sum, _r35, _k25262728, 1);
_sum = GiSimdFmaLane(_sum, _r36, _k25262728, 2);
_sum = MLA(_sum, _r30, _k21222324, 0);
_sum = MLA(_sum, _r31, _k21222324, 1);
_sum = MLA(_sum, _r32, _k21222324, 2);
_sum = MLA(_sum, _r33, _k21222324, 3);
_sum = MLA(_sum, _r34, _k25262728, 0);
_sum = MLA(_sum, _r35, _k25262728, 1);
_sum = MLA(_sum, _r36, _k25262728, 2);
#if defined(PREFER_VF)
const float* _k28293031 = k4;
const float* _k32333435 = k4 + 4;
#else
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4);
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4);
#endif
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
......@@ -423,16 +494,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r45 = GiExtqFloat32(_r41, _r4_9111315, 2);
GI_FLOAT32_t _r46 = GiExtqFloat32(_r40, _r4_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r40, _k28293031, 0);
_sum = GiSimdFmaLane(_sum, _r41, _k28293031, 1);
_sum = GiSimdFmaLane(_sum, _r42, _k28293031, 2);
_sum = GiSimdFmaLane(_sum, _r43, _k28293031, 3);
_sum = GiSimdFmaLane(_sum, _r44, _k32333435, 0);
_sum = GiSimdFmaLane(_sum, _r45, _k32333435, 1);
_sum = GiSimdFmaLane(_sum, _r46, _k32333435, 2);
_sum = MLA(_sum, _r40, _k28293031, 0);
_sum = MLA(_sum, _r41, _k28293031, 1);
_sum = MLA(_sum, _r42, _k28293031, 2);
_sum = MLA(_sum, _r43, _k28293031, 3);
_sum = MLA(_sum, _r44, _k32333435, 0);
_sum = MLA(_sum, _r45, _k32333435, 1);
_sum = MLA(_sum, _r46, _k32333435, 2);
#if defined(PREFER_VF)
const float* _k35363738 = k5;
const float* _k39404142 = k5 + 4;
#else
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5);
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4);
#endif
GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5);
GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8);
......@@ -446,16 +522,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r55 = GiExtqFloat32(_r51, _r5_9111315, 2);
GI_FLOAT32_t _r56 = GiExtqFloat32(_r50, _r5_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r50, _k35363738, 0);
_sum = GiSimdFmaLane(_sum, _r51, _k35363738, 1);
_sum = GiSimdFmaLane(_sum, _r52, _k35363738, 2);
_sum = GiSimdFmaLane(_sum, _r53, _k35363738, 3);
_sum = GiSimdFmaLane(_sum, _r54, _k39404142, 0);
_sum = GiSimdFmaLane(_sum, _r55, _k39404142, 1);
_sum = GiSimdFmaLane(_sum, _r56, _k39404142, 2);
_sum = MLA(_sum, _r50, _k35363738, 0);
_sum = MLA(_sum, _r51, _k35363738, 1);
_sum = MLA(_sum, _r52, _k35363738, 2);
_sum = MLA(_sum, _r53, _k35363738, 3);
_sum = MLA(_sum, _r54, _k39404142, 0);
_sum = MLA(_sum, _r55, _k39404142, 1);
_sum = MLA(_sum, _r56, _k39404142, 2);
//! Weights for the last kernel row (used with the r6 input row). The second
//! group begins at k6 + 3 instead of k6 + 4, so it overlaps the first group
//! by one element, and the multiply-accumulate calls that consume it use
//! indices 1..3 rather than 0..2 as for the other rows.
//! NOTE(review): presumably this keeps a 4-wide load from reading past the
//! end of the kernel data -- confirm against the weight layout.
#if defined(PREFER_VF)
const float* _k42434445 = k6;
const float* _k45464748 = k6 + 3;
#else
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3);
#endif
GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6);
GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8);
......@@ -469,13 +550,13 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _r65 = GiExtqFloat32(_r61, _r6_9111315, 2);
GI_FLOAT32_t _r66 = GiExtqFloat32(_r60, _r6_8101214, 3);
_sum = GiSimdFmaLane(_sum, _r60, _k42434445, 0);
_sum = GiSimdFmaLane(_sum, _r61, _k42434445, 1);
_sum = GiSimdFmaLane(_sum, _r62, _k42434445, 2);
_sum = GiSimdFmaLane(_sum, _r63, _k42434445, 3);
_sum = GiSimdFmaLane(_sum, _r64, _k45464748, 1);
_sum = GiSimdFmaLane(_sum, _r65, _k45464748, 2);
_sum = GiSimdFmaLane(_sum, _r66, _k45464748, 3);
_sum = MLA(_sum, _r60, _k42434445, 0);
_sum = MLA(_sum, _r61, _k42434445, 1);
_sum = MLA(_sum, _r62, _k42434445, 2);
_sum = MLA(_sum, _r63, _k42434445, 3);
_sum = MLA(_sum, _r64, _k45464748, 1);
_sum = MLA(_sum, _r65, _k45464748, 2);
_sum = MLA(_sum, _r66, _k45464748, 3);
GiStoreFloat32(outptr, _sum);
......
......@@ -75,6 +75,21 @@ struct InputTransformF73_NCHW44 {
size_t icb = ic / pack_size;
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8;
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The x86 and RVV GiSimdFmaLane implementations are slow; as an alternative,
//! use GiMultiplyAddScalarFloat32 with a scalar multiplier instead.
#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d))
const float* v0 = input_parameters + 0;
const float* v1 = input_parameters + 4;
const float* v2 = input_parameters + 8;
const float* v3 = input_parameters + 12;
const float* v4 = input_parameters + 16;
const float* v5 = input_parameters + 20;
const float* v6 = input_parameters + 24;
#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d))
#else
#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d)
GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0);
GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4);
GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8);
......@@ -82,6 +97,7 @@ struct InputTransformF73_NCHW44 {
GI_FLOAT32_t v4 = GiLoadFloat32(input_parameters + 16);
GI_FLOAT32_t v5 = GiLoadFloat32(input_parameters + 20);
GI_FLOAT32_t v6 = GiLoadFloat32(input_parameters + 24);
#endif
//! B
//! 1.5 0 0 0 0 0 0 0 0
......@@ -120,59 +136,59 @@ struct InputTransformF73_NCHW44 {
auto t##i##5 = d7; \
auto t##i##6 = d7; \
auto t##i##7 = d7; \
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d7, v0, 0); \
t##i##8 = MSUB(t##i##8, d7, v0, 0); \
t##i##0 = GiSubtractFloat32(t##i##0, d1); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d1, v0, 0); \
t##i##2 = GiSimdFmaLane(t##i##2, d1, v0, 0); \
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d1, v0, 1); \
t##i##4 = GiSimdFmaLane(t##i##4, d1, v0, 1); \
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d1, v0, 2); \
t##i##6 = GiSimdFmaLane(t##i##6, d1, v0, 2); \
t##i##1 = MSUB(t##i##1, d1, v0, 0); \
t##i##2 = MADD(t##i##2, d1, v0, 0); \
t##i##3 = MSUB(t##i##3, d1, v0, 1); \
t##i##4 = MADD(t##i##4, d1, v0, 1); \
t##i##5 = MSUB(t##i##5, d1, v0, 2); \
t##i##6 = MADD(t##i##6, d1, v0, 2); \
t##i##7 = GiSubtractFloat32(t##i##7, d1); \
t##i##8 = GiSimdFmaLane(t##i##8, d1, v0, 0); \
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 3); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d2, v1, 0); \
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d2, v1, 1); \
t##i##3 = GiSimdFmaLane(t##i##3, d2, v1, 2); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d2, v1, 3); \
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d2, v2, 0); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d2, v2, 1); \
t##i##8 = MADD(t##i##8, d1, v0, 0); \
t##i##0 = MSUB(t##i##0, d2, v0, 3); \
t##i##1 = MSUB(t##i##1, d2, v1, 0); \
t##i##2 = MSUB(t##i##2, d2, v1, 1); \
t##i##3 = MADD(t##i##3, d2, v1, 2); \
t##i##4 = MSUB(t##i##4, d2, v1, 3); \
t##i##5 = MSUB(t##i##5, d2, v2, 0); \
t##i##6 = MSUB(t##i##6, d2, v2, 1); \
t##i##8 = GiSubtractFloat32(t##i##8, d2); \
t##i##0 = GiSimdFmaLane(t##i##0, d3, v2, 2); \
t##i##1 = GiSimdFmaLane(t##i##1, d3, v2, 3); \
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d3, v3, 0); \
t##i##3 = GiSimdFmaLane(t##i##3, d3, v2, 0); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d3, v3, 1); \
t##i##5 = GiSimdFmaLane(t##i##5, d3, v3, 2); \
t##i##6 = GiSimdFmaLane(t##i##6, d3, v3, 3); \
t##i##7 = GiSimdFmaLane(t##i##7, d3, v2, 2); \
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d3, v0, 3); \
t##i##0 = GiSimdFmaLane(t##i##0, d4, v0, 3); \
t##i##1 = GiSimdFmaLane(t##i##1, d4, v4, 0); \
t##i##2 = GiSimdFmaLane(t##i##2, d4, v4, 1); \
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d4, v4, 2); \
t##i##4 = GiSimdFmaLane(t##i##4, d4, v4, 3); \
t##i##5 = GiSimdFmaLane(t##i##5, d4, v5, 0); \
t##i##6 = GiSimdFmaLane(t##i##6, d4, v5, 1); \
t##i##8 = GiSimdFmaLane(t##i##8, d4, v2, 2); \
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d5, v2, 2); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d5, v5, 2); \
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d5, v5, 3); \
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d5, v6, 0); \
t##i##4 = GiSimdFmaLane(t##i##4, d5, v6, 1); \
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d5, v5, 2); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d5, v6, 0); \
t##i##7 = GiFmsqLaneQFloat32(t##i##7, d5, v2, 2); \
t##i##8 = GiSimdFmaLane(t##i##8, d5, v0, 3); \
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d6, v0, 0); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d6, v1, 0); \
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d6, v1, 1); \
t##i##3 = GiSimdFmaLane(t##i##3, d6, v1, 0); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d6, v3, 1); \
t##i##0 = MADD(t##i##0, d3, v2, 2); \
t##i##1 = MADD(t##i##1, d3, v2, 3); \
t##i##2 = MSUB(t##i##2, d3, v3, 0); \
t##i##3 = MADD(t##i##3, d3, v2, 0); \
t##i##4 = MSUB(t##i##4, d3, v3, 1); \
t##i##5 = MADD(t##i##5, d3, v3, 2); \
t##i##6 = MADD(t##i##6, d3, v3, 3); \
t##i##7 = MADD(t##i##7, d3, v2, 2); \
t##i##8 = MSUB(t##i##8, d3, v0, 3); \
t##i##0 = MADD(t##i##0, d4, v0, 3); \
t##i##1 = MADD(t##i##1, d4, v4, 0); \
t##i##2 = MADD(t##i##2, d4, v4, 1); \
t##i##3 = MSUB(t##i##3, d4, v4, 2); \
t##i##4 = MADD(t##i##4, d4, v4, 3); \
t##i##5 = MADD(t##i##5, d4, v5, 0); \
t##i##6 = MADD(t##i##6, d4, v5, 1); \
t##i##8 = MADD(t##i##8, d4, v2, 2); \
t##i##0 = MSUB(t##i##0, d5, v2, 2); \
t##i##1 = MSUB(t##i##1, d5, v5, 2); \
t##i##2 = MSUB(t##i##2, d5, v5, 3); \
t##i##3 = MSUB(t##i##3, d5, v6, 0); \
t##i##4 = MADD(t##i##4, d5, v6, 1); \
t##i##5 = MSUB(t##i##5, d5, v5, 2); \
t##i##6 = MSUB(t##i##6, d5, v6, 0); \
t##i##7 = MSUB(t##i##7, d5, v2, 2); \
t##i##8 = MADD(t##i##8, d5, v0, 3); \
t##i##0 = MSUB(t##i##0, d6, v0, 0); \
t##i##1 = MSUB(t##i##1, d6, v1, 0); \
t##i##2 = MSUB(t##i##2, d6, v1, 1); \
t##i##3 = MADD(t##i##3, d6, v1, 0); \
t##i##4 = MSUB(t##i##4, d6, v3, 1); \
t##i##5 = GiSubtractFloat32(t##i##5, d6); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d6, v6, 2); \
t##i##8 = GiFmsqLaneQFloat32(t##i##8, d6, v2, 2); \
t##i##0 = GiSimdFmaLane(t##i##0, d0, v0, 0);
t##i##6 = MSUB(t##i##6, d6, v6, 2); \
t##i##8 = MSUB(t##i##8, d6, v2, 2); \
t##i##0 = MADD(t##i##0, d0, v0, 0);
UNROLL_CALL_RAW(9, cb);
#undef cb
......@@ -187,59 +203,59 @@ struct InputTransformF73_NCHW44 {
d5 = t7##i; \
d6 = t7##i; \
d7 = t7##i; \
d8 = GiFmsqLaneQFloat32(d8, t7##i, v0, 0); \
d8 = MSUB(d8, t7##i, v0, 0); \
d0 = GiSubtractFloat32(d0, t1##i); \
d1 = GiFmsqLaneQFloat32(d1, t1##i, v0, 0); \
d2 = GiSimdFmaLane(d2, t1##i, v0, 0); \
d3 = GiFmsqLaneQFloat32(d3, t1##i, v0, 1); \
d4 = GiSimdFmaLane(d4, t1##i, v0, 1); \
d5 = GiFmsqLaneQFloat32(d5, t1##i, v0, 2); \
d6 = GiSimdFmaLane(d6, t1##i, v0, 2); \
d1 = MSUB(d1, t1##i, v0, 0); \
d2 = MADD(d2, t1##i, v0, 0); \
d3 = MSUB(d3, t1##i, v0, 1); \
d4 = MADD(d4, t1##i, v0, 1); \
d5 = MSUB(d5, t1##i, v0, 2); \
d6 = MADD(d6, t1##i, v0, 2); \
d7 = GiSubtractFloat32(d7, t1##i); \
d8 = GiSimdFmaLane(d8, t1##i, v0, 0); \
d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 3); \
d1 = GiFmsqLaneQFloat32(d1, t2##i, v1, 0); \
d2 = GiFmsqLaneQFloat32(d2, t2##i, v1, 1); \
d3 = GiSimdFmaLane(d3, t2##i, v1, 2); \
d4 = GiFmsqLaneQFloat32(d4, t2##i, v1, 3); \
d5 = GiFmsqLaneQFloat32(d5, t2##i, v2, 0); \
d6 = GiFmsqLaneQFloat32(d6, t2##i, v2, 1); \
d8 = MADD(d8, t1##i, v0, 0); \
d0 = MSUB(d0, t2##i, v0, 3); \
d1 = MSUB(d1, t2##i, v1, 0); \
d2 = MSUB(d2, t2##i, v1, 1); \
d3 = MADD(d3, t2##i, v1, 2); \
d4 = MSUB(d4, t2##i, v1, 3); \
d5 = MSUB(d5, t2##i, v2, 0); \
d6 = MSUB(d6, t2##i, v2, 1); \
d8 = GiSubtractFloat32(d8, t2##i); \
d0 = GiSimdFmaLane(d0, t3##i, v2, 2); \
d1 = GiSimdFmaLane(d1, t3##i, v2, 3); \
d2 = GiFmsqLaneQFloat32(d2, t3##i, v3, 0); \
d3 = GiSimdFmaLane(d3, t3##i, v2, 0); \
d4 = GiFmsqLaneQFloat32(d4, t3##i, v3, 1); \
d5 = GiSimdFmaLane(d5, t3##i, v3, 2); \
d6 = GiSimdFmaLane(d6, t3##i, v3, 3); \
d7 = GiSimdFmaLane(d7, t3##i, v2, 2); \
d8 = GiFmsqLaneQFloat32(d8, t3##i, v0, 3); \
d0 = GiSimdFmaLane(d0, t4##i, v0, 3); \
d1 = GiSimdFmaLane(d1, t4##i, v4, 0); \
d2 = GiSimdFmaLane(d2, t4##i, v4, 1); \
d3 = GiFmsqLaneQFloat32(d3, t4##i, v4, 2); \
d4 = GiSimdFmaLane(d4, t4##i, v4, 3); \
d5 = GiSimdFmaLane(d5, t4##i, v5, 0); \
d6 = GiSimdFmaLane(d6, t4##i, v5, 1); \
d8 = GiSimdFmaLane(d8, t4##i, v2, 2); \
d0 = GiFmsqLaneQFloat32(d0, t5##i, v2, 2); \
d1 = GiFmsqLaneQFloat32(d1, t5##i, v5, 2); \
d2 = GiFmsqLaneQFloat32(d2, t5##i, v5, 3); \
d3 = GiFmsqLaneQFloat32(d3, t5##i, v6, 0); \
d4 = GiSimdFmaLane(d4, t5##i, v6, 1); \
d5 = GiFmsqLaneQFloat32(d5, t5##i, v5, 2); \
d6 = GiFmsqLaneQFloat32(d6, t5##i, v6, 0); \
d7 = GiFmsqLaneQFloat32(d7, t5##i, v2, 2); \
d8 = GiSimdFmaLane(d8, t5##i, v0, 3); \
d0 = GiFmsqLaneQFloat32(d0, t6##i, v0, 0); \
d1 = GiFmsqLaneQFloat32(d1, t6##i, v1, 0); \
d2 = GiFmsqLaneQFloat32(d2, t6##i, v1, 1); \
d3 = GiSimdFmaLane(d3, t6##i, v1, 0); \
d4 = GiFmsqLaneQFloat32(d4, t6##i, v3, 1); \
d0 = MADD(d0, t3##i, v2, 2); \
d1 = MADD(d1, t3##i, v2, 3); \
d2 = MSUB(d2, t3##i, v3, 0); \
d3 = MADD(d3, t3##i, v2, 0); \
d4 = MSUB(d4, t3##i, v3, 1); \
d5 = MADD(d5, t3##i, v3, 2); \
d6 = MADD(d6, t3##i, v3, 3); \
d7 = MADD(d7, t3##i, v2, 2); \
d8 = MSUB(d8, t3##i, v0, 3); \
d0 = MADD(d0, t4##i, v0, 3); \
d1 = MADD(d1, t4##i, v4, 0); \
d2 = MADD(d2, t4##i, v4, 1); \
d3 = MSUB(d3, t4##i, v4, 2); \
d4 = MADD(d4, t4##i, v4, 3); \
d5 = MADD(d5, t4##i, v5, 0); \
d6 = MADD(d6, t4##i, v5, 1); \
d8 = MADD(d8, t4##i, v2, 2); \
d0 = MSUB(d0, t5##i, v2, 2); \
d1 = MSUB(d1, t5##i, v5, 2); \
d2 = MSUB(d2, t5##i, v5, 3); \
d3 = MSUB(d3, t5##i, v6, 0); \
d4 = MADD(d4, t5##i, v6, 1); \
d5 = MSUB(d5, t5##i, v5, 2); \
d6 = MSUB(d6, t5##i, v6, 0); \
d7 = MSUB(d7, t5##i, v2, 2); \
d8 = MADD(d8, t5##i, v0, 3); \
d0 = MSUB(d0, t6##i, v0, 0); \
d1 = MSUB(d1, t6##i, v1, 0); \
d2 = MSUB(d2, t6##i, v1, 1); \
d3 = MADD(d3, t6##i, v1, 0); \
d4 = MSUB(d4, t6##i, v3, 1); \
d5 = GiSubtractFloat32(d5, t6##i); \
d6 = GiFmsqLaneQFloat32(d6, t6##i, v6, 2); \
d8 = GiFmsqLaneQFloat32(d8, t6##i, v2, 2); \
d0 = GiSimdFmaLane(d0, t0##i, v0, 0); \
d6 = MSUB(d6, t6##i, v6, 2); \
d8 = MSUB(d8, t6##i, v2, 2); \
d0 = MADD(d0, t0##i, v0, 0); \
GiStoreFloat32( \
input_transform_buf + \
(0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \
......@@ -288,6 +304,8 @@ struct InputTransformF73_NCHW44 {
UNROLL_CALL_RAW(9, cb);
#undef cb
#undef MADD
#undef MSUB
}
};
......
......@@ -224,9 +224,7 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) {
#endif
#elif defined(GI_SSE2_INTRINSICS)
// fma is coming soon, but right now:
__m128 res;
res = _mm_mul_ps(c, b);
return _mm_add_ps(a, res);
return _mm_add_ps(a, _mm_mul_ps(c, b));
#elif defined(GI_RVV_INTRINSICS)
return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float));
#else
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册