提交 5c3b4e95 编写于 作者: M Megvii Engine Team

feat(x86/rvv): opt AlgoFP32WinogradF63_4x4_NCHW44

GitOrigin-RevId: 0cd0089982fdeea0d432da8c25a5e304ca65009a
上级 fa59a7b0
...@@ -73,9 +73,21 @@ struct InputTransformF63_NCHW44 { ...@@ -73,9 +73,21 @@ struct InputTransformF63_NCHW44 {
size_t icb = ic / pack_size; size_t icb = ic / pack_size;
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7; GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7;
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The GiSimdFmaLane API is slow on x86 and RVV; as an alternative, use
//! GiMultiplyAddScalarFloat32 / GiMultiplySubScalarFloat32
#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d))
#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d))
const float* v0 = input_parameters + 0;
const float* v1 = input_parameters + 4;
const float* v2 = input_parameters + 8;
#else
#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d)
GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0);
GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4);
GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8);
#endif
//! B //! B
//! 1 0 0 0 0 0 0 0 //! 1 0 0 0 0 0 0 0
...@@ -105,39 +117,39 @@ struct InputTransformF63_NCHW44 { ...@@ -105,39 +117,39 @@ struct InputTransformF63_NCHW44 {
t##i##0 = GiSubtractFloat32(t##i##0, d6); \ t##i##0 = GiSubtractFloat32(t##i##0, d6); \
t##i##1 = GiAddFloat32(t##i##1, d1); \ t##i##1 = GiAddFloat32(t##i##1, d1); \
t##i##2 = GiSubtractFloat32(t##i##2, d1); \ t##i##2 = GiSubtractFloat32(t##i##2, d1); \
t##i##3 = GiSimdFmaLane(t##i##3, d1, v0, 2); \ t##i##3 = MADD(t##i##3, d1, v0, 2); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d1, v0, 2); \ t##i##4 = MSUB(t##i##4, d1, v0, 2); \
t##i##5 = GiSimdFmaLane(t##i##5, d1, v1, 2); \ t##i##5 = MADD(t##i##5, d1, v1, 2); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d1, v1, 2); \ t##i##6 = MSUB(t##i##6, d1, v1, 2); \
t##i##7 = GiSubtractFloat32(t##i##7, d1); \ t##i##7 = GiSubtractFloat32(t##i##7, d1); \
t##i##0 = GiFmsqLaneQFloat32(t##i##0, d2, v0, 0); \ t##i##0 = MSUB(t##i##0, d2, v0, 0); \
t##i##1 = GiAddFloat32(t##i##1, d2); \ t##i##1 = GiAddFloat32(t##i##1, d2); \
t##i##2 = GiAddFloat32(t##i##2, d2); \ t##i##2 = GiAddFloat32(t##i##2, d2); \
t##i##3 = GiSimdFmaLane(t##i##3, d2, v0, 3); \ t##i##3 = MADD(t##i##3, d2, v0, 3); \
t##i##4 = GiSimdFmaLane(t##i##4, d2, v0, 3); \ t##i##4 = MADD(t##i##4, d2, v0, 3); \
t##i##5 = GiSimdFmaLane(t##i##5, d2, v1, 3); \ t##i##5 = MADD(t##i##5, d2, v1, 3); \
t##i##6 = GiSimdFmaLane(t##i##6, d2, v1, 3); \ t##i##6 = MADD(t##i##6, d2, v1, 3); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d3, v0, 1); \ t##i##1 = MSUB(t##i##1, d3, v0, 1); \
t##i##2 = GiSimdFmaLane(t##i##2, d3, v0, 1); \ t##i##2 = MADD(t##i##2, d3, v0, 1); \
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d3, v1, 0); \ t##i##3 = MSUB(t##i##3, d3, v1, 0); \
t##i##4 = GiSimdFmaLane(t##i##4, d3, v1, 0); \ t##i##4 = MADD(t##i##4, d3, v1, 0); \
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d3, v1, 0); \ t##i##5 = MSUB(t##i##5, d3, v1, 0); \
t##i##6 = GiSimdFmaLane(t##i##6, d3, v1, 0); \ t##i##6 = MADD(t##i##6, d3, v1, 0); \
t##i##7 = GiSimdFmaLane(t##i##7, d3, v0, 0); \ t##i##7 = MADD(t##i##7, d3, v0, 0); \
t##i##0 = GiSimdFmaLane(t##i##0, d4, v0, 0); \ t##i##0 = MADD(t##i##0, d4, v0, 0); \
t##i##1 = GiFmsqLaneQFloat32(t##i##1, d4, v0, 1); \ t##i##1 = MSUB(t##i##1, d4, v0, 1); \
t##i##2 = GiFmsqLaneQFloat32(t##i##2, d4, v0, 1); \ t##i##2 = MSUB(t##i##2, d4, v0, 1); \
t##i##3 = GiFmsqLaneQFloat32(t##i##3, d4, v1, 1); \ t##i##3 = MSUB(t##i##3, d4, v1, 1); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d4, v1, 1); \ t##i##4 = MSUB(t##i##4, d4, v1, 1); \
t##i##5 = GiFmsqLaneQFloat32(t##i##5, d4, v2, 0); \ t##i##5 = MSUB(t##i##5, d4, v2, 0); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d4, v2, 0); \ t##i##6 = MSUB(t##i##6, d4, v2, 0); \
t##i##1 = GiAddFloat32(t##i##1, d5); \ t##i##1 = GiAddFloat32(t##i##1, d5); \
t##i##2 = GiSubtractFloat32(t##i##2, d5); \ t##i##2 = GiSubtractFloat32(t##i##2, d5); \
t##i##3 = GiSimdFmaLane(t##i##3, d5, v1, 2); \ t##i##3 = MADD(t##i##3, d5, v1, 2); \
t##i##4 = GiFmsqLaneQFloat32(t##i##4, d5, v1, 2); \ t##i##4 = MSUB(t##i##4, d5, v1, 2); \
t##i##5 = GiSimdFmaLane(t##i##5, d5, v0, 2); \ t##i##5 = MADD(t##i##5, d5, v0, 2); \
t##i##6 = GiFmsqLaneQFloat32(t##i##6, d5, v0, 2); \ t##i##6 = MSUB(t##i##6, d5, v0, 2); \
t##i##7 = GiFmsqLaneQFloat32(t##i##7, d5, v0, 0); t##i##7 = MSUB(t##i##7, d5, v0, 0);
UNROLL_CALL_RAW(8, cb); UNROLL_CALL_RAW(8, cb);
#undef cb #undef cb
...@@ -153,39 +165,39 @@ struct InputTransformF63_NCHW44 { ...@@ -153,39 +165,39 @@ struct InputTransformF63_NCHW44 {
d0 = GiSubtractFloat32(d0, t6##i); \ d0 = GiSubtractFloat32(d0, t6##i); \
d1 = GiAddFloat32(d1, t1##i); \ d1 = GiAddFloat32(d1, t1##i); \
d2 = GiSubtractFloat32(d2, t1##i); \ d2 = GiSubtractFloat32(d2, t1##i); \
d3 = GiSimdFmaLane(d3, t1##i, v0, 2); \ d3 = MADD(d3, t1##i, v0, 2); \
d4 = GiFmsqLaneQFloat32(d4, t1##i, v0, 2); \ d4 = MSUB(d4, t1##i, v0, 2); \
d5 = GiSimdFmaLane(d5, t1##i, v1, 2); \ d5 = MADD(d5, t1##i, v1, 2); \
d6 = GiFmsqLaneQFloat32(d6, t1##i, v1, 2); \ d6 = MSUB(d6, t1##i, v1, 2); \
d7 = GiSubtractFloat32(d7, t1##i); \ d7 = GiSubtractFloat32(d7, t1##i); \
d0 = GiFmsqLaneQFloat32(d0, t2##i, v0, 0); \ d0 = MSUB(d0, t2##i, v0, 0); \
d1 = GiAddFloat32(d1, t2##i); \ d1 = GiAddFloat32(d1, t2##i); \
d2 = GiAddFloat32(d2, t2##i); \ d2 = GiAddFloat32(d2, t2##i); \
d3 = GiSimdFmaLane(d3, t2##i, v0, 3); \ d3 = MADD(d3, t2##i, v0, 3); \
d4 = GiSimdFmaLane(d4, t2##i, v0, 3); \ d4 = MADD(d4, t2##i, v0, 3); \
d5 = GiSimdFmaLane(d5, t2##i, v1, 3); \ d5 = MADD(d5, t2##i, v1, 3); \
d6 = GiSimdFmaLane(d6, t2##i, v1, 3); \ d6 = MADD(d6, t2##i, v1, 3); \
d1 = GiFmsqLaneQFloat32(d1, t3##i, v0, 1); \ d1 = MSUB(d1, t3##i, v0, 1); \
d2 = GiSimdFmaLane(d2, t3##i, v0, 1); \ d2 = MADD(d2, t3##i, v0, 1); \
d3 = GiFmsqLaneQFloat32(d3, t3##i, v1, 0); \ d3 = MSUB(d3, t3##i, v1, 0); \
d4 = GiSimdFmaLane(d4, t3##i, v1, 0); \ d4 = MADD(d4, t3##i, v1, 0); \
d5 = GiFmsqLaneQFloat32(d5, t3##i, v1, 0); \ d5 = MSUB(d5, t3##i, v1, 0); \
d6 = GiSimdFmaLane(d6, t3##i, v1, 0); \ d6 = MADD(d6, t3##i, v1, 0); \
d7 = GiSimdFmaLane(d7, t3##i, v0, 0); \ d7 = MADD(d7, t3##i, v0, 0); \
d0 = GiSimdFmaLane(d0, t4##i, v0, 0); \ d0 = MADD(d0, t4##i, v0, 0); \
d1 = GiFmsqLaneQFloat32(d1, t4##i, v0, 1); \ d1 = MSUB(d1, t4##i, v0, 1); \
d2 = GiFmsqLaneQFloat32(d2, t4##i, v0, 1); \ d2 = MSUB(d2, t4##i, v0, 1); \
d3 = GiFmsqLaneQFloat32(d3, t4##i, v1, 1); \ d3 = MSUB(d3, t4##i, v1, 1); \
d4 = GiFmsqLaneQFloat32(d4, t4##i, v1, 1); \ d4 = MSUB(d4, t4##i, v1, 1); \
d5 = GiFmsqLaneQFloat32(d5, t4##i, v2, 0); \ d5 = MSUB(d5, t4##i, v2, 0); \
d6 = GiFmsqLaneQFloat32(d6, t4##i, v2, 0); \ d6 = MSUB(d6, t4##i, v2, 0); \
d1 = GiAddFloat32(d1, t5##i); \ d1 = GiAddFloat32(d1, t5##i); \
d2 = GiSubtractFloat32(d2, t5##i); \ d2 = GiSubtractFloat32(d2, t5##i); \
d3 = GiSimdFmaLane(d3, t5##i, v1, 2); \ d3 = MADD(d3, t5##i, v1, 2); \
d4 = GiFmsqLaneQFloat32(d4, t5##i, v1, 2); \ d4 = MSUB(d4, t5##i, v1, 2); \
d5 = GiSimdFmaLane(d5, t5##i, v0, 2); \ d5 = MADD(d5, t5##i, v0, 2); \
d6 = GiFmsqLaneQFloat32(d6, t5##i, v0, 2); \ d6 = MSUB(d6, t5##i, v0, 2); \
d7 = GiFmsqLaneQFloat32(d7, t5##i, v0, 0); \ d7 = MSUB(d7, t5##i, v0, 0); \
GiStoreFloat32( \ GiStoreFloat32( \
input_transform_buf + \ input_transform_buf + \
(0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \ (0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \
...@@ -228,6 +240,8 @@ struct InputTransformF63_NCHW44 { ...@@ -228,6 +240,8 @@ struct InputTransformF63_NCHW44 {
d7); d7);
UNROLL_CALL_RAW(8, cb); UNROLL_CALL_RAW(8, cb);
#undef cb #undef cb
#undef MADD
#undef MSUB
} }
}; };
......
...@@ -889,6 +889,21 @@ GI_FLOAT32_t GiMultiplyAddScalarFloat32( ...@@ -889,6 +889,21 @@ GI_FLOAT32_t GiMultiplyAddScalarFloat32(
#endif #endif
} }
//! Multiply-subtract with a scalar multiplier.
//!
//! Returns VectorSub - Vector * Scalar, where Scalar is applied to every
//! lane of Vector. The backend is selected at compile time by the GI_*
//! intrinsics macros; the last branch is the plain-arithmetic fallback.
//!
//! \param VectorSub the minuend (value being subtracted from)
//! \param Vector    the vector multiplied by Scalar before subtraction
//! \param Scalar    the scalar multiplier
GI_FORCEINLINE
GI_FLOAT32_t GiMultiplySubScalarFloat32(
        GI_FLOAT32_t VectorSub, GI_FLOAT32_t Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
    return vmlsq_n_f32(VectorSub, Vector, Scalar);
#elif defined(GI_SSE2_INTRINSICS)
    //! separate multiply and subtract: broadcast the scalar, scale, subtract
    const GI_FLOAT32_t scaled = _mm_mul_ps(Vector, GiBroadcastFloat32(Scalar));
    return _mm_sub_ps(VectorSub, scaled);
#elif defined(GI_RVV_INTRINSICS)
    //! operand order is (multiplicand, scalar, addend, vl)
    return vfnmsub_vf_f32m1(
            Vector, Scalar, VectorSub, GI_SIMD_LEN_BYTE / sizeof(float));
#else
    return VectorSub - Vector * Scalar;
#endif
}
#if defined(GI_NEON_INTRINSICS) #if defined(GI_NEON_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \ #define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
......
...@@ -1808,6 +1808,29 @@ TEST_F(FALLBACK, GiMultiplyAddScalarFloat32) { ...@@ -1808,6 +1808,29 @@ TEST_F(FALLBACK, GiMultiplyAddScalarFloat32) {
assert_eq((float*)&ret, naive); assert_eq((float*)&ret, naive);
} }
//! Checks GiMultiplySubScalarFloat32 against a per-element scalar reference
//! computed as minuend[i] - multiplicand[i] * scalar.
TEST_F(FALLBACK, GiMultiplySubScalarFloat32) {
    float scalar = 3.1415;

    //! host-side inputs, resized to SIMD_LEN elements
    std::vector<float> minuend{1.1f, 2.2f, 3.5f, 4.9f};
    std::vector<float> multiplicand{2312.1f, 345.244f, 3.59f, -12.8f};
    minuend.resize(SIMD_LEN);
    multiplicand.resize(SIMD_LEN);

    GI_FLOAT32_t src0, src1, ret;
    init((float*)&src0, minuend);
    init((float*)&src1, multiplicand);

    //! reset the result register before the call under test writes it
    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
    ret = GiMultiplySubScalarFloat32(src0, src1, scalar);

    std::vector<float> expected;
    for (size_t lane = 0; lane < SIMD_LEN; ++lane) {
        expected.push_back(minuend[lane] - multiplicand[lane] * scalar);
    }

    assert_eq((float*)&ret, expected);
}
TEST_F(FALLBACK, GiMultiplyAddLanXXFloat32) { TEST_F(FALLBACK, GiMultiplyAddLanXXFloat32) {
GI_FLOAT32_t src0, src1, src2, ret; GI_FLOAT32_t src0, src1, src2, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f}; std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册