diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h
index bbde866a0d8e3611d70a5043b74ddaf61f147ff1..5355f9e3807e68a34cc273105b2d52817f406d1a 100644
--- a/dnn/src/fallback/general_intrinsic/gi_float.h
+++ b/dnn/src/fallback/general_intrinsic/gi_float.h
@@ -1750,6 +1750,47 @@ GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) {
 #endif
 }
 
+//! De-interleaving load with a stride of 4: lane j of val[i] in the result
+//! holds ptr[4 * j + i], exactly as the scalar fallback below spells out.
+//! The scalar branch reads ptr[0] .. ptr[15]; the SIMD branches must match it.
+GI_FORCEINLINE
+GI_FLOAT32_V4_t GiLoadUzipFloat32V4(const float* ptr) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld4q_f32(ptr);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_V4_t v;
+    __m128 tmp0, tmp1, tmp2, tmp3;
+    //! four row loads, at ptr, ptr + 4, ptr + 8 and ptr + 12 ...
+    v.val[0] = GiLoadFloat32(ptr);
+    v.val[1] = GiLoadFloat32((ptr + 4));
+    v.val[2] = GiLoadFloat32((ptr + 8));
+    v.val[3] = GiLoadFloat32((ptr + 12));
+
+    //! ... then transpose the 4x4 block of rows so that val[i] is column i
+    tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
+    tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
+    tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
+    tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
+    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
+    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
+    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
+    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
+    return v;
+#elif defined(GI_RVV_INTRINSICS)
+    return vlseg4e32_v_f32m1x4(ptr, GI_SIMD_LEN_BYTE / sizeof(float));
+#else
+    GI_FLOAT32_V4_t ret;
+    for (size_t i = 0; i < 4; i++) {
+        ret.val[i][0] = ptr[0 + i];
+        ret.val[i][1] = ptr[4 + i];
+        ret.val[i][2] = ptr[8 + i];
+        ret.val[i][3] = ptr[12 + i];
+    }
+
+    return ret;
+#endif
+}
+
 GI_FORCEINLINE
 void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) {
 #if defined(GI_NEON_INTRINSICS)
diff --git a/dnn/test/fallback/gi.cpp b/dnn/test/fallback/gi.cpp
index 78964724f85a2a0c58aa351ef368a3a0b04dc659..250989cf63d4962ac28f4e1a4786afb0b2a6e2d0 100644
--- a/dnn/test/fallback/gi.cpp
+++ b/dnn/test/fallback/gi.cpp
@@ -4208,7 +4208,27 @@ TEST_F(FALLBACK, GiLoadUzipFloat32V3) {
         naive.push_back(s0[9 + i]);
     }
 
-    assert_eq((float*)&ret, naive);
+    assert_eq((float*)&ret, naive, SIMD_LEN * 3);
+}
+
+TEST_F(FALLBACK, GiLoadUzipFloat32V4) {
+    GI_FLOAT32_V4_t ret;
+    std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f,
+                          2.2f, 6.0f, 90.0f, 89.3f, 2.1f, -3.5f, 4.9f, -2312.1f};
+    s0.resize(SIMD_LEN * 4);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 4);
+    ret = GiLoadUzipFloat32V4(s0.data());
+    std::vector<float> naive;
+    //! expected order: group i is s0[i], s0[4 + i], s0[8 + i], s0[12 + i]
+    for (size_t i = 0; i < 4; i++) {
+        naive.push_back(s0[0 + i]);
+        naive.push_back(s0[4 + i]);
+        naive.push_back(s0[8 + i]);
+        naive.push_back(s0[12 + i]);
+    }
+
+    assert_eq((float*)&ret, naive, SIMD_LEN * 4);
 }
 
 TEST_F(FALLBACK, GiStoreZipFloat32V3) {