提交 8395a459 编写于 作者: M Megvii Engine Team

fix(dnn/fallback): fix naive shift multiple-definition error and optimize GiCvtFromInt32V4ToUint8

GitOrigin-RevId: 6660c35214c2f08a496d07f267845136abb696d7
上级 cc218550
......@@ -2055,6 +2055,7 @@ void GiStoreZipInt8V3(void* Buffer, GI_INT8_t a, GI_INT8_t b, GI_INT8_t c) {
#define GiShiftRightInt32(Vector, n) \
vsra_vx_i32m1(Vector, n, GI_SIMD_LEN_BYTE / sizeof(int32_t))
#else
GI_FORCEINLINE
GI_INT32_t ShiftRightNaive(GI_INT32_t src, const size_t shift) {
GI_INT32_t ret;
for (size_t idx = 0; idx < GI_SIMD_LEN_BYTE / sizeof(int32_t); ++idx) {
......@@ -2074,6 +2075,7 @@ GI_INT32_t ShiftRightNaive(GI_INT32_t src, const size_t shift) {
#define GiShiftLeftInt32(Vector, n) \
vsll_vx_i32m1(Vector, n, GI_SIMD_LEN_BYTE / sizeof(int32_t))
#else
GI_FORCEINLINE
GI_INT32_t ShiftLeftNaive(GI_INT32_t src, const size_t shift) {
GI_INT32_t ret;
for (size_t idx = 0; idx < GI_SIMD_LEN_BYTE / sizeof(int32_t); ++idx) {
......@@ -2306,19 +2308,9 @@ GI_UINT8_t GiCvtFromInt32V4ToUint8(
GI_INT32_t Vector0, GI_INT32_t Vector1, GI_INT32_t Vector2,
GI_INT32_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
int16x8_t mid1 = vmaxq_s16(
vdupq_n_s16(0),
vminq_s16(
vcombine_s16(vqmovn_s32(Vector0), vqmovn_s32(Vector1)),
vdupq_n_s16(UINT8_MAX)));
int16x8_t mid2 = vmaxq_s16(
vdupq_n_s16(0),
vminq_s16(
vcombine_s16(vqmovn_s32(Vector2), vqmovn_s32(Vector3)),
vdupq_n_s16(UINT8_MAX)));
return vcombine_u8(
vqmovn_u16(vreinterpretq_u16_s16(mid1)),
vqmovn_u16(vreinterpretq_u16_s16(mid2)));
vqmovun_s16(vcombine_s16(vqmovn_s32(Vector0), vqmovn_s32(Vector1))),
vqmovun_s16(vcombine_s16(vqmovn_s32(Vector2), vqmovn_s32(Vector3))));
#elif defined(GI_SSE2_INTRINSICS)
__m128i vepi16_0 = _mm_packs_epi32(Vector0, Vector1);
__m128i vepi16_1 = _mm_packs_epi32(Vector2, Vector3);
......
......@@ -4954,7 +4954,7 @@ TEST_F(FALLBACK, GiMultiplyAddInt16LongHigh) {
assert_eq((int32_t*)&ret, naive, SIMD_LEN);
}
TEST_F(FALLBACK, GiCvtFromInt32V4ToUint8) {
std::vector<int32_t> s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678};
std::vector<int32_t> s0{INT16_MAX, INT16_MIN, INT32_MAX, INT32_MIN};
GI_INT32_t src0, src1, src2, src3;
force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册