diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index b8e6bf873d5987f9feed567fc5bd48d20588a44f..7c26f158ba90d44f247d8835c5269f98e5740e2a 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -44,6 +44,7 @@ int32x4_t vrnd_away_zero(float32x4_t r) { } int32x4_t vrnd_to_even(float32x4_t r) { +#if 0 int32x4_t ret; float value[4]; vst1q_f32(value, r); @@ -61,6 +62,30 @@ int32x4_t vrnd_to_even(float32x4_t r) { } } return ret; +#else + float32x4_t point5 = vdupq_n_f32(0.5); + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + + int32x4_t rnd = vrnd_away_zero(r); + float32x4_t frnd = vcvtq_f32_s32(rnd); + frnd = vsubq_f32(frnd, r); + frnd = vabsq_f32(frnd); + uint32x4_t equal_point5 = vceqq_f32(frnd, point5); + int32x4_t abs_rnd = vabsq_s32(rnd); + abs_rnd = vandq_s32(abs_rnd, one); + uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd); + uint32x4_t mask = vandq_u32(equal_point5, not_mod2); + uint32x4_t more_than_zero = vcgtq_s32(rnd, zero); + more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one)); + mask = veorq_u32(more_than_zero, mask); + more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one)); + mask = vaddq_u32(more_than_zero, mask); + int32x4_t smask = vreinterpretq_s32_u32(mask); + smask = vsubq_s32(smask, one); + rnd = vaddq_s32(rnd, smask); + return rnd; + #endif } #endif diff --git a/src/operators/math/math_func_neon.h b/src/operators/math/math_func_neon.h index 5bb3fd0f5ae3f6349ab52535348f6310e4096951..97e1e6f67d57ec1ad9ea294aa227f5f781e2e273 100644 --- a/src/operators/math/math_func_neon.h +++ b/src/operators/math/math_func_neon.h @@ -38,6 +38,7 @@ limitations under the License. */ * * (this is the zlib license) */ + #pragma once #include