From ead611e11d4d12a96a79b63f3b38e1b5b56699d9 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 30 Nov 2021 17:48:18 +0800 Subject: [PATCH] perf(dnn): slightly improve arm neon transcendental function performance GitOrigin-RevId: 210d88f81e23efd104ff32ddb57c06b39d0e3e03 --- dnn/src/arm_common/elemwise/neon_mathfun.cpp | 143 +++++++------------ dnn/src/arm_common/elemwise/neon_mathfun.h | 10 +- 2 files changed, 63 insertions(+), 90 deletions(-) diff --git a/dnn/src/arm_common/elemwise/neon_mathfun.cpp b/dnn/src/arm_common/elemwise/neon_mathfun.cpp index 1d356d546..85324f600 100644 --- a/dnn/src/arm_common/elemwise/neon_mathfun.cpp +++ b/dnn/src/arm_common/elemwise/neon_mathfun.cpp @@ -86,11 +86,11 @@ v4sf log_ps_f32(v4sf x) { e = vaddq_f32(e, one); /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ + * if( x < SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); x = vsubq_f32(x, one); @@ -101,38 +101,26 @@ v4sf log_ps_f32(v4sf x) { v4sf z = vmulq_f32(x, x); v4sf y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p1), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p2), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p3), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p4), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p5), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p6), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p7), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_log_p8), y, x); y = vmulq_f32(y, x); y = vmulq_f32(y, z); - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); + y = fma_ps_f32(y, e, vdupq_n_f32(c_cephes_log_q1)); - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); + y = vmlsq_f32(y, z, vdupq_n_f32(0.5f)); - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); + x = fma_ps_f32(x, e, vdupq_n_f32(c_cephes_log_q2)); x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), - invalid_mask)); // negative arg will be NAN + vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN return x; } @@ -159,7 +147,7 @@ v4sf exp_ps_f32(v4sf x) { x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + fx = fma_ps_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); /* perform a floorf */ tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); @@ -175,34 +163,20 @@ v4sf exp_ps_f32(v4sf x) { x = vsubq_f32(x, tmp); x = vsubq_f32(x, z); - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, - c_cephes_exp_p2, c_cephes_exp_p3, - c_cephes_exp_p4, c_cephes_exp_p5}; - v4sf y = vld1q_dup_f32(cephes_exp_p + 0); - v4sf c1 = vld1q_dup_f32(cephes_exp_p + 1); - v4sf c2 = vld1q_dup_f32(cephes_exp_p + 2); - v4sf c3 = vld1q_dup_f32(cephes_exp_p + 3); - v4sf c4 = vld1q_dup_f32(cephes_exp_p + 4); - v4sf c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); z = vmulq_f32(x, x); - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); + v4sf y = vdupq_n_f32(c_cephes_exp_p0); + y = fma_ps_f32(vdupq_n_f32(c_cephes_exp_p1), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_exp_p2), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_exp_p3), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_exp_p4), y, x); + y = fma_ps_f32(vdupq_n_f32(c_cephes_exp_p5), y, x); + + y = fma_ps_f32(x, y, z); y = vaddq_f32(y, one); /* build 2^n */ - int32x4_t mm; + v4si mm; mm = vcvtq_s32_f32(fx); mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); mm = vshlq_n_s32(mm, 23); @@ -249,8 +223,9 @@ float16x8_t exp_ps_f16(float16x8_t x) { almost no extra price so both sin_ps_f32 and cos_ps_f32 make use of sincos_ps_f32.. */ -void sincos_ps_f32(v4sf x, v4sf* ysin, v4sf* ycos) { // any x - v4sf xmm1, xmm2, xmm3, y; +void sincos_ps_f32(v4sf x, v4sf* ysin, v4sf* ycos) { + // any x + v4sf y; v4su emm2; @@ -269,44 +244,36 @@ void sincos_ps_f32(v4sf x, v4sf* ysin, v4sf* ycos) { // any x y = vcvtq_f32_u32(emm2); /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4