From 42cf7643210bc10df4c4e2868165332466e0b1d7 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Mon, 8 Oct 2018 20:57:53 +0800
Subject: [PATCH] Refine: optimize quantize kernel by using neon

---
 CMakeLists.txt                               |   4 +-
 src/operators/feed_op.h                      |   1 -
 src/operators/kernel/arm/quantize_kernel.cpp | 180 ++++++++++++++++++-
 3 files changed, 175 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1287d98152..7d06636b58 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,8 +7,8 @@ option(DEBUGING "enable debug mode" ON)
 option(USE_EXCEPTION "use std exception" OFF)
 option(LOG_PROFILE "log profile" OFF)
 # select the platform to build
-option(X86 "x86" ON)
-option(CPU "armv7 with neon" OFF)
+option(X86 "x86" OFF)
+option(CPU "armv7 with neon" ON)
 option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
 
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h
index 918c0d96cc..98d80a1226 100644
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -38,7 +38,6 @@ class FeedOp : public framework::OperatorBase {
   }
 
 #ifdef PADDLE_MOBILE_FPGA
-
   void Init() {
     Tensor *output = param_.Out();
     fpga::format_ofm(output);
diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp
index 2b2c0f7e5f..bee438c9a8 100644
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -18,13 +18,83 @@ limitations under the License. */
 #include <cmath>
 #include <limits>
 
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+
+#ifndef __aarch64__
+float32_t vmaxvq_f32(float32x4_t r) {
+  float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
+  return vget_lane_f32(vpmax_f32(v, v), 0);
+}
+#endif
+
+int32x4_t vrnd_towards_zero(float32x4_t r) {
+  return vcvtq_s32_f32(r);
+}
+
+int32x4_t vrnd_away_zero(float32x4_t r) {
+  float32x4_t plus = vdupq_n_f32(0.5);
+  float32x4_t minus = vdupq_n_f32(-0.5);
+  float32x4_t zero = vdupq_n_f32(0);
+  uint32x4_t more_than_zero = vcgtq_f32(r, zero);
+  float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
+  temp = vaddq_f32(r, temp);
+  int32x4_t ret = vcvtq_s32_f32(temp);
+  return ret;
+}
+
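+// Round half to even ("banker's rounding"), lane by lane. armv7 NEON has no
+// such intrinsic, so ties are resolved in scalar code: round() breaks a tie
+// away from zero first, and an odd result is then stepped back toward zero,
+// which lands on the even neighbor.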
+int32x4_t vrnd_to_even(float32x4_t r) {
+  int32x4_t ret;
+  for (int i = 0; i < 4; ++i) {
+    float v = round(r[i]);
+    int32_t q = (int32_t)v;
+    if (fabs(fabs(v - r[i]) - 0.5) > 0) {
+      ret[i] = q;
+    } else {
+      if (abs(q) % 2 == 0) {
+        ret[i] = q;
+      } else {
+        ret[i] = q + ((q > 0) ? -1 : 1);
+      }
+    }
+  }
+  return ret;
+}
+
+#endif
+
 namespace paddle_mobile {
 namespace operators {
 
 static float find_abs_max(const Tensor *input) {
   float max_abs = float(0);
   const float *x = input->data<float>();
-  for (size_t i = 0; i < input->numel(); ++i) {
+  size_t size = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
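+  // Process 16 floats per iteration: vabsq for |x|, vmaxvq (shimmed on
+  // armv7) for the horizontal max; the tail (size & 0xF) falls through to
+  // the scalar loop below.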
+  size_t loop = size >> 4;
+  size_t remain = size & 0xF;
+  for (size_t i = 0; i < loop; ++i) {
+    float32x4_t max;
+    float32x4_t r1 = vld1q_f32(x);
+    float32x4_t r2 = vld1q_f32(x + 4);
+    float32x4_t r3 = vld1q_f32(x + 8);
+    float32x4_t r4 = vld1q_f32(x + 12);
+    r1 = vabsq_f32(r1);
+    r2 = vabsq_f32(r2);
+    r3 = vabsq_f32(r3);
+    r4 = vabsq_f32(r4);
+    max[0] = vmaxvq_f32(r1);
+    max[1] = vmaxvq_f32(r2);
+    max[2] = vmaxvq_f32(r3);
+    max[3] = vmaxvq_f32(r4);
+    max[0] = vmaxvq_f32(max);
+    if (max[0] > max_abs) {
+      max_abs = max[0];
+    }
+    x += 16;
+  }
+  size = remain;
+#endif
+  for (size_t i = 0; i < size; ++i) {
     float value = std::abs(x[i]);
     if (value > max_abs) {
       max_abs = value;
@@ -34,11 +104,43 @@ static float find_abs_max(const Tensor *input) {
     }
   }
   return max_abs;
 }
 
 static void quantize_round_to_even(const Tensor *input,
-    const float scale,
-    Tensor *output) {
+                                   const float scale,
+                                   Tensor *output) {
   const float *x = input->data<float>();
   int8_t *y = output->data<int8_t>();
-  for (size_t i = 0; i < input->numel(); ++i) {
+  size_t size = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  size_t loop = size >> 4;
+  size_t remain = size & 0xF;
+  for (size_t i = 0; i < loop; ++i) {
+    float32x4_t r0 = vld1q_f32(x);
+    float32x4_t r1 = vld1q_f32(x + 4);
+    float32x4_t r2 = vld1q_f32(x + 8);
+    float32x4_t r3 = vld1q_f32(x + 12);
+    r0 = vmulq_n_f32(r0, scale);
+    r1 = vmulq_n_f32(r1, scale);
+    r2 = vmulq_n_f32(r2, scale);
+    r3 = vmulq_n_f32(r3, scale);
+    int32x4_t q0 = vrnd_to_even(r0);
+    int32x4_t q1 = vrnd_to_even(r1);
+    int32x4_t q2 = vrnd_to_even(r2);
+    int32x4_t q3 = vrnd_to_even(r3);
+    int16x4_t d0 = vmovn_s32(q0);
+    int16x4_t d1 = vmovn_s32(q1);
+    int16x4_t d2 = vmovn_s32(q2);
+    int16x4_t d3 = vmovn_s32(q3);
+    int16x8_t q5 = vcombine_s16(d0, d1);
+    int16x8_t q6 = vcombine_s16(d2, d3);
+    int8x8_t d4 = vmovn_s16(q5);
+    int8x8_t d5 = vmovn_s16(q6);
+    vst1_s8(y, d4);
+    vst1_s8(y + 8, d5);
+    x += 16;
+    y += 16;
+  }
+  size = remain;
+#endif
+  for (size_t i = 0; i < size; ++i) {
     float value = x[i] * scale;
     long long quant = llround(value);
     if (abs(abs(round(value) - value) - 0.5) > 0) {
@@ -58,7 +160,39 @@ static void quantize_round_to_zero(const Tensor *input,
                                    Tensor *output) {
   const float *x = input->data<float>();
   int8_t *y = output->data<int8_t>();
-  for (size_t i = 0; i < input->numel(); ++i) {
+  size_t size = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  size_t loop = size >> 4;
+  size_t remain = size & 0xF;
+  for (size_t i = 0; i < loop; ++i) {
+    float32x4_t r0 = vld1q_f32(x);
+    float32x4_t r1 = vld1q_f32(x + 4);
+    float32x4_t r2 = vld1q_f32(x + 8);
+    float32x4_t r3 = vld1q_f32(x + 12);
+    r0 = vmulq_n_f32(r0, scale);
+    r1 = vmulq_n_f32(r1, scale);
+    r2 = vmulq_n_f32(r2, scale);
+    r3 = vmulq_n_f32(r3, scale);
+    int32x4_t q0 = vrnd_towards_zero(r0);
+    int32x4_t q1 = vrnd_towards_zero(r1);
+    int32x4_t q2 = vrnd_towards_zero(r2);
+    int32x4_t q3 = vrnd_towards_zero(r3);
+    int16x4_t d0 = vmovn_s32(q0);
+    int16x4_t d1 = vmovn_s32(q1);
+    int16x4_t d2 = vmovn_s32(q2);
+    int16x4_t d3 = vmovn_s32(q3);
+    int16x8_t q5 = vcombine_s16(d0, d1);
+    int16x8_t q6 = vcombine_s16(d2, d3);
+    int8x8_t d4 = vmovn_s16(q5);
+    int8x8_t d5 = vmovn_s16(q6);
+    vst1_s8(y, d4);
+    vst1_s8(y + 8, d5);
+    x += 16;
+    y += 16;
+  }
+  size = remain;
+#endif
+  for (size_t i = 0; i < size; ++i) {
     y[i] = trunc(x[i] * scale);
   }
 }
@@ -68,8 +202,40 @@ static void quantize_round_to_nearest(const Tensor *input,
                                       Tensor *output) {
   const float *x = input->data<float>();
   int8_t *y = output->data<int8_t>();
-  for (size_t i = 0; i < input->numel(); ++i) {
-    y[i] = round(x[i] * scale);
+  size_t size = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  size_t loop = size >> 4;
+  size_t remain = size & 0xF;
+  for (size_t i = 0; i < loop; ++i) {
+    float32x4_t r0 = vld1q_f32(x);
+    float32x4_t r1 = vld1q_f32(x + 4);
+    float32x4_t r2 = vld1q_f32(x + 8);
+    float32x4_t r3 = vld1q_f32(x + 12);
+    r0 = vmulq_n_f32(r0, scale);
+    r1 = vmulq_n_f32(r1, scale);
+    r2 = vmulq_n_f32(r2, scale);
+    r3 = vmulq_n_f32(r3, scale);
+    int32x4_t q0 = vrnd_away_zero(r0);
+    int32x4_t q1 = vrnd_away_zero(r1);
+    int32x4_t q2 = vrnd_away_zero(r2);
+    int32x4_t q3 = vrnd_away_zero(r3);
+    int16x4_t d0 = vmovn_s32(q0);
+    int16x4_t d1 = vmovn_s32(q1);
+    int16x4_t d2 = vmovn_s32(q2);
+    int16x4_t d3 = vmovn_s32(q3);
+    int16x8_t q5 = vcombine_s16(d0, d1);
+    int16x8_t q6 = vcombine_s16(d2, d3);
+    int8x8_t d4 = vmovn_s16(q5);
+    int8x8_t d5 = vmovn_s16(q6);
+    vst1_s8(y, d4);
+    vst1_s8(y + 8, d5);
+    x += 16;
+    y += 16;
+  }
+  size = remain;
+#endif
+  for (size_t i = 0; i < size; ++i) {
+    y[i] = round(x[i] * scale);
   }
 }
-- 
GitLab
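
For reference, a standalone scalar sketch of the round-half-to-even rule that
vrnd_to_even implements (the function name and test values here are
illustrative, not part of the patch); it can be used to spot-check the NEON
path on a few hand-verified values:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Quantize one float with round-half-to-even, mirroring vrnd_to_even:
    // std::round() breaks ties away from zero, so an odd result on a tie is
    // stepped back toward zero, which is the even neighbor.
    int8_t quantize_even_ref(float x, float scale) {
      float v = x * scale;
      float r = std::round(v);
      if (std::fabs(std::fabs(r - v) - 0.5f) > 0.0f) {
        return static_cast<int8_t>(r);  // not a tie
      }
      int32_t q = static_cast<int32_t>(r);
      if (q % 2 != 0) {
        q += (q > 0) ? -1 : 1;
      }
      return static_cast<int8_t>(q);
    }

    int main() {
      const float samples[] = {0.5f, 1.5f, 2.5f, -0.5f, -1.5f, 1.4999f};
      for (float s : samples) {
        std::printf("%+.4f -> %d\n", s, quantize_even_ref(s, 1.0f));
      }
      return 0;  // expected: 0, 2, 2, 0, -2, 1
    }

Note that, like the NEON path (vmovn_s32/vmovn_s16 narrow without
saturating), this sketch does not clamp to the int8 range.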