From 0fbcd4ea5b527b9c2ed1073ea853376dd1e5116c Mon Sep 17 00:00:00 2001
From: hong19860320 <9973393+hong19860320@users.noreply.github.com>
Date: Sat, 15 Jun 2019 15:23:26 +0000
Subject: [PATCH] add arm kernel and unit test for relu op test=develop

---
 paddle/fluid/lite/arm/math/CMakeLists.txt     |   1 +
 paddle/fluid/lite/arm/math/activation.cc      | 520 ++++++++++++++++++
 paddle/fluid/lite/arm/math/activation.h       |  50 ++
 paddle/fluid/lite/kernels/arm/CMakeLists.txt  |   5 +-
 .../{relu_compute.h => activation_compute.cc} |  30 +-
 .../{relu_compute.cc => activation_compute.h} |  24 +-
 .../kernels/arm/activation_compute_test.cc    | 100 ++++
 7 files changed, 708 insertions(+), 22 deletions(-)
 create mode 100644 paddle/fluid/lite/arm/math/activation.cc
 create mode 100644 paddle/fluid/lite/arm/math/activation.h
 rename paddle/fluid/lite/kernels/arm/{relu_compute.h => activation_compute.cc} (61%)
 rename paddle/fluid/lite/kernels/arm/{relu_compute.cc => activation_compute.h} (56%)
 create mode 100644 paddle/fluid/lite/kernels/arm/activation_compute_test.cc

diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
index 17d1b7d9b2a..0c0295e3211 100644
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -32,6 +32,7 @@ cc_library(math_arm SRCS
             conv_winograd_3x3.cc
             conv_winograd.cc
             split.cc
+            activation.cc
             DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
 # TODO(TJ): fix me do not deps proto
 
diff --git a/paddle/fluid/lite/arm/math/activation.cc b/paddle/fluid/lite/arm/math/activation.cc
new file mode 100644
index 00000000000..b70767f848a
--- /dev/null
+++ b/paddle/fluid/lite/arm/math/activation.cc
@@ -0,0 +1,520 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
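+//
+// NEON implementations of float activations: relu, relu with a negative
+// slope, clipped relu, prelu, sigmoid, tanh and swish. Except for prelu
+// (which parallelizes over channels), each kernel splits `size` elements
+// evenly across `threads` OpenMP workers, runs a vectorized body on each
+// chunk, and finishes the leftover elements with plain scalar code.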
+
+#include "paddle/fluid/lite/arm/math/activation.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// relu: out = max(x, 0)
+template <>
+void act_relu<float>(const float* din, float* dout, int size, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt = nums_per_thread >> 4;
+  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    int cnt = neon_loop_cnt;
+#ifdef __aarch64__
+    for (int num = 0; num < neon_loop_cnt; ++num) {
+      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      vr0 = vmaxq_f32(vr0, vzero);
+      vr1 = vmaxq_f32(vr1, vzero);
+      vr2 = vmaxq_f32(vr2, vzero);
+      vr3 = vmaxq_f32(vr3, vzero);
+      vst1q_f32(ptr_out_thread, vr0);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vr1);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vr2);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vr3);
+      ptr_out_thread += 4;
+    }
+#else
+    if (cnt > 0) {
+      asm volatile(
+          "1:                              @ loop header\n"
+          "vld1.32 {d0-d3}, [%[din]]!      @ load din 0\n"
+          "vld1.32 {d4-d7}, [%[din]]!      @ load din 1\n"
+
+          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
+          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
+          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
+          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
+
+          "vst1.32 {d16-d19}, [%[dout]]!   @ store result, add pointer\n"
+          "vst1.32 {d20-d23}, [%[dout]]!   @ store result, add pointer\n"
+
+          "subs %[cnt], #1                 @ loop count minus 1\n"
+          "bne 1b                          @ jump to main loop start\n"
+          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
+            [cnt] "+r"(cnt)
+          : [vzero] "w"(vzero)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+    }
+#endif
+    // scalar tail of this thread's chunk
+    for (int j = 0; j < neon_loop_remain; ++j) {
+      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  // elements that did not divide evenly across threads
+  float* out_ptr_remain = dout + threads * nums_per_thread;
+  const float* in_ptr_remain = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
+    in_ptr_remain++;
+    out_ptr_remain++;
+  }
+}
+
+// relu with negative slope (leaky relu): out = x > 0 ? x : negative_slope * x
+template <>
+void act_relu_neg<float>(const float* din, float* dout, int size,
+                         const float negative_slope, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt = nums_per_thread >> 4;
+  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t valpha = vdupq_n_f32(negative_slope);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    int cnt = neon_loop_cnt;
+#ifdef __aarch64__
+    for (int num = 0; num < neon_loop_cnt; ++num) {
+      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+
+      uint32x4_t vm0 = vcgeq_f32(vr0, vzero);  // vr0 >= 0
+      uint32x4_t vm1 = vcgeq_f32(vr1, vzero);  // vr1 >= 0
+      uint32x4_t vm2 = vcgeq_f32(vr2, vzero);  // vr2 >= 0
+      uint32x4_t vm3 = vcgeq_f32(vr3, vzero);  // vr3 >= 0
+
+      float32x4_t vn0 = vmulq_f32(vr0, valpha);  // negative_slope * vr0
+      float32x4_t vn1 = vmulq_f32(vr1, valpha);  // negative_slope * vr1
+      float32x4_t vn2 = vmulq_f32(vr2, valpha);  // negative_slope * vr2
+      float32x4_t vn3 = vmulq_f32(vr3, valpha);  // negative_slope * vr3
+
+      float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0);  // select x where x >= 0
+      float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1);
+      float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2);
+      float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3);
+
+      vst1q_f32(ptr_out_thread, vo0);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo1);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo2);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo3);
+      ptr_out_thread += 4;
+    }
+#else
+    if (cnt > 0) {
+      asm volatile(
+          "1:                              @ loop header\n"
+          "vld1.32 {d0-d3}, [%[din]]!      @ load din 0\n"
+          "vld1.32 {d4-d7}, [%[din]]!      @ load din 1\n"
+
+          "vcge.f32 q8, q0, %q[vzero]      @ get mask\n"
+          "vcge.f32 q9, q1, %q[vzero]      @ get mask\n"
+          "vcge.f32 q10, q2, %q[vzero]     @ get mask\n"
+          "vcge.f32 q11, q3, %q[vzero]     @ get mask\n"
+
+          "vmul.f32 q4, q0, %q[valpha]     @ get neg data\n"
+          "vmul.f32 q5, q1, %q[valpha]     @ get neg data\n"
+          "vmul.f32 q6, q2, %q[valpha]     @ get neg data\n"
+          "vmul.f32 q7, q3, %q[valpha]     @ get neg data\n"
+
+          "vbit q4, q0, q8                 @ insert q0 into q4 where q8 is 1\n"
+          "vbit q5, q1, q9                 @ insert q1 into q5 where q9 is 1\n"
+          "vbit q6, q2, q10                @ insert q2 into q6 where q10 is 1\n"
+          "vbit q7, q3, q11                @ insert q3 into q7 where q11 is 1\n"
+
+          "vst1.32 {d8-d11}, [%[dout]]!    @ store result, add pointer\n"
+          "vst1.32 {d12-d15}, [%[dout]]!   @ store result, add pointer\n"
+
+          "subs %[cnt], #1                 @ loop count minus 1\n"
+          "bne 1b                          @ jump to main loop start\n"
+          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
+            [cnt] "+r"(cnt)
+          : [vzero] "w"(vzero), [valpha] "w"(valpha)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+            "q8", "q9", "q10", "q11");
+    }
+#endif
+    for (int j = 0; j < neon_loop_remain; ++j) {
+      ptr_out_thread[0] = ptr_in_thread[0] > 0.f
+                              ? ptr_in_thread[0]
+                              : ptr_in_thread[0] * negative_slope;
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  float* out_ptr_remain = dout + threads * nums_per_thread;
+  const float* in_ptr_remain = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    out_ptr_remain[0] = in_ptr_remain[0] > 0.f
+                            ? in_ptr_remain[0]
+                            : in_ptr_remain[0] * negative_slope;
+    in_ptr_remain++;
+    out_ptr_remain++;
+  }
+}
+
+// clipped relu: out = min(max(x, 0), coef)
+template <>
+void act_clipped_relu<float>(const float* din, float* dout, int size,
+                             const float coef, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt = nums_per_thread >> 4;
+  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t vclip = vdupq_n_f32(coef);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    int cnt = neon_loop_cnt;
+#ifdef __aarch64__
+    for (int num = 0; num < neon_loop_cnt; ++num) {
+      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
+      ptr_in_thread += 4;
+      float32x4_t vt0 = vmaxq_f32(vr0, vzero);
+      float32x4_t vt1 = vmaxq_f32(vr1, vzero);
+      float32x4_t vt2 = vmaxq_f32(vr2, vzero);
+      float32x4_t vt3 = vmaxq_f32(vr3, vzero);
+
+      float32x4_t vo0 = vminq_f32(vt0, vclip);
+      float32x4_t vo1 = vminq_f32(vt1, vclip);
+      float32x4_t vo2 = vminq_f32(vt2, vclip);
+      float32x4_t vo3 = vminq_f32(vt3, vclip);
+
+      vst1q_f32(ptr_out_thread, vo0);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo1);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo2);
+      ptr_out_thread += 4;
+      vst1q_f32(ptr_out_thread, vo3);
+      ptr_out_thread += 4;
+    }
+#else
+    if (cnt > 0) {
+      asm volatile(
+          "1:                              @ loop header\n"
+          "vld1.32 {d0-d3}, [%[din]]!      @ load din 0\n"
+          "vld1.32 {d4-d7}, [%[din]]!      @ load din 1\n"
+
+          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
+          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
+          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
+          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
+
+          "vmin.f32 q4, q8, %q[vclip]      @ clip relu\n"
+          "vmin.f32 q5, q9, %q[vclip]      @ clip relu\n"
+          "vmin.f32 q6, q10, %q[vclip]     @ clip relu\n"
+          "vmin.f32 q7, q11, %q[vclip]     @ clip relu\n"
+
+          "vst1.32 {d8-d11}, [%[dout]]!    @ store result, add pointer\n"
+          "vst1.32 {d12-d15}, [%[dout]]!   @ store result, add pointer\n"
+
+          "subs %[cnt], #1                 @ loop count minus 1\n"
+          "bne 1b                          @ jump to main loop start\n"
+          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
+            [cnt] "+r"(cnt)
+          : [vzero] "w"(vzero), [vclip] "w"(vclip)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+            "q8", "q9", "q10", "q11");
+    }
+#endif
+    for (int j = 0; j < neon_loop_remain; ++j) {
+      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
+      ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef;
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  float* out_ptr_remain = dout + threads * nums_per_thread;
+  const float* in_ptr_remain = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
+    out_ptr_remain[0] = out_ptr_remain[0] < coef ? out_ptr_remain[0] : coef;
+    in_ptr_remain++;
+    out_ptr_remain++;
+  }
+}
+
+// prelu: out = x > 0 ? x : slope * x, with a per-channel (or shared) slope
+template <>
+void act_prelu<float>(const float* din, float* dout, int outer_size,
+                      int channel_size, int inner_size, bool channel_shared,
+                      float* channel_slope, int threads) {
+  int stride_size = inner_size * channel_size;
+  int cnt = inner_size >> 4;
+  int remain = inner_size & 15;
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  for (int n = 0; n < outer_size; n++) {
+    const float* data_in_batch = din + n * stride_size;
+    float* data_out_batch = dout + n * stride_size;
+#pragma omp parallel for
+    for (int c = 0; c < channel_size; c++) {
+      const float* data_in_c = data_in_batch + c * inner_size;
+      float* data_out_c = data_out_batch + c * inner_size;
+
+      float slope = channel_shared ? channel_slope[0] : channel_slope[c];
+      float32x4_t vslope = vdupq_n_f32(slope);
+#ifdef __aarch64__
+      for (int i = 0; i < cnt; ++i) {
+        float32x4_t vr0 = vld1q_f32(data_in_c);
+        float32x4_t vr1 = vld1q_f32(data_in_c + 4);
+        float32x4_t vr2 = vld1q_f32(data_in_c + 8);
+        float32x4_t vr3 = vld1q_f32(data_in_c + 12);
+        uint32x4_t vm0 = vcltq_f32(vr0, vzero);    // vr0 < 0
+        uint32x4_t vm1 = vcltq_f32(vr1, vzero);    // vr1 < 0
+        uint32x4_t vm2 = vcltq_f32(vr2, vzero);    // vr2 < 0
+        uint32x4_t vm3 = vcltq_f32(vr3, vzero);    // vr3 < 0
+        float32x4_t vo0 = vmulq_f32(vr0, vslope);  // vr0 * vslope
+        float32x4_t vo1 = vmulq_f32(vr1, vslope);  // vr1 * vslope
+        float32x4_t vo2 = vmulq_f32(vr2, vslope);  // vr2 * vslope
+        float32x4_t vo3 = vmulq_f32(vr3, vslope);  // vr3 * vslope
+        float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0);  // x < 0 ? slope*x : x
+        float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1);
+        float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2);
+        float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3);
+        vst1q_f32(data_out_c, vos0);
+        vst1q_f32(data_out_c + 4, vos1);
+        vst1q_f32(data_out_c + 8, vos2);
+        vst1q_f32(data_out_c + 12, vos3);
+        data_in_c += 16;
+        data_out_c += 16;
+      }
+#else
+      int cnt_loop = cnt;
+      if (cnt_loop > 0) {
+        asm volatile(
+            "vld1.32 {d0-d3}, [%[ptr_in]]!   @ load input to q0, q1\n"
+            "pld [%[ptr_in]]                 @ preload\n"
+            "pld [%[ptr_in], #64]            @ preload\n"
+            "pld [%[ptr_in], #128]           @ preload\n"
+            "pld [%[ptr_in], #192]           @ preload\n"
+            "1:                              @ main loop\n"
+            "vld1.32 {d4-d7}, [%[ptr_in]]!   @ load input to q2, q3\n"
+            "vclt.f32 q8, q0, %q[vzero]      @ q0 < vzero\n"
+            "vclt.f32 q9, q1, %q[vzero]      @ q1 < vzero\n"
+            "vmul.f32 q10, q0, %q[vslope]    @ q0 * vslope\n"
+            "vmul.f32 q11, q1, %q[vslope]    @ q1 * vslope\n"
+
+            "vclt.f32 q12, q2, %q[vzero]     @ q2 < vzero\n"
+            "vclt.f32 q13, q3, %q[vzero]     @ q3 < vzero\n"
+            "vmul.f32 q14, q2, %q[vslope]    @ q2 * vslope\n"
+            "vmul.f32 q15, q3, %q[vslope]    @ q3 * vslope\n"
+
+            "vbif.32 q10, q0, q8             @ take q0 where q8 is 0\n"
+            "vbif.32 q11, q1, q9             @ take q1 where q9 is 0\n"
+            "vbif.32 q14, q2, q12            @ take q2 where q12 is 0\n"
+            "vbif.32 q15, q3, q13            @ take q3 where q13 is 0\n"
+
+            "subs %[cnt], #1                 @ loop count minus 1\n"
+            "vld1.32 {d0-d3}, [%[ptr_in]]!   @ load input to q0, q1\n"
+
+            "vst1.f32 {d20-d23}, [%[dout]]!  @ store data\n"
+            "vst1.f32 {d28-d31}, [%[dout]]!  @ store data\n"
+            "bne 1b                          @ jump to main loop start\n"
+            "sub %[ptr_in], #32              @ rewind the extra preload\n"
+            : [ptr_in] "+r"(data_in_c), [cnt] "+r"(cnt_loop),
+              [dout] "+r"(data_out_c)
+            : [vzero] "w"(vzero), [vslope] "w"(vslope)
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10",
+              "q11", "q12", "q13", "q14", "q15");
+      }
+#endif  // __aarch64__
+      for (int i = remain; i > 0; i--) {
+        *(data_out_c++) =
+            data_in_c[0] > 0.f ? data_in_c[0] : data_in_c[0] * slope;
+        data_in_c++;
+      }
+    }
+  }
+}
+
+// sigmoid: 1 / (1 + exp(-x))
+template <>
+void act_sigmoid<float>(const float* din, float* dout, int size, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
+  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    float32x4_t exp_vec = vdupq_n_f32(0.0f);
+    float32x4_t recip = vdupq_n_f32(0.0f);
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
+      exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));  // exp(-x)
+      exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f));        // 1 + exp(-x)
+      // two Newton-Raphson steps refine the reciprocal estimate
+      recip = vrecpeq_f32(exp_vec);
+      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
+      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
+      vst1q_f32(ptr_out_thread, recip);
+      ptr_out_thread += 4;
+      ptr_in_thread += 4;
+    }
+    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
+      ptr_out_thread[0] = 1.f / (1.f + expf(-ptr_in_thread[0]));
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  float* ptr_out = dout + threads * nums_per_thread;
+  const float* ptr_in = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    ptr_out[0] = 1.f / (1.f + expf(-ptr_in[0]));
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
+// tanh: (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <>
+void act_tanh<float>(const float* din, float* dout, int size, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
+  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    float32x4_t exp_plus_vec = vdupq_n_f32(0.0f);
+    float32x4_t exp_minus_vec = vdupq_n_f32(0.0f);
+    float32x4_t exp_sum_vec = vdupq_n_f32(0.0f);
+    float32x4_t exp_diff_vec = vdupq_n_f32(0.0f);
+    float32x4_t recip = vdupq_n_f32(0.0f);
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
+      exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread));             // exp(x)
+      exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); // exp(-x)
+      exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec);
+      exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec);
+      recip = div_ps(exp_diff_vec, exp_sum_vec);
+      vst1q_f32(ptr_out_thread, recip);
+      ptr_out_thread += 4;
+      ptr_in_thread += 4;
+    }
+    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
+      ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) /
+                          (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0]));
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  float* ptr_out = dout + threads * nums_per_thread;
+  const float* ptr_in = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) /
+                 (expf(ptr_in[0]) + expf(-ptr_in[0]));
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
+// swish: x / (1 + exp(-beta * x))
+template <>
+void act_swish<float>(const float* din, float* dout, int size,
+                      const float coef, int threads) {
+  int nums_per_thread = size / threads;
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
+  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
+  const float beta = coef;
+  float32x4_t vbeta = vdupq_n_f32(beta);
+  float32x4_t vone = vdupq_n_f32(1.f);
+#pragma omp parallel for
+  for (int i = 0; i < threads; ++i) {
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
+      float32x4_t va = vld1q_f32(ptr_in_thread);             // x
+      float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread));  // -x
+      float32x4_t vsum = vmulq_f32(vb, vbeta);               // -beta * x
+      vsum = exp_ps(vsum);                                   // exp(-beta * x)
+      float32x4_t vc = vaddq_f32(vone, vsum);                // 1 + exp(-beta * x)
+      float32x4_t vrst = div_ps(va, vc);
+      vst1q_f32(ptr_out_thread, vrst);
+      ptr_out_thread += 4;
+      ptr_in_thread += 4;
+    }
+    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
+      ptr_out_thread[0] =
+          ptr_in_thread[0] / (1.f + expf(-ptr_in_thread[0] * beta));
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  float* ptr_out = dout + threads * nums_per_thread;
+  const float* ptr_in = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {
+    ptr_out[0] = ptr_in[0] / (1.f + expf(-ptr_in[0] * beta));
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/activation.h b/paddle/fluid/lite/arm/math/activation.h
new file mode 100644
index 00000000000..7dfe2141d4d
--- /dev/null
+++ b/paddle/fluid/lite/arm/math/activation.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
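+//
+// Declarations of the activation math kernels. Only the float
+// specializations are defined in activation.cc, so instantiating these
+// templates with any other type will fail at link time.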
+
+#pragma once
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void act_relu(const T* din, T* dout, int size, int threads);
+
+template <typename T>
+void act_relu_neg(const T* din, T* dout, int size, const float negative_slope,
+                  int threads);
+
+template <typename T>
+void act_clipped_relu(const T* din, T* dout, int size, const float coef,
+                      int threads);
+
+template <typename T>
+void act_prelu(const T* din, T* dout, int outer_size, int channel_size,
+               int inner_size, bool channel_shared, float* channel_slope,
+               int threads);
+
+template <typename T>
+void act_sigmoid(const T* din, T* dout, int size, int threads);
+
+template <typename T>
+void act_tanh(const T* din, T* dout, int size, int threads);
+
+template <typename T>
+void act_swish(const T* din, T* dout, int size, const float coef, int threads);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
index 6e4d73ecc6f..c10dadabb24 100644
--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 message(STATUS "compile with lite ARM kernels")
 
 cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
+cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_a
 cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
+lite_cc_test(test_activation_compute_arm SRCS activation_compute_test.cc DEPS activation_compute_arm)
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
@@ -27,7 +28,7 @@ lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_comput
 
 set(arm_kernels
         fc_compute_arm
-        relu_compute_arm
+        activation_compute_arm
         mul_compute_arm
         scale_compute_arm
         softmax_compute_arm
diff --git a/paddle/fluid/lite/kernels/arm/relu_compute.h b/paddle/fluid/lite/kernels/arm/activation_compute.cc
similarity index 61%
rename from paddle/fluid/lite/kernels/arm/relu_compute.h
rename to paddle/fluid/lite/kernels/arm/activation_compute.cc
index def3f02c504..79961f47417 100644
--- a/paddle/fluid/lite/kernels/arm/relu_compute.h
+++ b/paddle/fluid/lite/kernels/arm/activation_compute.cc
@@ -12,31 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
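+// ReluCompute::Run forwards to the NEON act_relu routine in math_arm,
+// sizing the parallel split by the thread count of the bound ARM context.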
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace arm {
 
-class ReluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  void Run() override {
-    auto& param = Param<operators::ReluParam>();
-    auto n = param.input->dims().production();
-    const float* input = param.input->data<float>();
-    float* output = param.output->mutable_data<float>();
-    for (int i = 0; i < n; i++) {
-      output[i] = std::max(0.f, input[i]);
-    }
-  }
-
-  TargetType target() const override { return TARGET(kARM); }
-  PrecisionType precision() const override { return PRECISION(kFloat); }
-};
+void ReluCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto x_dims = param.X->dims();
+  auto x_data = param.X->data<float>();
+  auto output_data = param.Out->mutable_data<float>();
+  lite::arm::math::act_relu<float>(x_data, output_data, x_dims.production(),
+                                   ctx.threads());
+}
 
 }  // namespace arm
 }  // namespace kernels
diff --git a/paddle/fluid/lite/kernels/arm/relu_compute.cc b/paddle/fluid/lite/kernels/arm/activation_compute.h
similarity index 56%
rename from paddle/fluid/lite/kernels/arm/relu_compute.cc
rename to paddle/fluid/lite/kernels/arm/activation_compute.h
index 6e27e8ec669..04e7127b598 100644
--- a/paddle/fluid/lite/kernels/arm/relu_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/activation_compute.h
@@ -12,4 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/kernels/arm/relu_compute.h"
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+// Relu kernel for the ARM backend; the computation itself lives in
+// lite::arm::math::act_relu (see activation_compute.cc).
+class ReluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  void Run() override;
+
+  virtual ~ReluCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/activation_compute_test.cc b/paddle/fluid/lite/kernels/arm/activation_compute_test.cc
new file mode 100644
index 00000000000..8761fca5bad
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/activation_compute_test.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
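+//
+// Unit test for the ARM relu kernel: retrieves the registered kernel, then
+// runs ReluCompute over a small grid of NCHW shapes and checks every element
+// against a scalar max(0, x) reference.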
+
+#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+template <typename dtype>
+void activation_compute_ref(const operators::ActivationParam& param) {
+  auto x_data = param.X->data<dtype>();
+  auto output_data = param.Out->mutable_data<dtype>();
+  DDim x_dims = param.X->dims();
+  DDim output_dims = param.Out->dims();
+  ASSERT_EQ(x_dims.data(), output_dims.data());
+  // scalar reference: relu(x) = max(0, x)
+  for (int i = 0; i < output_dims.production(); i++) {
+    output_data[i] = std::max(0.f, x_data[i]);
+  }
+}
+
+TEST(activation_arm, retrieve_op) {
+  auto activation =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("relu");
+  ASSERT_FALSE(activation.empty());
+  ASSERT_TRUE(activation.front());
+}
+
+TEST(activation_arm, init) {
+  ReluCompute activation;
+  ASSERT_EQ(activation.precision(), PRECISION(kFloat));
+  ASSERT_EQ(activation.target(), TARGET(kARM));
+}
+
+TEST(activation_arm, compute) {
+  DeviceInfo::Init();
+  for (auto n : {1, 2}) {
+    for (auto c : {6, 32 /*, 128*/}) {
+      for (auto h : {9, 18 /*, 56, 112, 224, 512*/}) {
+        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
+          Tensor x;
+          Tensor output;
+          Tensor output_ref;
+          // set the dims of the input, output and reference output tensors
+          x.Resize({n, c, h, w});
+          output.Resize({n, c, h, w});
+          output_ref.Resize({n, c, h, w});
+          // initialize the data of the input tensor
+          auto* x_data = x.mutable_data<float>();
+          auto* output_data = output.mutable_data<float>();
+          for (int i = 0; i < x.dims().production(); i++) {
+            float sign = i % 3 == 0 ? -1.0f : 1.0f;
+            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
+          }
+          // prepare kernel params and run
+          ReluCompute activation;
+          std::unique_ptr<KernelContext> ctx(new KernelContext);
+          ctx->As<ARMContext>();
+          activation.SetContext(std::move(ctx));
+          operators::ActivationParam param;
+          param.X = &x;
+          param.Out = &output;
+          activation.SetParam(param);
+          activation.Launch();
+          // invoke the reference implementation and compare results
+          param.Out = &output_ref;
+          activation_compute_ref<float>(param);
+          auto* output_ref_data = output_ref.mutable_data<float>();
+          for (int i = 0; i < output.dims().production(); i++) {
+            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
-- 
GitLab