// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/arm/math/activation.h"
#include "paddle/fluid/lite/arm/math/funcs.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

// relu: max(x, 0). The input is split evenly across `threads` OpenMP
// workers; each worker processes 16 floats per NEON iteration.
template <>
void act_relu(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      vr0 = vmaxq_f32(vr0, vzero);
      vr1 = vmaxq_f32(vr1, vzero);
      vr2 = vmaxq_f32(vr2, vzero);
      vr3 = vmaxq_f32(vr3, vzero);
      vst1q_f32(ptr_out_thread, vr0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                                @ loop header\n"
          "vld1.32 {d0-d3}, [%[din]]!        @ load din 0\n"
          "vld1.32 {d4-d7}, [%[din]]!        @ load din 1\n"
          "vmax.f32 q8, q0, %q[vzero]        @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]        @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]       @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]       @ relu\n"
          "vst1.32 {d16-d19}, [%[dout]]!     @ store result, add pointer\n"
          "vst1.32 {d20-d23}, [%[dout]]!     @ store result, add pointer\n"
          "subs %[cnt], #1                   @ loop count minus 1\n"
          "bne 1b                            @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
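// For reference: the whole kernel above is element-wise equivalent to the
// scalar loop below (a minimal sketch for validating the NEON and assembly
// paths; `act_relu_ref` is a hypothetical helper, not part of this file):
//
//   static void act_relu_ref(const float* din, float* dout, int size) {
//     for (int i = 0; i < size; ++i) {
//       dout[i] = din[i] > 0.f ? din[i] : 0.f;  // max(x, 0)
//     }
//   }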
// leaky relu: x >= 0 ? x : x * negative_slope.
template <>
void act_relu_neg(const float* din, float* dout, int size,
                  const float negative_slope, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t valpha = vdupq_n_f32(negative_slope);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      uint32x4_t vm0 = vcgeq_f32(vr0, vzero);      // mask: vr0 >= 0
      uint32x4_t vm1 = vcgeq_f32(vr1, vzero);      // mask: vr1 >= 0
      uint32x4_t vm2 = vcgeq_f32(vr2, vzero);      // mask: vr2 >= 0
      uint32x4_t vm3 = vcgeq_f32(vr3, vzero);      // mask: vr3 >= 0
      float32x4_t vn0 = vmulq_f32(vr0, valpha);    // negative branch
      float32x4_t vn1 = vmulq_f32(vr1, valpha);
      float32x4_t vn2 = vmulq_f32(vr2, valpha);
      float32x4_t vn3 = vmulq_f32(vr3, valpha);
      float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0);  // select by mask
      float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1);
      float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2);
      float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                                @ loop header\n"
          "vld1.32 {d0-d3}, [%[din]]!        @ load din 0\n"
          "vld1.32 {d4-d7}, [%[din]]!        @ load din 1\n"
          "vcge.f32 q8, q0, %q[vzero]        @ get mask\n"
          "vcge.f32 q9, q1, %q[vzero]        @ get mask\n"
          "vcge.f32 q10, q2, %q[vzero]       @ get mask\n"
          "vcge.f32 q11, q3, %q[vzero]       @ get mask\n"
          "vmul.f32 q4, q0, %q[valpha]       @ get neg data\n"
          "vmul.f32 q5, q1, %q[valpha]       @ get neg data\n"
          "vmul.f32 q6, q2, %q[valpha]       @ get neg data\n"
          "vmul.f32 q7, q3, %q[valpha]       @ get neg data\n"
          "vbit q4, q0, q8                   @ insert q0 to q4 where q8 set\n"
          "vbit q5, q1, q9                   @ insert q1 to q5 where q9 set\n"
          "vbit q6, q2, q10                  @ insert q2 to q6 where q10 set\n"
          "vbit q7, q3, q11                  @ insert q3 to q7 where q11 set\n"
          "vst1.32 {d8-d11}, [%[dout]]!      @ store result, add pointer\n"
          "vst1.32 {d12-d15}, [%[dout]]!     @ store result, add pointer\n"
          "subs %[cnt], #1                   @ loop count minus 1\n"
          "bne 1b                            @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [valpha] "w"(valpha)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f
                              ? ptr_in_thread[0]
                              : ptr_in_thread[0] * negative_slope;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f
                            ? in_ptr_remain[0]
                            : in_ptr_remain[0] * negative_slope;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
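// The vcge/vmul/vbsl sequence above (vbit in the armv7 asm) is a branch-free
// select; per element it evaluates
//
//   out = (x >= 0.f) ? x : x * negative_slope;
//
// The scalar tails compare with `> 0.f` instead of `>= 0.f`, which is
// equivalent here: at x == 0 both branches yield 0, since 0 * slope == 0.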
// clipped relu: min(max(x, 0), coef).
template <>
void act_clipped_relu(const float* din, float* dout, int size,
                      const float coef, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t vclip = vdupq_n_f32(coef);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vt0 = vmaxq_f32(vr0, vzero);  // relu
      float32x4_t vt1 = vmaxq_f32(vr1, vzero);
      float32x4_t vt2 = vmaxq_f32(vr2, vzero);
      float32x4_t vt3 = vmaxq_f32(vr3, vzero);
      float32x4_t vo0 = vminq_f32(vt0, vclip);  // clip
      float32x4_t vo1 = vminq_f32(vt1, vclip);
      float32x4_t vo2 = vminq_f32(vt2, vclip);
      float32x4_t vo3 = vminq_f32(vt3, vclip);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                                @ loop header\n"
          "vld1.32 {d0-d3}, [%[din]]!        @ load din 0\n"
          "vld1.32 {d4-d7}, [%[din]]!        @ load din 1\n"
          "vmax.f32 q8, q0, %q[vzero]        @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]        @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]       @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]       @ relu\n"
          "vmin.f32 q4, q8, %q[vclip]        @ clip relu\n"
          "vmin.f32 q5, q9, %q[vclip]        @ clip relu\n"
          "vmin.f32 q6, q10, %q[vclip]       @ clip relu\n"
          "vmin.f32 q7, q11, %q[vclip]       @ clip relu\n"
          "vst1.32 {d8-d11}, [%[dout]]!      @ store result, add pointer\n"
          "vst1.32 {d12-d15}, [%[dout]]!     @ store result, add pointer\n"
          "subs %[cnt], #1                   @ loop count minus 1\n"
          "bne 1b                            @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [vclip] "w"(vclip)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    out_ptr_remain[0] = out_ptr_remain[0] < coef ? out_ptr_remain[0] : coef;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
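// Per element, the kernel above computes
//
//   out = std::min(std::max(x, 0.f), coef);
//
// so with coef == 6.f it is the common relu6 activation.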
// prelu: x > 0 ? x : x * slope, with one slope per channel (or a single
// shared slope when channel_shared is true). Data layout is
// (outer_size, channel_size, inner_size).
template <>
void act_prelu(const float* din, float* dout, int outer_size, int channel_size,
               int inner_size, bool channel_shared, float* channel_slope,
               int threads) {
  int stride_size = inner_size * channel_size;
  int cnt = inner_size >> 4;
  int remain = inner_size & 15;
  float32x4_t vzero = vdupq_n_f32(0.f);
  for (int n = 0; n < outer_size; n++) {
    const float* data_in_batch = din + n * stride_size;
    float* data_out_batch = dout + n * stride_size;
#pragma omp parallel for
    for (int c = 0; c < channel_size; c++) {
      const float* data_in_c = data_in_batch + c * inner_size;
      float* data_out_c = data_out_batch + c * inner_size;
      float slope = channel_shared ? channel_slope[0] : channel_slope[c];
      float32x4_t vslope = vdupq_n_f32(slope);
#ifdef __aarch64__
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vr0 = vld1q_f32(data_in_c);
        float32x4_t vr1 = vld1q_f32(data_in_c + 4);
        float32x4_t vr2 = vld1q_f32(data_in_c + 8);
        float32x4_t vr3 = vld1q_f32(data_in_c + 12);
        uint32x4_t vm0 = vcltq_f32(vr0, vzero);       // mask: vr0 < 0
        uint32x4_t vm1 = vcltq_f32(vr1, vzero);       // mask: vr1 < 0
        uint32x4_t vm2 = vcltq_f32(vr2, vzero);       // mask: vr2 < 0
        uint32x4_t vm3 = vcltq_f32(vr3, vzero);       // mask: vr3 < 0
        float32x4_t vo0 = vmulq_f32(vr0, vslope);     // vr0 * vslope
        float32x4_t vo1 = vmulq_f32(vr1, vslope);     // vr1 * vslope
        float32x4_t vo2 = vmulq_f32(vr2, vslope);     // vr2 * vslope
        float32x4_t vo3 = vmulq_f32(vr3, vslope);     // vr3 * vslope
        float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0);  // negative ? scaled : x
        float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1);
        float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2);
        float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3);
        vst1q_f32(data_out_c, vos0);
        vst1q_f32(data_out_c + 4, vos1);
        vst1q_f32(data_out_c + 8, vos2);
        vst1q_f32(data_out_c + 12, vos3);
        data_in_c += 16;
        data_out_c += 16;
      }
#else
      int cnt_loop = cnt;
      if (cnt_loop > 0) {
        asm volatile(
            "vld1.32 {d0-d3}, [%[ptr_in]]!   @ load input to q0, q1\n"
            "pld [%[ptr_in]]                 @ preload\n"
            "pld [%[ptr_in], #64]            @ preload\n"
            "pld [%[ptr_in], #128]           @ preload\n"
            "pld [%[ptr_in], #192]           @ preload\n"
            "1:                              @ main loop\n"
            "vld1.32 {d4-d7}, [%[ptr_in]]!   @ load input to q2, q3\n"
            "vclt.f32 q8, q0, %q[vzero]      @ vclt: q0 < vzero\n"
            "vclt.f32 q9, q1, %q[vzero]      @ vclt: q1 < vzero\n"
            "vmul.f32 q10, q0, %q[vslope]    @ vmul: q0 * vslope\n"
            "vmul.f32 q11, q1, %q[vslope]    @ vmul: q1 * vslope\n"
            "vclt.f32 q12, q2, %q[vzero]     @ vclt: q2 < vzero\n"
            "vclt.f32 q13, q3, %q[vzero]     @ vclt: q3 < vzero\n"
            "vmul.f32 q14, q2, %q[vslope]    @ vmul: q2 * vslope\n"
            "vmul.f32 q15, q3, %q[vslope]    @ vmul: q3 * vslope\n"
            "vbif.32 q10, q0, q8             @ keep q10 where q8 set, else q0\n"
            "vbif.32 q11, q1, q9             @ keep q11 where q9 set, else q1\n"
            "vbif.32 q14, q2, q12            @ keep q14 where q12 set, else q2\n"
            "vbif.32 q15, q3, q13            @ keep q15 where q13 set, else q3\n"
            "subs %[cnt], #1                 @ subs nn, 1\n"
            "vld1.32 {d0-d3}, [%[ptr_in]]!   @ load next input to q0, q1\n"
            "vst1.f32 {d20-d23}, [%[dout]]!  @ store data\n"
            "vst1.f32 {d28-d31}, [%[dout]]!  @ store data\n"
            "bne 1b                          @ bne nn\n"
            "sub %[ptr_in], #32              @ undo the 32-byte over-read\n"
            : [ptr_in] "+r"(data_in_c), [cnt] "+r"(cnt_loop),
              [dout] "+r"(data_out_c)
            : [vzero] "w"(vzero), [vslope] "w"(vslope)
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
              "q12", "q13", "q14", "q15");
      }
#endif  // __aarch64__
      for (int i = remain; i > 0; i--) {
        *(data_out_c++) = data_in_c[0] > 0.f ? data_in_c[0]
                                             : data_in_c[0] * slope;
        data_in_c++;
      }
    }
  }
}
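// Scalar reference for the layout handled above (a sketch for validation
// only; `act_prelu_ref` is a hypothetical helper, not part of this file).
// `channel_shared` selects one slope for all channels, otherwise the slope
// is indexed per channel:
//
//   static void act_prelu_ref(const float* din, float* dout, int outer_size,
//                             int channel_size, int inner_size,
//                             bool channel_shared, const float* slopes) {
//     for (int n = 0; n < outer_size; ++n) {
//       for (int c = 0; c < channel_size; ++c) {
//         float slope = channel_shared ? slopes[0] : slopes[c];
//         const float* in = din + (n * channel_size + c) * inner_size;
//         float* out = dout + (n * channel_size + c) * inner_size;
//         for (int i = 0; i < inner_size; ++i) {
//           out[i] = in[i] > 0.f ? in[i] : in[i] * slope;
//         }
//       }
//     }
//   }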
// sigmoid: 1 / (1 + exp(-x))
template <>
void act_sigmoid(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));  // exp(-x)
      exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f));        // 1 + exp(-x)
      // Reciprocal estimate refined by two Newton-Raphson steps.
      recip = vrecpeq_f32(exp_vec);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = 1.f / (1.f + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = 1.f / (1.f + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}

// tanh: (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <>
void act_tanh(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_plus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_minus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_sum_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_diff_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread));              // exp(x)
      exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));  // exp(-x)
      exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec);
      exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec);
      recip = div_ps(exp_diff_vec, exp_sum_vec);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) /
                          (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) /
                 (expf(ptr_in[0]) + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}
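// A caveat on the scalar tails above (an observation, not a behavior change):
// expf(x) overflows to +inf for x above roughly 88.7f, so the expression
// (expf(x) - expf(-x)) / (expf(x) + expf(-x)) evaluates to inf / inf == NaN
// for large positive inputs. A guarded tail could instead call the libm
// routine directly:
//
//   ptr_out_thread[0] = tanhf(ptr_in_thread[0]);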
// swish: x / (1 + exp(-(b * x)))
template <>
void act_swish(const float* din, float* dout, int size, const float coef,
               int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
  const float beta = coef;
  float32x4_t vbeta = vdupq_n_f32(beta);
  float32x4_t vone = vdupq_n_f32(1.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      float32x4_t va = vld1q_f32(ptr_in_thread);             // x
      float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread));  // -x
      float32x4_t vsum = vmulq_f32(vb, vbeta);               // -beta * x
      vsum = exp_ps(vsum);                                   // exp(-beta * x)
      float32x4_t vc = vaddq_f32(vone, vsum);       // 1 + exp(-beta * x)
      float32x4_t vrst = div_ps(va, vc);            // x / (1 + exp(-beta * x))
      vst1q_f32(ptr_out_thread, vrst);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] =
          ptr_in_thread[0] / (1.f + expf(-ptr_in_thread[0] * beta));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = ptr_in[0] / (1.f + expf(-ptr_in[0] * beta));
    ptr_in++;
    ptr_out++;
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
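// Usage sketch for the kernels in this file (a hypothetical call site,
// assuming the primary templates declared in activation.h take the element
// type as their template parameter):
//
//   std::vector<float> in(n), out(n);
//   // ... fill `in` ...
//   paddle::lite::arm::math::act_relu<float>(in.data(), out.data(),
//                                            static_cast<int>(n), threads);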