Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 0fbcd4ea
Authored Jun 15, 2019 by hong19860320

add arm kernel and unit test for relu op
test=develop

Parent: 7c02e682
Showing 7 changed files with 708 additions and 22 deletions (+708 −22)
paddle/fluid/lite/arm/math/CMakeLists.txt                 +1   −0
paddle/fluid/lite/arm/math/activation.cc                  +520 −0
paddle/fluid/lite/arm/math/activation.h                   +50  −0
paddle/fluid/lite/kernels/arm/CMakeLists.txt              +3   −2
paddle/fluid/lite/kernels/arm/activation_compute.cc       +11  −19
paddle/fluid/lite/kernels/arm/activation_compute.h        +23  −1
paddle/fluid/lite/kernels/arm/activation_compute_test.cc  +100 −0
paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -32,6 +32,7 @@ cc_library(math_arm SRCS
             conv_winograd_3x3.cc
             conv_winograd.cc
             split.cc
+            activation.cc
             DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
             # TODO(TJ): fix me do not deps proto
paddle/fluid/lite/arm/math/activation.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/activation.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {

template <>
void act_relu<float>(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      vr0 = vmaxq_f32(vr0, vzero);
      vr1 = vmaxq_f32(vr1, vzero);
      vr2 = vmaxq_f32(vr2, vzero);
      vr3 = vmaxq_f32(vr3, vzero);
      vst1q_f32(ptr_out_thread, vr0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
          "vst1.32  {d16-d19}, [%[dout]]!  @ store result, add pointer\n"
          "vst1.32  {d20-d23}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
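// Editor's note (added, not part of the commit): a worked example of the
// partitioning above, assuming size = 1000 and threads = 4:
//   nums_per_thread = 1000 / 4 = 250, remain = 1000 - 4 * 250 = 0;
//   per thread, neon_loop_cnt = 250 >> 4 = 15 NEON iterations covering
//   15 * 16 = 240 floats, plus neon_loop_remain = 10 scalar tail elements.
// When size is not divisible by threads, the leftover `remain` elements are
// processed serially after the OpenMP region.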
template <>
void act_relu_neg<float>(const float* din, float* dout, int size,
                         const float negative_slope, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t valpha = vdupq_n_f32(negative_slope);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      uint32x4_t vm0 = vcgeq_f32(vr0, vzero);
      uint32x4_t vm1 = vcgeq_f32(vr1, vzero);
      uint32x4_t vm2 = vcgeq_f32(vr2, vzero);
      uint32x4_t vm3 = vcgeq_f32(vr3, vzero);
      float32x4_t vn0 = vmulq_f32(vr0, valpha);
      float32x4_t vn1 = vmulq_f32(vr1, valpha);
      float32x4_t vn2 = vmulq_f32(vr2, valpha);
      float32x4_t vn3 = vmulq_f32(vr3, valpha);
      float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0);
      float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1);
      float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2);
      float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vcge.f32 q8, q0, %q[vzero]      @ get mask\n"
          "vcge.f32 q9, q1, %q[vzero]      @ get mask\n"
          "vcge.f32 q10, q2, %q[vzero]     @ get mask\n"
          "vcge.f32 q11, q3, %q[vzero]     @ get mask\n"
          "vmul.f32 q4, q0, %q[valpha]     @ get neg data\n"
          "vmul.f32 q5, q1, %q[valpha]     @ get neg data\n"
          "vmul.f32 q6, q2, %q[valpha]     @ get neg data\n"
          "vmul.f32 q7, q3, %q[valpha]     @ get neg data\n"
          "vbit q4, q0, q8                 @ bitsel, insert q0 to q4, if q8 is 1\n"
          "vbit q5, q1, q9                 @ bitsel, insert q1 to q5, if q9 is 1\n"
          "vbit q6, q2, q10                @ bitsel, insert q2 to q6, if q10 is 1\n"
          "vbit q7, q3, q11                @ bitsel, insert q3 to q7, if q11 is 1\n"
          "vst1.32  {d8-d11}, [%[dout]]!   @ store result, add pointer\n"
          "vst1.32  {d12-d15}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [valpha] "w"(valpha)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f
                              ? ptr_in_thread[0]
                              : ptr_in_thread[0] * negative_slope;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f
                            ? in_ptr_remain[0]
                            : in_ptr_remain[0] * negative_slope;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
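// Editor's note (added, not part of the commit): vcgeq_f32(vr, vzero) sets a
// lane to all-ones where vr >= 0, and vbslq_f32(mask, a, b) takes bits from a
// where the mask bit is 1 and from b where it is 0. The intrinsic path above
// therefore computes dout = din >= 0 ? din : din * negative_slope without
// branching; the armv7 asm path expresses the same selection with vcge.f32
// followed by vbit.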
template <>
void act_clipped_relu<float>(const float* din, float* dout, int size,
                             const float coef, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t vclip = vdupq_n_f32(coef);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vt0 = vmaxq_f32(vr0, vzero);
      float32x4_t vt1 = vmaxq_f32(vr1, vzero);
      float32x4_t vt2 = vmaxq_f32(vr2, vzero);
      float32x4_t vt3 = vmaxq_f32(vr3, vzero);
      float32x4_t vo0 = vminq_f32(vt0, vclip);
      float32x4_t vo1 = vminq_f32(vt1, vclip);
      float32x4_t vo2 = vminq_f32(vt2, vclip);
      float32x4_t vo3 = vminq_f32(vt3, vclip);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
          "vmin.f32 q4, q8, %q[vclip]      @ clip relu\n"
          "vmin.f32 q5, q9, %q[vclip]      @ clip relu\n"
          "vmin.f32 q6, q10, %q[vclip]     @ clip relu\n"
          "vmin.f32 q7, q11, %q[vclip]     @ clip relu\n"
          "vst1.32  {d8-d11}, [%[dout]]!   @ store result, add pointer\n"
          "vst1.32  {d12-d15}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [vclip] "w"(vclip)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    out_ptr_remain[0] = out_ptr_remain[0] < coef ? out_ptr_remain[0] : coef;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
template <>
void act_prelu<float>(const float* din, float* dout, int outer_size,
                      int channel_size, int inner_size, bool channel_shared,
                      float* channel_slope, int threads) {
  int stride_size = inner_size * channel_size;
  int cnt = inner_size >> 4;
  int remain = inner_size & 15;
  float32x4_t vzero = vdupq_n_f32(0.f);
  for (int n = 0; n < outer_size; n++) {
    const float* data_in_batch = din + n * stride_size;
    float* data_out_batch = dout + n * stride_size;
#pragma omp parallel for
    for (int c = 0; c < channel_size; c++) {
      const float* data_in_c = data_in_batch + c * inner_size;
      float* data_out_c = data_out_batch + c * inner_size;
      float slope = channel_shared ? channel_slope[0] : channel_slope[c];
      float32x4_t vslope = vdupq_n_f32(slope);
#ifdef __aarch64__
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vr0 = vld1q_f32(data_in_c);
        float32x4_t vr1 = vld1q_f32(data_in_c + 4);
        float32x4_t vr2 = vld1q_f32(data_in_c + 8);
        float32x4_t vr3 = vld1q_f32(data_in_c + 12);
        uint32x4_t vm0 = vcltq_f32(vr0, vzero);    // vr0 < vzero
        uint32x4_t vm1 = vcltq_f32(vr1, vzero);    // vr1 < vzero
        uint32x4_t vm2 = vcltq_f32(vr2, vzero);    // vr2 < vzero
        uint32x4_t vm3 = vcltq_f32(vr3, vzero);    // vr3 < vzero
        float32x4_t vo0 = vmulq_f32(vr0, vslope);  // vr0 * vslope
        float32x4_t vo1 = vmulq_f32(vr1, vslope);  // vr1 * vslope
        float32x4_t vo2 = vmulq_f32(vr2, vslope);  // vr2 * vslope
        float32x4_t vo3 = vmulq_f32(vr3, vslope);  // vr3 * vslope
        float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0);
        float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1);
        float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2);
        float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3);
        vst1q_f32(data_out_c, vos0);
        vst1q_f32(data_out_c + 4, vos1);
        vst1q_f32(data_out_c + 8, vos2);
        vst1q_f32(data_out_c + 12, vos3);
        data_in_c += 16;
        data_out_c += 16;
      }
#else
      int cnt_loop = cnt;
      if (cnt_loop > 0) {
        asm volatile(
            "vld1.32  {d0-d3}, [%[ptr_in]]!  @ load input to q0, q1\n"
            "pld [%[ptr_in]]                 @ preload\n"
            "pld [%[ptr_in], #64]            @ preload\n"
            "pld [%[ptr_in], #128]           @ preload\n"
            "pld [%[ptr_in], #192]           @ preload\n"
            "1:                              @ main loop\n"
            "vld1.32  {d4-d7}, [%[ptr_in]]!  @ load input to q2, q3\n"
            "vclt.f32 q8, q0, %q[vzero]      @ vclt q0 < vzero\n"
            "vclt.f32 q9, q1, %q[vzero]      @ vclt q1 < vzero\n"
            "vmul.f32 q10, q0, %q[vslope]    @ vmul q0 * vslope\n"
            "vmul.f32 q11, q1, %q[vslope]    @ vmul q1 * vslope\n"
            "vclt.f32 q12, q2, %q[vzero]     @ vclt q2 < vzero\n"
            "vclt.f32 q13, q3, %q[vzero]     @ vclt q3 < vzero\n"
            "vmul.f32 q14, q2, %q[vslope]    @ vmul q2 * vslope\n"
            "vmul.f32 q15, q3, %q[vslope]    @ vmul q3 * vslope\n"
            "vbif.32 q10, q0, q8             @ vbif q10, q0, q8\n"
            "vbif.32 q11, q1, q9             @ vbif q11, q1, q9\n"
            "vbif.32 q14, q2, q12            @ vbif q14, q2, q12\n"
            "vbif.32 q15, q3, q13            @ vbif q15, q3, q13\n"
            "subs %[cnt], #1                 @ subs nn, 1\n"
            "vld1.32  {d0-d3}, [%[ptr_in]]!  @ load input to q0, q1\n"
            "vst1.f32 {d20-d23}, [%[dout]]!  @ store data\n"
            "vst1.f32 {d28-d31}, [%[dout]]!  @ store data\n"
            "bne 1b                          @ bne nn\n"
            "sub %[ptr_in], #32              @ ptr - 32\n"
            : [ptr_in] "+r"(data_in_c), [cnt] "+r"(cnt_loop),
              [dout] "+r"(data_out_c)
            : [vzero] "w"(vzero), [vslope] "w"(vslope)
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10",
              "q11", "q12", "q13", "q14", "q15");
      }
#endif  // __aarch64__
      for (int i = remain; i > 0; i--) {
        *(data_out_c++) =
            data_in_c[0] > 0.f ? data_in_c[0] : data_in_c[0] * slope;
        data_in_c++;
      }
    }
  }
}
template <>
void act_sigmoid(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));
      exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f));
      recip = vrecpeq_f32(exp_vec);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = 1.f / (1 + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = 1.f / (1 + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}
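// Editor's note (added, not part of the commit): vrecpeq_f32 yields a coarse
// (~8-bit) estimate r of 1 / x, and vrecpsq_f32(x, r) computes 2 - x * r, so
// each recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip) line above is one
// Newton-Raphson step that roughly doubles the accurate bits; the two steps
// bring 1 / (1 + e^-x) close to full float32 precision.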
// tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <>
void act_tanh<float>(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_plus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_minus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_sum_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_diff_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread));
      exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));
      exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec);
      exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec);
      recip = div_ps(exp_diff_vec, exp_sum_vec);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) /
                          (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) /
                 (expf(ptr_in[0]) + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}
// swish: x / (1 + exp(-(b * x)))
template <>
void act_swish<float>(const float* din, float* dout, int size,
                      const float coef, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
  const float beta = coef;
  float32x4_t vbeta = vdupq_n_f32(beta);
  float32x4_t vone = vdupq_n_f32(1.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      float32x4_t va = vld1q_f32(ptr_in_thread);             // x
      float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread));  // -x
      float32x4_t vsum = vmulq_f32(vb, vbeta);
      vsum = exp_ps(vsum);
      float32x4_t vc = vaddq_f32(vone, vsum);
      float32x4_t vrst = div_ps(va, vc);
      vst1q_f32(ptr_out_thread, vrst);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] =
          ptr_in_thread[0] / (1.0 + expf(-ptr_in_thread[0] * beta));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = ptr_in[0] / (1.0 + expf(-ptr_in[0] * beta));
    ptr_in++;
    ptr_out++;
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
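Editor's note: for orientation, the scalar semantics that the NEON kernels in this file vectorize can be summarized in a few lines. This is a minimal reference sketch added for this write-up, not part of the commit; the ref_* names are hypothetical.

#include <cmath>

// Scalar reference for the activations implemented above (sketch only).
inline float ref_relu(float x) { return x > 0.f ? x : 0.f; }
inline float ref_relu_neg(float x, float slope) {
  return x > 0.f ? x : x * slope;
}
inline float ref_clipped_relu(float x, float coef) {
  float y = x > 0.f ? x : 0.f;
  return y < coef ? y : coef;
}
inline float ref_sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }
inline float ref_tanh(float x) {
  return (std::exp(x) - std::exp(-x)) / (std::exp(x) + std::exp(-x));
}
inline float ref_swish(float x, float beta) {
  return x / (1.f + std::exp(-beta * x));
}
// act_prelu applies ref_relu_neg per channel, taking the slope from
// channel_slope[0] when channel_shared is true, else channel_slope[c].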
paddle/fluid/lite/arm/math/activation.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

namespace paddle {
namespace lite {
namespace arm {
namespace math {

template <typename T>
void act_relu(const T* din, T* dout, int size, int threads);

template <typename T>
void act_relu_neg(const T* din, T* dout, int size, const float negative_slope,
                  int threads);

template <typename T>
void act_clipped_relu(const T* din, T* dout, int size, const float coef,
                      int threads);

template <typename T>
void act_prelu(const T* din, T* dout, int outer_size, int channel_size,
               int inner_size, bool channel_shared, float* channel_slope,
               int threads);

template <typename T>
void act_sigmoid(const T* din, T* dout, int size, int threads);

template <typename T>
void act_tanh(const T* din, T* dout, int size, int threads);

template <typename T>
void act_swish(const T* din, T* dout, int size, const float coef, int threads);

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
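Editor's note: a minimal call-site sketch for this interface, added for illustration only; the buffer size and thread count are arbitrary assumptions, not part of the commit.

#include <vector>
#include "paddle/fluid/lite/arm/math/activation.h"

void relu_example() {
  std::vector<float> in(1024, -0.5f);  // hypothetical input buffer
  std::vector<float> out(1024);
  // One NEON-accelerated ReLU pass over 1024 floats split across 4 threads.
  paddle::lite::arm::math::act_relu<float>(in.data(), out.data(),
                                           static_cast<int>(in.size()), 4);
}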
paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
message(STATUS "compile with lite ARM kernels")
cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
+cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
+lite_cc_test(test_activation_compute_arm SRCS activation_compute_test.cc DEPS activation_compute_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
@@ -27,7 +28,7 @@ lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
    fc_compute_arm
-   relu_compute_arm
+   activation_compute_arm
    mul_compute_arm
    scale_compute_arm
    softmax_compute_arm
paddle/fluid/lite/kernels/arm/relu_compute.h → paddle/fluid/lite/kernels/arm/activation_compute.cc
@@ -12,31 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

-class ReluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  void Run() override {
-    auto& param = Param<operators::ReluParam>();
-    auto n = param.input->dims().production();
-    const float* input = param.input->data<float>();
-    float* output = param.output->mutable_data<float>();
-    for (int i = 0; i < n; i++) {
-      output[i] = std::max(0.f, input[i]);
-    }
-  }
-
-  TargetType target() const override { return TARGET(kARM); }
-  PrecisionType precision() const override { return PRECISION(kFloat); }
-};
+void ReluCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto x_dims = param.X->dims();
+  auto x_data = param.X->data<float>();
+  auto output_data = param.Out->mutable_data<float>();
+  lite::arm::math::act_relu<float>(x_data, output_data, x_dims.production(),
+                                   ctx.threads());
+}

}  // namespace arm
}  // namespace kernels
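Editor's note: the collapsed region of this diff presumably carries the kernel registration that the test file's USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def) relies on. As a hedged sketch only, a Paddle-Lite registration of this kernel would look roughly like the following; the exact input/output bindings are not visible in the diff.

// Sketch only: registration inferred from USE_LITE_KERNEL in the test,
// not shown in this diff.
REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW,
                     paddle::lite::kernels::arm::ReluCompute, def)
    .Finalize();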
paddle/fluid/lite/kernels/arm/relu_compute.cc → paddle/fluid/lite/kernels/arm/activation_compute.h
...
...
@@ -12,4 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/relu_compute.h"
#pragma once
#include <algorithm>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
arm
{
class
ReluCompute
:
public
KernelLite
<
TARGET
(
kARM
),
PRECISION
(
kFloat
)
>
{
public:
using
param_t
=
operators
::
ActivationParam
;
void
Run
()
override
;
virtual
~
ReluCompute
()
=
default
;
};
}
// namespace arm
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
paddle/fluid/lite/kernels/arm/activation_compute_test.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

template <typename dtype>
void activation_compute_ref(const operators::ActivationParam& param) {
  auto x_data = param.X->data<dtype>();
  auto output_data = param.Out->mutable_data<dtype>();
  DDim x_dims = param.X->dims();
  DDim output_dims = param.Out->dims();
  ASSERT_EQ(x_dims.data(), output_dims.data());
  for (int i = 0; i < output_dims.production(); i++) {
    output_data[i] = std::max(0.f, x_data[i]);
  }
}

TEST(activation_arm, retrive_op) {
  auto activation =
      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("relu");
  ASSERT_FALSE(activation.empty());
  ASSERT_TRUE(activation.front());
}

TEST(activation_arm, init) {
  ReluCompute activation;
  ASSERT_EQ(activation.precision(), PRECISION(kFloat));
  ASSERT_EQ(activation.target(), TARGET(kARM));
}

TEST(activation_arm, compute) {
  DeviceInfo::Init();
  for (auto n : {1, 2}) {
    for (auto c : {6, 32 /*, 128*/}) {
      for (auto h : {9, 18 /*, 56, 112, 224, 512*/}) {
        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
          Tensor x;
          Tensor output;
          Tensor output_ref;
          // set the dims of input, output, ref output tensors
          x.Resize({n, c, h, w});
          output.Resize({n, c, h, w});
          output_ref.Resize({n, c, h, w});
          // initialize the data of input tensors
          auto* x_data = x.mutable_data<float>();
          auto* output_data = output.mutable_data<float>();
          for (int i = 0; i < x.dims().production(); i++) {
            float sign = i % 3 == 0 ? -1.0f : 1.0f;
            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
          }
          // prepare kernel params and run
          ReluCompute activation;
          std::unique_ptr<KernelContext> ctx(new KernelContext);
          ctx->As<ARMContext>();
          activation.SetContext(std::move(ctx));
          operators::ActivationParam param;
          param.X = &x;
          param.Out = &output;
          activation.SetParam(param);
          activation.Launch();
          // invoking ref implementation and compare results
          param.Out = &output_ref;
          activation_compute_ref<float>(param);
          auto* output_ref_data = output_ref.mutable_data<float>();
          for (int i = 0; i < output.dims().production(); i++) {
            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
          }
        }
      }
    }
  }
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
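Editor's note: assuming the lite_cc_test rule above yields a standalone gtest binary named test_activation_compute_arm, the new cases can be run selectively with gtest's standard filter, e.g. ./test_activation_compute_arm --gtest_filter='activation_arm.*'.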