merge ..

2b32484a · hanbuhe · e5f779de · 31150e99 · 2b32484a · 2b32484a
5 changed file
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -206,6 +206,20 @@ void pooling_basic(const float* din,
  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/
+#define P2x2S2P1_MAX                                                  \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */           \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */           \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                 \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                 \
+  "fmax  v4.4s, v0.4s, v6.4s\n"             /*  max */                \
+  "fmax  v5.4s, v2.4s, v8.4s\n"             /*  max */                \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"    /* load q0-q1, dr0, 0-7*/ \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"    /* load q2-q3, dr1, 0-7*/ \
+  "fmax  v6.4s, v4.4s, v5.4s\n"             /* max reduce */          \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"     /* subs cnt_num, #1*/     \
+  "st1  {v6.4s}, [%[dr_out]], #16\n"        /* store 4 out, dr_out */ \
+  "ble       2f\n"                          /* bne s3_max_loop_mid */
 #define P2x2S2P0_MAX                                               \
  "1: \n"                                                          \
  "fmax  v4.4s, v0.4s, v1.4s\n"          /*  max */                \
@@ -217,6 +231,21 @@ void pooling_basic(const float* din,
  "st1  {v6.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */ \
  "bne       1b\n"                       /* bne s3_max_loop_mid */
+#define P2x2S2P1_AVG                                                          \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */                   \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */                   \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                         \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                         \
+  "fadd v4.4s, v0.4s, v6.4s\n"            /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "fadd v5.4s, v2.4s, v8.4s\n"            /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"  /* load q0-q1, dr0, 0-7*/           \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"  /* load q2-q3, dr1, 0-7*/           \
+  "fadd v6.4s, v4.4s, v5.4s\n"            /* add reduce */                    \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"   /* subs cnt_num, #1*/               \
+  "fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */                      \
+  "st1  {v4.4s}, [%[dr_out]], #16\n"      /* store 4 out, dr_out */           \
+  "ble       2f\n"                        /* bne s3_max_loop_mid */
 #define P2x2S2P0_AVG                                                         \
  "1: \n"                                /* load bias to q2, q3*/            \
  "fadd v4.4s, v0.4s, v1.4s\n"           /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
@@ -228,6 +257,7 @@ void pooling_basic(const float* din,
  "fmul v4.4s, v6.4s, %[vcoef].4s\n"     /* mul coef */                      \
  "st1  {v4.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */           \
  "bne       1b\n"                       /* bne s3_max_loop_mid */
 #define P3x3S1_INIT                                 \
  "ldr  q0, [%[dr0]], #16\n" /* load q0, dr0, 0-3*/ \
  "ldr  q1, [%[dr1]], #16\n" /* load q1, dr1, 0-3*/ \
@@ -518,16 +548,45 @@ void pooling_basic(const float* din,
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n" \
  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"
+#define P2x2S2P1_MAX                                                 \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"     \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"     \
+  "sub %[dr0], #4                                 @sub \n"           \
+  "sub %[dr1], #4                                 @sub \n"           \
+  "vmax.f32  q8, q0, q4                           @ max \n"          \
+  "vmax.f32  q9, q2, q5                           @ max \n"          \
+  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"         \
+  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"         \
+  "vmax.f32  q5, q9, q8                           @ max reduce\n"    \
+  "subs   %[cnt_num], #1                          @ subs cnt_num \n" \
+  "vst1.f32  {d10-d11}, [%[dr_out]]!              @ store 4 out \n"  \
+  "ble       2f                                   @ bne \n"
 #define P2x2S2P0_MAX                                                  \
  "1:                                             @ main loop\n"      \
  "vmax.f32  q4, q0, q1                           @ max \n"           \
  "vmax.f32  q5, q2, q3                           @ max \n"           \
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"          \
  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"          \
-  "vmax.f32  q6, q4, q5                           @ max reduce\n"     \
+  "vmax.f32  q8, q4, q5                           @ max reduce\n"     \
  "subs      %[cnt_num], #1                       @ subs cnt_num \n"  \
-  "vst1.f32  {d12-d13}, [%[dr_out]]!                @ store 4 out \n" \
+  "vst1.f32  {d16-d17}, [%[dr_out]]!                @ store 4 out \n" \
-  "bne       1b                                   @ bne "
+  "bne       1b                                   @ bne \n"
+#define P2x2S2P1_AVG                                                 \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"     \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"     \
+  "sub %[dr0], #4                                 @sub \n"           \
+  "sub %[dr1], #4                                 @sub \n"           \
+  "vadd.f32  q9, q0, q4                           @ max \n"          \
+  "vadd.f32  q8, q2, q5                           @ max \n"          \
+  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"         \
+  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"         \
+  "vadd.f32  q5, q9, q8                           @ max reduce\n"    \
+  "subs      %[cnt_num], #1                       @ subs cnt_num \n" \
+  "vmul.f32  q4, q5, %q[vcoef_left]               @ mul coef \n"     \
+  "vst1.f32  {d8-d9}, [%[dr_out]]!                @ store 4 out \n"  \
+  "ble       2f                                   @ bne\n"
 #define P2x2S2P0_AVG                                                   \
  "1:                                             @ main loop\n"       \
@@ -535,9 +594,9 @@ void pooling_basic(const float* din,
  "vadd.f32  q5, q2, q3                           @ add 0, 2, 4, 6 \n" \
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load d0-d3 \n"     \
  "vld2.f32  {d4-d7}, [%[dr1]]!                  @ load d4-d7 \n"      \
-  "vadd.f32  q6, q4, q5                           @ add reduce \n"     \
+  "vadd.f32  q8, q4, q5                           @ add reduce \n"     \
  "subs      %[cnt_num], #1                       @ subs \n"           \
-  "vmul.f32  q4, q6, %q[vcoef]                    @ mul coef \n"       \
+  "vmul.f32  q4, q8, %q[vcoef]                    @ mul coef \n"       \
  "vst1.f32  {d8-d9}, [%[dr_out]]!                @ store 4 out \n"    \
  "bne       1b                                   @ bne \n"
@@ -1037,17 +1096,17 @@ void pooling1x1s2p0_max(const float* din,
  TargetFree(TARGET(kARM), write_ptr);
 }
-void pooling2x2s2_max(const float* din,
+void pooling2x2s2p0_max(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right) {
+                        int pad_right) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
@@ -1095,7 +1154,7 @@ void pooling2x2s2_max(const float* din,
                [dr_out] "+r"(dr_out),
                [cnt_num] "+r"(cnt_num)
              :
-              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
 #endif
          dr0 -= 8;
          dr1 -= 8;
@@ -1121,18 +1180,18 @@ void pooling2x2s2_max(const float* din,
  }
 }
-void pooling2x2s2_avg(const float* din,
+void pooling2x2s2p0_avg(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      bool exclusive,
+                        bool exclusive,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right) {
+                        int pad_right) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
@@ -1158,12 +1217,14 @@ void pooling2x2s2_avg(const float* din,
      const float* data_in_channel = data_in_batch + c * size_channel_in;
      const float* r0 = data_in_channel;
      const float* r1 = r0 + win;
+      vcoef = vdupq_n_f32(0.25f);
      for (int h = 0; h < hout; h++) {
        float* dr_out = data_out_channel;
        auto dr0 = r0;
        auto dr1 = r1;
        if (h * S + K - P > hin) {
          dr1 = zero_ptr;
+          vcoef = vdupq_n_f32(0.5f);
        }
        int cnt_num = w_unroll_size;
        if (w_unroll_size > 0) {
@@ -1184,7 +1245,7 @@ void pooling2x2s2_avg(const float* din,
                [dr_out] "+r"(dr_out),
                [cnt_num] "+r"(cnt_num)
              : [vcoef] "w"(vcoef)
-              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
 #endif
          dr0 -= 8;
          dr1 -= 8;
@@ -1194,8 +1255,14 @@ void pooling2x2s2_avg(const float* din,
        int wstart = 0;
        for (int j = 0; j < w_unroll_remian; ++j) {
          int wend = std::min(wstart + K, rem);
-          float coef = 0.5f / (wend - wstart);
+          float coef = 0.25f;
          float tmp = 0.f;
+          if (wend - wstart == 1 && pad_right == 0) {
+            coef *= 2;
+          }
+          if (h * S + K - P > hin && pad_bottom == 0) {
+            coef *= 2;
+          }
          for (int i = wstart; i < wend; i++) {
            tmp += dr0[i] + dr1[i];
          }
@@ -1212,6 +1279,235 @@ void pooling2x2s2_avg(const float* din,
  TargetFree(TARGET(kARM), zero_ptr);
 }
+void pooling2x2s2p1_max(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right) {
+  int size_channel_out = wout * hout;
+  int size_channel_in = win * hin;
+  auto data_out = static_cast<float*>(dout);
+  auto data_in = static_cast<const float*>(din);
+  const int K = 2;
+  const int P = 1;
+  const int S = 2;
+  int w_unroll_size = wout / 4;
+  int w_unroll_remian = wout - w_unroll_size * 4;
+  float32x4_t vzero = vdupq_n_f32(std::numeric_limits<float>::lowest());
+  if (w_unroll_remian == 0) {
+    w_unroll_size -= 1;
+    w_unroll_remian = wout - w_unroll_size * 4;
+  }
+  for (int n = 0; n < num; ++n) {
+    float* data_out_batch = data_out + n * chout * size_channel_out;
+    const float* data_in_batch = data_in + n * chin * size_channel_in;
+#pragma omp parallel for
+    for (int c = 0; c < chout; c++) {
+      float* data_out_channel = data_out_batch + c * size_channel_out;
+      const float* data_in_channel = data_in_batch + c * size_channel_in;
+      const float* r0 = data_in_channel;
+      const float* r1 = r0 + win;
+      for (int h = 0; h < hout; h++) {
+        float* dr_out = data_out_channel;
+        auto dr0 = r0;
+        auto dr1 = r1;
+        if (h == 0) {
+          dr0 = r0;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
+        } else {
+          r0 = r1 + win;
+          r1 = r0 + win;
+        }
+        if (h * S + K - P > hin) {
+          dr1 = dr0;
+          if (h * S + K - P > hin + 1) {
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
+          }
+        }
+        int cnt_num = w_unroll_size;
+        if (w_unroll_size > 0) {
+#ifdef __aarch64__
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vzero] "w"(vzero)
+              : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
+#else
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vzero] "w"(vzero)
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
+#endif
+          dr0 -= 8;
+          dr1 -= 8;
+        }
+        // deal with right pad
+        int wstart = w_unroll_size * 4 * S - P;
+        for (int j = 0; j < w_unroll_remian; ++j) {
+          int wend = std::min(wstart + K, win);
+          int st = wstart > 0 ? wstart : 0;
+          float tmp = wend == st ? 0.f : dr0[0];
+          for (int i = 0; i < wend - st; i++) {
+            tmp = std::max(tmp, dr0[i]);
+            tmp = std::max(tmp, dr1[i]);
+          }
+          *(dr_out++) = tmp;
+          dr0 += S - (st - wstart);
+          dr1 += S - (st - wstart);
+          wstart += S;
+        }
+        data_out_channel += wout;
+      }
+    }
+  }
+}
+void pooling2x2s2p1_avg(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right) {
+  int size_channel_out = wout * hout;
+  int size_channel_in = win * hin;
+  auto data_out = static_cast<float*>(dout);
+  auto data_in = static_cast<const float*>(din);
+  const int K = 2;
+  const int P = 1;
+  const int S = 2;
+  int w_unroll_size = wout / 4;
+  int w_unroll_remian = wout - w_unroll_size * 4;
+  auto zero_ptr =
+      static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  memset(zero_ptr, 0, win * sizeof(float));
+  if (w_unroll_remian == 0) {
+    w_unroll_size -= 1;
+    w_unroll_remian = wout - w_unroll_size * 4;
+  }
+  for (int n = 0; n < num; ++n) {
+    float* data_out_batch = data_out + n * chout * size_channel_out;
+    const float* data_in_batch = data_in + n * chin * size_channel_in;
+#pragma omp parallel for
+    for (int c = 0; c < chout; c++) {
+      float* data_out_channel = data_out_batch + c * size_channel_out;
+      const float* data_in_channel = data_in_batch + c * size_channel_in;
+      const float* r0 = data_in_channel;
+      const float* r1 = r0 + win;
+      for (int h = 0; h < hout; h++) {
+        float* dr_out = data_out_channel;
+        auto dr0 = r0;
+        auto dr1 = r1;
+        float coef_h = 0.5f;
+        if (h == 0) {
+          dr0 = zero_ptr;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
+          if (exclusive) {
+            coef_h = 1.f;
+          }
+        } else {
+          r0 = r1 + win;
+          r1 = r0 + win;
+        }
+        if (h * S + K - P > hin) {
+          dr1 = zero_ptr;
+          if (exclusive) {
+            coef_h = 1.f;
+          }
+          if (h * S + K - P > hin + 1) {
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
+          }
+        }
+        float coef_left_most = exclusive ? coef_h : coef_h / 2;
+        float32x4_t vcoef = vdupq_n_f32(coef_h / 2);
+        float coef_left[4] = {
+            coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
+        float32x4_t vcoef_left = vld1q_f32(coef_left);
+        int cnt_num = w_unroll_size;
+        if (w_unroll_size > 0) {
+#ifdef __aarch64__
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
+              : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
+#else
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
+#endif
+          dr0 -= 8;
+          dr1 -= 8;
+        }
+        // deal with right pad
+        int wstart = w_unroll_size * 4 * S - P;
+        for (int j = 0; j < w_unroll_remian; ++j) {
+          int wend = std::min(wstart + K, win);
+          int st = wstart > 0 ? wstart : 0;
+          float tmp = 0.f;
+          float coef = coef_h / 2;
+          if (exclusive && wend - st == 1) {
+            coef = coef_h;
+          }
+          for (int i = 0; i < wend - st; i++) {
+            tmp += dr0[i] + dr1[i];
+          }
+          *(dr_out++) = tmp * coef;
+          dr0 += S - (st - wstart);
+          dr1 += S - (st - wstart);
+          wstart += S;
+        }
+        data_out_channel += wout;
+      }
+    }
+  }
+  TargetFree(TARGET(kARM), zero_ptr);
+}
 void pooling3x3s1p1_max(const float* din,
                        float* dout,
                        int num,
@@ -2240,6 +2536,9 @@ void pooling3x3s2p0_max(const float* din,
    w_unroll_remian = wout - w_unroll_size * 4;
  }
+  int remain = w_unroll_remian - 1;
+  int right = wout * 2 + 1 - win;  // if need right pad
  for (int n = 0; n < num; ++n) {
    float* data_out_batch = data_out + n * chout * size_channel_out;
    const float* data_in_batch = data_in + n * chin * size_channel_in;
@@ -2266,6 +2565,7 @@ void pooling3x3s2p0_max(const float* din,
          }
        }
        int cnt_num = w_unroll_size;
+        int cnt_remain = remain;
        if (w_unroll_size > 0) {
 #ifdef __aarch64__
          asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
@@ -2289,46 +2589,80 @@ void pooling3x3s2p0_max(const float* din,
                         "v9",
                         "v10",
                         "v11");
-#else
-          asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
-                       : [dr0] "+r"(dr0),
-                         [dr1] "+r"(dr1),
-                         [dr2] "+r"(dr2),
-                         [dr_out] "+r"(dr_out),
-                         [cnt_num] "+r"(cnt_num)
-                       :
-                       : "cc",
-                         "memory",
-                         "q0",
-                         "q1",
-                         "q2",
-                         "q3",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11");
-#endif
          dr0 -= 8;
          dr1 -= 8;
          dr2 -= 8;
-        }
+          int rem = win - (w_unroll_size * 4) * S;
-        // deal with right pad
+          int wstart = 0;
-        int rem = win - (w_unroll_size * 4) * S;
+          for (int j = 0; j < w_unroll_remian; ++j) {
-        int wstart = 0;
+            int wend = std::min(wstart + K, rem);
-        for (int j = 0; j < w_unroll_remian; ++j) {
+            float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
-          int wend = std::min(wstart + K, rem);
+            for (int i = wstart; i < wend; i++) {
-          float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
+              tmp = std::max(tmp, dr0[i]);
-          for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, dr1[i]);
-            tmp = std::max(tmp, dr0[i]);
+              tmp = std::max(tmp, dr2[i]);
-            tmp = std::max(tmp, dr1[i]);
+            }
-            tmp = std::max(tmp, dr2[i]);
+            *(dr_out++) = tmp;
+            wstart += S;
          }
-          *(dr_out++) = tmp;
+#else
-          wstart += S;
+          asm volatile(
+              P3x3S2P0_INIT P3x3S2P0_MAX
+              "cmp       %[remain], #0                         @cmp cnt_num\n"
+              "sub       %[dr0], #32                           @sub - 8\n"
+              "sub       %[dr1], #32                           @sub - 8\n"
+              "sub       %[dr2], #32                           @sub - 8\n"
+              "ble       4f                                    @ble exit1\n"
+              "2:                                              @mid loop\n"
+              "vld1.f32  {d0-d1}, [%[dr0]]!                    @load \n"
+              "vld1.f32  {d2-d3}, [%[dr1]]!                    @load \n"
+              "vld1.f32  {d4-d5}, [%[dr2]]!                    @load \n"
+              "vmov.f32  s3,s2                                 @mov \n"
+              "vmov.f32  s7,s6                                 @mov \n"
+              "vmov.f32  s11,s10                               @mov \n"
+              "vmax.f32  q0, q0, q1                            @max n"
+              "sub       %[dr0], #8                            @add w \n"
+              "sub       %[dr1], #8                            @add w \n"
+              "sub       %[dr2], #8                            @add w \n"
+              "vmax.f32  q0, q0, q2                            @max \n"
+              "vpmax.f32 d0, d0, d1                            @pmax \n"
+              "vpmax.f32 d0, d0, d0                            @pmax \n"
+              "subs      %[remain], #1                         @subs \n"
+              "vst1.f32  d0[0], [%[dr_out]]!                   @vst \n"
+              "bne       2b                                    @bne \n"
+              "4:                                              @exit\n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr2] "+r"(dr2),
+                [dr_out] "+r"(dr_out),
+                [remain] "+r"(cnt_remain),
+                [cnt_num] "+r"(cnt_num)
+              :
+              : "cc",
+                "memory",
+                "q0",
+                "q1",
+                "q2",
+                "q3",
+                "q4",
+                "q5",
+                "q6",
+                "q7",
+                "q8",
+                "q9",
+                "q10",
+                "q11");
+          if (right) {
+            int wstart = (w_unroll_size * 4 + remain) * S;
+            int wend = std::min(wstart + K, win);
+            float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
+            for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
+              tmp = std::max(tmp, dr2[i]);
+            }
+            *(dr_out++) = tmp;
+          }
+#endif
        }
        r0 = r2;

--- a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cl_common.h>
+__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
+  const int index = get_global_id(0); 
+  if (index < count) {
+    out_data[index] = 1 / (1 + exp(-x_data[index]));
+  }
+}
--- a/lite/core/mir/fusion/fc_fuser.cc
+++ b/lite/core/mir/fusion/fc_fuser.cc
@@ -42,7 +42,6 @@ void FcFuser::BuildPattern() {
  add->AsIntermediate();
  if (act_type_ != "") {
-    std::cout << "act_type_:" << act_type_ << std::endl;
    auto* add_out = VarNode("add_out");
    auto* activation = OpNode(act_type_, act_type_);
    std::vector<PMNode*> act_inputs{add_out};
@@ -51,7 +50,6 @@ void FcFuser::BuildPattern() {
    add_out->AsIntermediate();
    activation->AsIntermediate();
  } else {
-    std::cout << "act_type_: empty" << std::endl;
    add_inputs >> *add >> *Out;
  }
 }

--- a/lite/core/program.cc
+++ b/lite/core/program.cc
--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h