[arm]add 2x2s2p1 pooling (#3705)

* fix pooling bug and speed * add 2x2s2p1 pooling. test=develop * fix conflict, test=develop

[arm]add 2x2s2p1 pooling (#3705)
* fix pooling bug and speed * add 2x2s2p1 pooling. test=develop * fix conflict, test=develop
31150e99 · HappyAngel · GitHub · 49a60805 · 31150e99 · 31150e99
5 changed file
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -206,6 +206,20 @@ void pooling_basic(const float* din,
  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/
+#define P2x2S2P1_MAX                                                  \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */           \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */           \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                 \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                 \
+  "fmax  v4.4s, v0.4s, v6.4s\n"             /*  max */                \
+  "fmax  v5.4s, v2.4s, v8.4s\n"             /*  max */                \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"    /* load q0-q1, dr0, 0-7*/ \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"    /* load q2-q3, dr1, 0-7*/ \
+  "fmax  v6.4s, v4.4s, v5.4s\n"             /* max reduce */          \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"     /* subs cnt_num, #1*/     \
+  "st1  {v6.4s}, [%[dr_out]], #16\n"        /* store 4 out, dr_out */ \
+  "ble       2f\n"                          /* bne s3_max_loop_mid */
 #define P2x2S2P0_MAX                                               \
  "1: \n"                                                          \
  "fmax  v4.4s, v0.4s, v1.4s\n"          /*  max */                \
@@ -217,6 +231,21 @@ void pooling_basic(const float* din,
  "st1  {v6.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */ \
  "bne       1b\n"                       /* bne s3_max_loop_mid */
+#define P2x2S2P1_AVG                                                          \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */                   \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */                   \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                         \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                         \
+  "fadd v4.4s, v0.4s, v6.4s\n"            /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "fadd v5.4s, v2.4s, v8.4s\n"            /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"  /* load q0-q1, dr0, 0-7*/           \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"  /* load q2-q3, dr1, 0-7*/           \
+  "fadd v6.4s, v4.4s, v5.4s\n"            /* add reduce */                    \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"   /* subs cnt_num, #1*/               \
+  "fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */                      \
+  "st1  {v4.4s}, [%[dr_out]], #16\n"      /* store 4 out, dr_out */           \
+  "ble       2f\n"                        /* bne s3_max_loop_mid */
 #define P2x2S2P0_AVG                                                         \
  "1: \n"                                /* load bias to q2, q3*/            \
  "fadd v4.4s, v0.4s, v1.4s\n"           /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
@@ -228,6 +257,7 @@ void pooling_basic(const float* din,
  "fmul v4.4s, v6.4s, %[vcoef].4s\n"     /* mul coef */                      \
  "st1  {v4.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */           \
  "bne       1b\n"                       /* bne s3_max_loop_mid */
 #define P3x3S1_INIT                                 \
  "ldr  q0, [%[dr0]], #16\n" /* load q0, dr0, 0-3*/ \
  "ldr  q1, [%[dr1]], #16\n" /* load q1, dr1, 0-3*/ \
@@ -518,16 +548,45 @@ void pooling_basic(const float* din,
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n" \
  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"
+#define P2x2S2P1_MAX                                                 \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"     \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"     \
+  "sub %[dr0], #4                                 @sub \n"           \
+  "sub %[dr1], #4                                 @sub \n"           \
+  "vmax.f32  q8, q0, q4                           @ max \n"          \
+  "vmax.f32  q9, q2, q5                           @ max \n"          \
+  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"         \
+  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"         \
+  "vmax.f32  q5, q9, q8                           @ max reduce\n"    \
+  "subs   %[cnt_num], #1                          @ subs cnt_num \n" \
+  "vst1.f32  {d10-d11}, [%[dr_out]]!              @ store 4 out \n"  \
+  "ble       2f                                   @ bne \n"
 #define P2x2S2P0_MAX                                                  \
  "1:                                             @ main loop\n"      \
  "vmax.f32  q4, q0, q1                           @ max \n"           \
  "vmax.f32  q5, q2, q3                           @ max \n"           \
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"          \
  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"          \
-  "vmax.f32  q6, q4, q5                           @ max reduce\n"     \
+  "vmax.f32  q8, q4, q5                           @ max reduce\n"     \
  "subs      %[cnt_num], #1                       @ subs cnt_num \n"  \
-  "vst1.f32  {d12-d13}, [%[dr_out]]!                @ store 4 out \n" \
+  "vst1.f32  {d16-d17}, [%[dr_out]]!                @ store 4 out \n" \
-  "bne       1b                                   @ bne "
+  "bne       1b                                   @ bne \n"
+#define P2x2S2P1_AVG                                                 \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"     \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"     \
+  "sub %[dr0], #4                                 @sub \n"           \
+  "sub %[dr1], #4                                 @sub \n"           \
+  "vadd.f32  q9, q0, q4                           @ max \n"          \
+  "vadd.f32  q8, q2, q5                           @ max \n"          \
+  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"         \
+  "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"         \
+  "vadd.f32  q5, q9, q8                           @ max reduce\n"    \
+  "subs      %[cnt_num], #1                       @ subs cnt_num \n" \
+  "vmul.f32  q4, q5, %q[vcoef_left]               @ mul coef \n"     \
+  "vst1.f32  {d8-d9}, [%[dr_out]]!                @ store 4 out \n"  \
+  "ble       2f                                   @ bne\n"
 #define P2x2S2P0_AVG                                                   \
  "1:                                             @ main loop\n"       \
@@ -535,9 +594,9 @@ void pooling_basic(const float* din,
  "vadd.f32  q5, q2, q3                           @ add 0, 2, 4, 6 \n" \
  "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load d0-d3 \n"     \
  "vld2.f32  {d4-d7}, [%[dr1]]!                  @ load d4-d7 \n"      \
-  "vadd.f32  q6, q4, q5                           @ add reduce \n"     \
+  "vadd.f32  q8, q4, q5                           @ add reduce \n"     \
  "subs      %[cnt_num], #1                       @ subs \n"           \
-  "vmul.f32  q4, q6, %q[vcoef]                    @ mul coef \n"       \
+  "vmul.f32  q4, q8, %q[vcoef]                    @ mul coef \n"       \
  "vst1.f32  {d8-d9}, [%[dr_out]]!                @ store 4 out \n"    \
  "bne       1b                                   @ bne \n"
@@ -1037,17 +1096,17 @@ void pooling1x1s2p0_max(const float* din,
  TargetFree(TARGET(kARM), write_ptr);
 }
-void pooling2x2s2_max(const float* din,
+void pooling2x2s2p0_max(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right) {
+                        int pad_right) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
@@ -1095,7 +1154,7 @@ void pooling2x2s2_max(const float* din,
                [dr_out] "+r"(dr_out),
                [cnt_num] "+r"(cnt_num)
              :
-              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
 #endif
          dr0 -= 8;
          dr1 -= 8;
@@ -1121,18 +1180,18 @@ void pooling2x2s2_max(const float* din,
  }
 }
-void pooling2x2s2_avg(const float* din,
+void pooling2x2s2p0_avg(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      bool exclusive,
+                        bool exclusive,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right) {
+                        int pad_right) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
@@ -1158,12 +1217,14 @@ void pooling2x2s2_avg(const float* din,
      const float* data_in_channel = data_in_batch + c * size_channel_in;
      const float* r0 = data_in_channel;
      const float* r1 = r0 + win;
+      vcoef = vdupq_n_f32(0.25f);
      for (int h = 0; h < hout; h++) {
        float* dr_out = data_out_channel;
        auto dr0 = r0;
        auto dr1 = r1;
        if (h * S + K - P > hin) {
          dr1 = zero_ptr;
+          vcoef = vdupq_n_f32(0.5f);
        }
        int cnt_num = w_unroll_size;
        if (w_unroll_size > 0) {
@@ -1184,7 +1245,7 @@ void pooling2x2s2_avg(const float* din,
                [dr_out] "+r"(dr_out),
                [cnt_num] "+r"(cnt_num)
              : [vcoef] "w"(vcoef)
-              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
 #endif
          dr0 -= 8;
          dr1 -= 8;
@@ -1194,8 +1255,14 @@ void pooling2x2s2_avg(const float* din,
        int wstart = 0;
        for (int j = 0; j < w_unroll_remian; ++j) {
          int wend = std::min(wstart + K, rem);
-          float coef = 0.5f / (wend - wstart);
+          float coef = 0.25f;
          float tmp = 0.f;
+          if (wend - wstart == 1 && pad_right == 0) {
+            coef *= 2;
+          }
+          if (h * S + K - P > hin && pad_bottom == 0) {
+            coef *= 2;
+          }
          for (int i = wstart; i < wend; i++) {
            tmp += dr0[i] + dr1[i];
          }
@@ -1212,6 +1279,235 @@ void pooling2x2s2_avg(const float* din,
  TargetFree(TARGET(kARM), zero_ptr);
 }
+void pooling2x2s2p1_max(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right) {
+  int size_channel_out = wout * hout;
+  int size_channel_in = win * hin;
+  auto data_out = static_cast<float*>(dout);
+  auto data_in = static_cast<const float*>(din);
+  const int K = 2;
+  const int P = 1;
+  const int S = 2;
+  int w_unroll_size = wout / 4;
+  int w_unroll_remian = wout - w_unroll_size * 4;
+  float32x4_t vzero = vdupq_n_f32(std::numeric_limits<float>::lowest());
+  if (w_unroll_remian == 0) {
+    w_unroll_size -= 1;
+    w_unroll_remian = wout - w_unroll_size * 4;
+  }
+  for (int n = 0; n < num; ++n) {
+    float* data_out_batch = data_out + n * chout * size_channel_out;
+    const float* data_in_batch = data_in + n * chin * size_channel_in;
+#pragma omp parallel for
+    for (int c = 0; c < chout; c++) {
+      float* data_out_channel = data_out_batch + c * size_channel_out;
+      const float* data_in_channel = data_in_batch + c * size_channel_in;
+      const float* r0 = data_in_channel;
+      const float* r1 = r0 + win;
+      for (int h = 0; h < hout; h++) {
+        float* dr_out = data_out_channel;
+        auto dr0 = r0;
+        auto dr1 = r1;
+        if (h == 0) {
+          dr0 = r0;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
+        } else {
+          r0 = r1 + win;
+          r1 = r0 + win;
+        }
+        if (h * S + K - P > hin) {
+          dr1 = dr0;
+          if (h * S + K - P > hin + 1) {
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
+          }
+        }
+        int cnt_num = w_unroll_size;
+        if (w_unroll_size > 0) {
+#ifdef __aarch64__
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vzero] "w"(vzero)
+              : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
+#else
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vzero] "w"(vzero)
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
+#endif
+          dr0 -= 8;
+          dr1 -= 8;
+        }
+        // deal with right pad
+        int wstart = w_unroll_size * 4 * S - P;
+        for (int j = 0; j < w_unroll_remian; ++j) {
+          int wend = std::min(wstart + K, win);
+          int st = wstart > 0 ? wstart : 0;
+          float tmp = wend == st ? 0.f : dr0[0];
+          for (int i = 0; i < wend - st; i++) {
+            tmp = std::max(tmp, dr0[i]);
+            tmp = std::max(tmp, dr1[i]);
+          }
+          *(dr_out++) = tmp;
+          dr0 += S - (st - wstart);
+          dr1 += S - (st - wstart);
+          wstart += S;
+        }
+        data_out_channel += wout;
+      }
+    }
+  }
+}
+void pooling2x2s2p1_avg(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right) {
+  int size_channel_out = wout * hout;
+  int size_channel_in = win * hin;
+  auto data_out = static_cast<float*>(dout);
+  auto data_in = static_cast<const float*>(din);
+  const int K = 2;
+  const int P = 1;
+  const int S = 2;
+  int w_unroll_size = wout / 4;
+  int w_unroll_remian = wout - w_unroll_size * 4;
+  auto zero_ptr =
+      static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  memset(zero_ptr, 0, win * sizeof(float));
+  if (w_unroll_remian == 0) {
+    w_unroll_size -= 1;
+    w_unroll_remian = wout - w_unroll_size * 4;
+  }
+  for (int n = 0; n < num; ++n) {
+    float* data_out_batch = data_out + n * chout * size_channel_out;
+    const float* data_in_batch = data_in + n * chin * size_channel_in;
+#pragma omp parallel for
+    for (int c = 0; c < chout; c++) {
+      float* data_out_channel = data_out_batch + c * size_channel_out;
+      const float* data_in_channel = data_in_batch + c * size_channel_in;
+      const float* r0 = data_in_channel;
+      const float* r1 = r0 + win;
+      for (int h = 0; h < hout; h++) {
+        float* dr_out = data_out_channel;
+        auto dr0 = r0;
+        auto dr1 = r1;
+        float coef_h = 0.5f;
+        if (h == 0) {
+          dr0 = zero_ptr;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
+          if (exclusive) {
+            coef_h = 1.f;
+          }
+        } else {
+          r0 = r1 + win;
+          r1 = r0 + win;
+        }
+        if (h * S + K - P > hin) {
+          dr1 = zero_ptr;
+          if (exclusive) {
+            coef_h = 1.f;
+          }
+          if (h * S + K - P > hin + 1) {
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
+          }
+        }
+        float coef_left_most = exclusive ? coef_h : coef_h / 2;
+        float32x4_t vcoef = vdupq_n_f32(coef_h / 2);
+        float coef_left[4] = {
+            coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
+        float32x4_t vcoef_left = vld1q_f32(coef_left);
+        int cnt_num = w_unroll_size;
+        if (w_unroll_size > 0) {
+#ifdef __aarch64__
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
+              : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
+#else
+          asm volatile(
+              P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr_out] "+r"(dr_out),
+                [cnt_num] "+r"(cnt_num)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
+              : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
+#endif
+          dr0 -= 8;
+          dr1 -= 8;
+        }
+        // deal with right pad
+        int wstart = w_unroll_size * 4 * S - P;
+        for (int j = 0; j < w_unroll_remian; ++j) {
+          int wend = std::min(wstart + K, win);
+          int st = wstart > 0 ? wstart : 0;
+          float tmp = 0.f;
+          float coef = coef_h / 2;
+          if (exclusive && wend - st == 1) {
+            coef = coef_h;
+          }
+          for (int i = 0; i < wend - st; i++) {
+            tmp += dr0[i] + dr1[i];
+          }
+          *(dr_out++) = tmp * coef;
+          dr0 += S - (st - wstart);
+          dr1 += S - (st - wstart);
+          wstart += S;
+        }
+        data_out_channel += wout;
+      }
+    }
+  }
+  TargetFree(TARGET(kARM), zero_ptr);
+}
 void pooling3x3s1p1_max(const float* din,
                        float* dout,
                        int num,
@@ -2240,6 +2536,9 @@ void pooling3x3s2p0_max(const float* din,
    w_unroll_remian = wout - w_unroll_size * 4;
  }
+  int remain = w_unroll_remian - 1;
+  int right = wout * 2 + 1 - win;  // if need right pad
  for (int n = 0; n < num; ++n) {
    float* data_out_batch = data_out + n * chout * size_channel_out;
    const float* data_in_batch = data_in + n * chin * size_channel_in;
@@ -2266,6 +2565,7 @@ void pooling3x3s2p0_max(const float* din,
          }
        }
        int cnt_num = w_unroll_size;
+        int cnt_remain = remain;
        if (w_unroll_size > 0) {
 #ifdef __aarch64__
          asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
@@ -2289,46 +2589,80 @@ void pooling3x3s2p0_max(const float* din,
                         "v9",
                         "v10",
                         "v11");
-#else
-          asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
-                       : [dr0] "+r"(dr0),
-                         [dr1] "+r"(dr1),
-                         [dr2] "+r"(dr2),
-                         [dr_out] "+r"(dr_out),
-                         [cnt_num] "+r"(cnt_num)
-                       :
-                       : "cc",
-                         "memory",
-                         "q0",
-                         "q1",
-                         "q2",
-                         "q3",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11");
-#endif
          dr0 -= 8;
          dr1 -= 8;
          dr2 -= 8;
-        }
+          int rem = win - (w_unroll_size * 4) * S;
-        // deal with right pad
+          int wstart = 0;
-        int rem = win - (w_unroll_size * 4) * S;
+          for (int j = 0; j < w_unroll_remian; ++j) {
-        int wstart = 0;
+            int wend = std::min(wstart + K, rem);
-        for (int j = 0; j < w_unroll_remian; ++j) {
+            float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
-          int wend = std::min(wstart + K, rem);
+            for (int i = wstart; i < wend; i++) {
-          float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
+              tmp = std::max(tmp, dr0[i]);
-          for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, dr1[i]);
-            tmp = std::max(tmp, dr0[i]);
+              tmp = std::max(tmp, dr2[i]);
-            tmp = std::max(tmp, dr1[i]);
+            }
-            tmp = std::max(tmp, dr2[i]);
+            *(dr_out++) = tmp;
+            wstart += S;
          }
-          *(dr_out++) = tmp;
+#else
-          wstart += S;
+          asm volatile(
+              P3x3S2P0_INIT P3x3S2P0_MAX
+              "cmp       %[remain], #0                         @cmp cnt_num\n"
+              "sub       %[dr0], #32                           @sub - 8\n"
+              "sub       %[dr1], #32                           @sub - 8\n"
+              "sub       %[dr2], #32                           @sub - 8\n"
+              "ble       4f                                    @ble exit1\n"
+              "2:                                              @mid loop\n"
+              "vld1.f32  {d0-d1}, [%[dr0]]!                    @load \n"
+              "vld1.f32  {d2-d3}, [%[dr1]]!                    @load \n"
+              "vld1.f32  {d4-d5}, [%[dr2]]!                    @load \n"
+              "vmov.f32  s3,s2                                 @mov \n"
+              "vmov.f32  s7,s6                                 @mov \n"
+              "vmov.f32  s11,s10                               @mov \n"
+              "vmax.f32  q0, q0, q1                            @max n"
+              "sub       %[dr0], #8                            @add w \n"
+              "sub       %[dr1], #8                            @add w \n"
+              "sub       %[dr2], #8                            @add w \n"
+              "vmax.f32  q0, q0, q2                            @max \n"
+              "vpmax.f32 d0, d0, d1                            @pmax \n"
+              "vpmax.f32 d0, d0, d0                            @pmax \n"
+              "subs      %[remain], #1                         @subs \n"
+              "vst1.f32  d0[0], [%[dr_out]]!                   @vst \n"
+              "bne       2b                                    @bne \n"
+              "4:                                              @exit\n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr2] "+r"(dr2),
+                [dr_out] "+r"(dr_out),
+                [remain] "+r"(cnt_remain),
+                [cnt_num] "+r"(cnt_num)
+              :
+              : "cc",
+                "memory",
+                "q0",
+                "q1",
+                "q2",
+                "q3",
+                "q4",
+                "q5",
+                "q6",
+                "q7",
+                "q8",
+                "q9",
+                "q10",
+                "q11");
+          if (right) {
+            int wstart = (w_unroll_size * 4 + remain) * S;
+            int wend = std::min(wstart + K, win);
+            float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
+            for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
+              tmp = std::max(tmp, dr2[i]);
+            }
+            *(dr_out++) = tmp;
+          }
+#endif
        }
        r0 = r2;

--- a/lite/backends/arm/math/pooling.h
+++ b/lite/backends/arm/math/pooling.h
@@ -76,30 +76,55 @@ void pooling1x1s2p0_max(const float* din,
                        int pad_bottom,
                        int pad_right);
-void pooling2x2s2_max(const float* din,
+void pooling2x2s2p0_max(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right);
+                        int pad_right);
-void pooling2x2s2_avg(const float* din,
+void pooling2x2s2p0_avg(const float* din,
-                      float* dout,
+                        float* dout,
-                      int num,
+                        int num,
-                      int chout,
+                        int chout,
-                      int hout,
+                        int hout,
-                      int wout,
+                        int wout,
-                      int chin,
+                        int chin,
-                      int hin,
+                        int hin,
-                      int win,
+                        int win,
-                      bool exclusive,
+                        bool exclusive,
-                      int pad_bottom,
+                        int pad_bottom,
-                      int pad_right);
+                        int pad_right);
+void pooling2x2s2p1_max(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right);
+void pooling2x2s2p1_avg(const float* din,
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right);
 void pooling3x3s1p1_max(const float* din,
                        float* dout,

--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -50,13 +50,14 @@ class PoolingPE : public PE {
    PoolingArgs args = {0};
    args.mode = param_.type;
+    auto paddings = *param_.paddings;
    args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
    args.image.address = input->data<float16>();
    args.image.channels = input->shape().channel();
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
-    args.image.pad_height = param_.paddings[0];
+    args.image.pad_height = paddings[0];
-    args.image.pad_width = param_.paddings[1];
+    args.image.pad_width = paddings[2];
    args.image.scale_address = input->scale();
    args.output.address = output->mutableData<float16>();
    args.output.scale_address = output->scale();
@@ -69,8 +70,7 @@ class PoolingPE : public PE {
    param_.poolingArgs = args;
    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
-    // &&
+    // && (k_width > 7 || k_height > 7);
-    //            (k_width > 7 || k_height > 7);
    use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
               (k_width > 255 || k_height > 255);
    // use_cpu_ = param_.type == AVERAGE;
@@ -86,12 +86,13 @@ class PoolingPE : public PE {
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float16* data_out = output->data<float16>();
+    auto paddings = *param_.paddings;
    int image_height = input->shape().height();
    int image_width = input->shape().width();
    int image_channels = input->shape().channel();
-    int image_pad_h = param_.paddings[0];
+    int image_pad_h = paddings[0];
-    int image_pad_w = param_.paddings[1];
+    int image_pad_w = paddings[2];
    int kernel_height = param_.kernelSize[1];
    int kernel_width = param_.kernelSize[0];
    int kernel_step_h = param_.strides[0];

--- a/lite/kernels/arm/pool_compute.cc
+++ b/lite/kernels/arm/pool_compute.cc
@@ -58,6 +58,7 @@ void PoolCompute::Run() {
  bool global_pooling = (paddings[0] == 0) && (ksize[0] == in_dims[2]) &&
                        (ksize[1] == in_dims[3]) && kps_equal && pads_equal;
  global_pooling = param.global_pooling || global_pooling;
  if (global_pooling) {
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[2 * i] = 0;
@@ -107,35 +108,65 @@ void PoolCompute::Run() {
    } else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 &&
               kps_equal) {
      if (pooling_type == "max") {
-        lite::arm::math::pooling2x2s2_max(din,
+        lite::arm::math::pooling2x2s2p0_max(din,
-                                          dout,
+                                            dout,
-                                          out_dims[0],
+                                            out_dims[0],
-                                          out_dims[1],
+                                            out_dims[1],
-                                          out_dims[2],
+                                            out_dims[2],
-                                          out_dims[3],
+                                            out_dims[3],
-                                          in_dims[1],
+                                            in_dims[1],
-                                          in_dims[2],
+                                            in_dims[2],
-                                          in_dims[3],
+                                            in_dims[3],
-                                          paddings[1],
+                                            paddings[1],
-                                          paddings[3]);
+                                            paddings[3]);
        return;
      } else if (pooling_type == "avg") {
-        lite::arm::math::pooling2x2s2_avg(din,
+        lite::arm::math::pooling2x2s2p0_avg(din,
-                                          dout,
+                                            dout,
-                                          out_dims[0],
+                                            out_dims[0],
-                                          out_dims[1],
+                                            out_dims[1],
-                                          out_dims[2],
+                                            out_dims[2],
-                                          out_dims[3],
+                                            out_dims[3],
-                                          in_dims[1],
+                                            in_dims[1],
-                                          in_dims[2],
+                                            in_dims[2],
-                                          in_dims[3],
+                                            in_dims[3],
-                                          exclusive,
+                                            exclusive,
-                                          paddings[1],
+                                            paddings[1],
-                                          paddings[3]);
+                                            paddings[3]);
        return;
      }
-    } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
+    } else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 1 &&
               kps_equal) {
+      if (pooling_type == "max") {
+        lite::arm::math::pooling2x2s2p1_max(din,
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            paddings[1],
+                                            paddings[3]);
+        return;
+      } else if (pooling_type == "avg") {
+        lite::arm::math::pooling2x2s2p1_avg(din,
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            exclusive,
+                                            paddings[1],
+                                            paddings[3]);
+        return;
+      }
+    } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s1p1_max(din,
                                            dout,
@@ -165,7 +196,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s1p0_max(din,
                                            dout,
@@ -195,7 +226,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s2p0_max(din,
                                            dout,
@@ -225,7 +256,7 @@ void PoolCompute::Run() {
        return;
      }
    } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
-               kps_equal) {
+               pads_equal && kps_equal) {
      if (pooling_type == "max") {
        lite::arm::math::pooling3x3s2p1_max(din,
                                            dout,

--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
@@ -678,15 +678,9 @@ void resize(const uint8_t* src,
  } else if (srcFormat == NV12 || srcFormat == NV21) {
    nv21_resize(src, dst, srcw, srch, dstw, dsth);
    return;
-    num = 1;
-    int hout = static_cast<int>(0.5 * dsth);
-    dsth += hout;
  } else if (srcFormat == BGR || srcFormat == RGB) {
    bgr_resize(src, dst, srcw, srch, dstw, dsth);
    return;
-    w_in = srcw * 3;
-    w_out = dstw * 3;
-    num = 3;
  } else if (srcFormat == BGRA || srcFormat == RGBA) {
    w_in = srcw * 4;
    w_out = dstw * 4;