diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc
index cde58fb77b49f06187c24298229395532a60857e..651bfb06916e15e7f95212a82f16619ae8f16d6a 100644
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -206,19 +206,19 @@ void pooling_basic(const float* din,
   "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
   "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/
 
-#define P2x2S2P1_MAX                                               \
-  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */        \
-  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */        \
-  "sub %[dr0], %[dr0], #4\n"             /* sub */                 \
-  "sub %[dr1], %[dr1], #4\n"             /* sub */                 \
-  "fmax  v4.4s, v0.4s, v6.4s\n"          /*  max */                \
-  "fmax  v5.4s, v2.4s, v8.4s\n"          /*  max */                \
-  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
-  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \
-  "fmax  v6.4s, v4.4s, v5.4s\n"          /* max reduce */          \
-  "subs %w[cnt_num], %w[cnt_num], #1\n"  /* subs cnt_num, #1*/     \
-  "st1  {v6.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */ \
-  "ble       2f\n"                       /* bne s3_max_loop_mid */
+#define P2x2S2P1_MAX                                                  \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */           \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */           \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                 \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                 \
+  "fmax  v4.4s, v0.4s, v6.4s\n"             /*  max */                \
+  "fmax  v5.4s, v2.4s, v8.4s\n"             /*  max */                \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"    /* load q0-q1, dr0, 0-7*/ \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"    /* load q2-q3, dr1, 0-7*/ \
+  "fmax  v6.4s, v4.4s, v5.4s\n"             /* max reduce */          \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"     /* subs cnt_num, #1*/     \
+  "st1  {v6.4s}, [%[dr_out]], #16\n"        /* store 4 out, dr_out */ \
+  "ble       2f\n"                          /* bne s3_max_loop_mid */
 
 #define P2x2S2P0_MAX                                               \
   "1: \n"                                                          \
@@ -231,20 +231,20 @@ void pooling_basic(const float* din,
   "st1  {v6.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */ \
   "bne       1b\n"                       /* bne s3_max_loop_mid */
 
-#define P2x2S2P1_AVG                                                         \
-  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */        \
-  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */        \
-  "sub %[dr0], %[dr0], #4\n"             /* sub */                 \
-  "sub %[dr1], %[dr1], #4\n"             /* sub */                 \
-  "fadd v4.4s, v0.4s, v6.4s\n"           /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
-  "fadd v5.4s, v2.4s, v8.4s\n"           /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
-  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/           \
-  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/           \
-  "fadd v6.4s, v4.4s, v5.4s\n"           /* add reduce */                    \
-  "subs %w[cnt_num], %w[cnt_num], #1\n"  /* subs cnt_num, #1*/               \
-  "fmul v4.4s, v6.4s, %[vcoef_left].4s\n"     /* mul coef */                      \
-  "st1  {v4.4s}, [%[dr_out]], #16\n"     /* store 4 out, dr_out */           \
-  "ble       2f\n"                       /* bne s3_max_loop_mid */
+#define P2x2S2P1_AVG                                                            \
+  "ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */                     \
+  "ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */                     \
+  "sub %[dr0], %[dr0], #4\n"                /* sub */                           \
+  "sub %[dr1], %[dr1], #4\n"                /* sub */                           \
+  "fadd v4.4s, v0.4s, v6.4s\n"              /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "fadd v5.4s, v2.4s, v8.4s\n"              /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
+  "ld2  {v0.4s, v1.4s}, [%[dr0]], #32\n"    /* load q0-q1, dr0, 0-7*/           \
+  "ld2  {v2.4s, v3.4s}, [%[dr1]], #32\n"    /* load q2-q3, dr1, 0-7*/           \
+  "fadd v6.4s, v4.4s, v5.4s\n"              /* add reduce */                    \
+  "subs %w[cnt_num], %w[cnt_num], #1\n"     /* subs cnt_num, #1*/               \
+  "fmul v4.4s, v6.4s, %[vcoef_left].4s\n"   /* mul coef */                      \
+  "st1  {v4.4s}, [%[dr_out]], #16\n"        /* store 4 out, dr_out */           \
+  "ble       2f\n"                          /* bne s3_max_loop_mid */
 
 #define P2x2S2P0_AVG                                                         \
   "1: \n"                                /* load bias to q2, q3*/            \
@@ -549,8 +549,8 @@ void pooling_basic(const float* din,
   "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"
 
 #define P2x2S2P1_MAX                                                  \
-  "vext.32 q4, %q[vzero], q1, #3                 @ 1357-0135\n"      \
-  "vext.32 q5, %q[vzero], q3, #3                 @ 1357-0135\n"      \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"      \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"      \
   "sub %[dr0], #4                                 @sub \n"            \
   "sub %[dr1], #4                                 @sub \n"            \
   "vmax.f32  q8, q0, q4                           @ max \n"           \
@@ -558,7 +558,7 @@ void pooling_basic(const float* din,
   "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"          \
   "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"          \
   "vmax.f32  q5, q9, q8                           @ max reduce\n"     \
-  "subs   %[cnt_num], #1                       @ subs cnt_num \n"  \
+  "subs   %[cnt_num], #1                          @ subs cnt_num \n"  \
   "vst1.f32  {d10-d11}, [%[dr_out]]!              @ store 4 out \n"   \
   "ble       2f                                   @ bne \n"
   
@@ -574,8 +574,8 @@ void pooling_basic(const float* din,
   "bne       1b                                   @ bne \n"
 
 #define P2x2S2P1_AVG                                                  \
-  "vext.32 q4, %q[vzero], q1, #3                 @ 1357-0135\n"      \
-  "vext.32 q5, %q[vzero], q3, #3                 @ 1357-0135\n"      \
+  "vext.32 q4, %q[vzero], q1, #3                  @ 1357-0135\n"      \
+  "vext.32 q5, %q[vzero], q3, #3                  @ 1357-0135\n"      \
   "sub %[dr0], #4                                 @sub \n"            \
   "sub %[dr1], #4                                 @sub \n"            \
   "vadd.f32  q9, q0, q4                           @ max \n"           \
@@ -583,9 +583,9 @@ void pooling_basic(const float* din,
   "vld2.f32  {d0-d3}, [%[dr0]]!                   @ load \n"          \
   "vld2.f32  {d4-d7}, [%[dr1]]!                   @ load \n"          \
   "vadd.f32  q5, q9, q8                           @ max reduce\n"     \
-  "subs   %[cnt_num],  %[cnt_num], #1                       @ subs cnt_num \n"  \
-  "vmul.f32  q4, q5, %q[vcoef_left]                    @ mul coef \n"       \
-  "vst1.f32  {d8-d9}, [%[dr_out]]!              @ store 4 out \n"   \
+  "subs      %[cnt_num], #1                       @ subs cnt_num \n"  \
+  "vmul.f32  q4, q5, %q[vcoef_left]               @ mul coef \n"      \
+  "vst1.f32  {d8-d9}, [%[dr_out]]!                @ store 4 out \n"   \
   "ble       2f                                   @ bne\n"
 
 #define P2x2S2P0_AVG                                                   \
@@ -1097,16 +1097,16 @@ void pooling1x1s2p0_max(const float* din,
 }
 
 void pooling2x2s2p0_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      int pad_bottom,
-                      int pad_right) {
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right) {
   int size_channel_out = wout * hout;
   int size_channel_in = win * hin;
   auto data_out = static_cast<float*>(dout);
@@ -1181,17 +1181,17 @@ void pooling2x2s2p0_max(const float* din,
 }
 
 void pooling2x2s2p0_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive,
-                      int pad_bottom,
-                      int pad_right) {
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right) {
   int size_channel_out = wout * hout;
   int size_channel_in = win * hin;
   auto data_out = static_cast<float*>(dout);
@@ -1258,10 +1258,10 @@ void pooling2x2s2p0_avg(const float* din,
           float coef = 0.25f;
           float tmp = 0.f;
           if (wend - wstart == 1 && pad_right == 0) {
-             coef *= 2;
+            coef *= 2;
           }
           if (h * S + K - P > hin && pad_bottom == 0) {
-              coef *= 2;
+            coef *= 2;
           }
           for (int i = wstart; i < wend; i++) {
             tmp += dr0[i] + dr1[i];
@@ -1280,16 +1280,16 @@ void pooling2x2s2p0_avg(const float* din,
 }
 
 void pooling2x2s2p1_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      int pad_bottom,
-                      int pad_right) {
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right) {
   int size_channel_out = wout * hout;
   int size_channel_in = win * hin;
   auto data_out = static_cast<float*>(dout);
@@ -1321,19 +1321,19 @@ void pooling2x2s2p1_max(const float* din,
         auto dr0 = r0;
         auto dr1 = r1;
         if ( h == 0 ) {
-           dr0 = r0;
-           dr1 = r0;
-           r0 = r1;
-           r1 = r0 + win;
+          dr0 = r0;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
         } else {
           r0 = r1 + win;
-          r1 = r0 + win; 
+          r1 = r0 + win;
         }
         if (h * S + K - P > hin) {
           dr1 = dr0;
           if (h * S + K - P > hin + 1) {
-              memset(dr_out, 0, wout * sizeof(float));
-              continue;
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
           }
         }
         int cnt_num = w_unroll_size;
@@ -1345,19 +1345,16 @@ void pooling2x2s2p1_max(const float* din,
                 [dr1] "+r"(dr1),
                 [dr_out] "+r"(dr_out),
                 [cnt_num] "+r"(cnt_num)
-              : [vzero] "w" (vzero)
+              : [vzero] "w"(vzero)
               : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
-#else
-       //   cnt_num -= 1; 
+#else 
           asm volatile(
-              P2x2S2_INIT 
-              P2x2S2P1_MAX
-              P2x2S2P0_MAX "2: \n" /* end */
+              P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
               : [dr0] "+r"(dr0),
                 [dr1] "+r"(dr1),
                 [dr_out] "+r"(dr_out),
                 [cnt_num] "+r"(cnt_num)
-              : [vzero] "w" (vzero)
+              : [vzero] "w"(vzero)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
 #endif
           dr0 -= 8;
@@ -1385,17 +1382,17 @@ void pooling2x2s2p1_max(const float* din,
 }
 
 void pooling2x2s2p1_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive,
-                      int pad_bottom,
-                      int pad_right) {
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right) {
   int size_channel_out = wout * hout;
   int size_channel_in = win * hin;
   auto data_out = static_cast<float*>(dout);
@@ -1432,12 +1429,12 @@ void pooling2x2s2p1_avg(const float* din,
         auto dr1 = r1;
         float coef_h = 0.5f;
         if ( h == 0 ) {
-           dr0 = zero_ptr;
-           dr1 = r0;
-           r0 = r1;
-           r1 = r0 + win;
-           if (exclusive) {
-             coef_h = 1.f;
+          dr0 = zero_ptr;
+          dr1 = r0;
+          r0 = r1;
+          r1 = r0 + win;
+          if (exclusive) {
+            coef_h = 1.f;
           }
         } else {
           r0 = r1 + win;
@@ -1446,16 +1443,17 @@ void pooling2x2s2p1_avg(const float* din,
         if (h * S + K - P > hin) {
           dr1 = zero_ptr;
           if (exclusive) {
-             coef_h = 1.f;
+            coef_h = 1.f;
           }
           if (h * S + K - P > hin + 1) {
-              memset(dr_out, 0, wout * sizeof(float));
-              continue;
+            memset(dr_out, 0, wout * sizeof(float));
+            continue;
           }
         }
         float coef_left_most = exclusive ? coef_h : coef_h / 2;
         float32x4_t vcoef = vdupq_n_f32(coef_h / 2);
-        float coef_left[4] = {coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
+        float coef_left[4] =
+            {coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
         float32x4_t vcoef_left = vld1q_f32(coef_left);
         int cnt_num = w_unroll_size;
         if (w_unroll_size > 0) {
@@ -1466,16 +1464,20 @@ void pooling2x2s2p1_avg(const float* din,
                 [dr1] "+r"(dr1),
                 [dr_out] "+r"(dr_out),
                 [cnt_num] "+r"(cnt_num)
-              : [vcoef] "w"(vcoef), [vzero] "w"(vzero), [vcoef_left] "w"(vcoef_left)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
               : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
 #else
           asm volatile(
-              P2x2S2_INIT  P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
+              P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
               : [dr0] "+r"(dr0),
                 [dr1] "+r"(dr1),
                 [dr_out] "+r"(dr_out),
                 [cnt_num] "+r"(cnt_num)
-              : [vcoef] "w"(vcoef), [vzero] "w"(vzero), [vcoef_left] "w"(vcoef_left)
+              : [vcoef] "w"(vcoef),
+                [vzero] "w"(vzero),
+                [vcoef_left] "w"(vcoef_left)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
 #endif
           dr0 -= 8;
@@ -1489,7 +1491,7 @@ void pooling2x2s2p1_avg(const float* din,
           float tmp = 0.f;
           float coef = coef_h / 2;
           if (exclusive && wend - st == 1) {
-             coef = coef_h;
+            coef = coef_h;
           }
           for (int i = 0; i < wend - st; i++) {
             tmp += dr0[i] + dr1[i];
@@ -2535,7 +2537,7 @@ void pooling3x3s2p0_max(const float* din,
   }
 
   int remain = w_unroll_remian - 1;
-  int right = wout * 2 + 1 - win; // if need right pad
+  int right = wout * 2 + 1 - win;  // if need right pad
 
   for (int n = 0; n < num; ++n) {
     float* data_out_batch = data_out + n * chout * size_channel_out;
@@ -2571,7 +2573,6 @@ void pooling3x3s2p0_max(const float* din,
                          [dr1] "+r"(dr1),
                          [dr2] "+r"(dr2),
                          [dr_out] "+r"(dr_out),
-                         [remain] "+r" (cnt_remain),
                          [cnt_num] "+r"(cnt_num)
                        :
                        : "cc",
@@ -2594,76 +2595,73 @@ void pooling3x3s2p0_max(const float* din,
           int rem = win - (w_unroll_size * 4) * S;
           int wstart = 0;
           for (int j = 0; j < w_unroll_remian; ++j) {
-          int wend = std::min(wstart + K, rem);
-          float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
-          for (int i = wstart; i < wend; i++) {
-            tmp = std::max(tmp, dr0[i]);
-            tmp = std::max(tmp, dr1[i]);
-            tmp = std::max(tmp, dr2[i]);
+            int wend = std::min(wstart + K, rem);
+            float tmp = dr0[wstart];  // std::numeric_limits<float>::min();
+            for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, dr0[i]);
+              tmp = std::max(tmp, dr1[i]);
+              tmp = std::max(tmp, dr2[i]);
+            }
+            *(dr_out++) = tmp;
+            wstart += S;
           }
-          *(dr_out++) = tmp;
-          wstart += S;
-        }
 #else
           asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
-                       "cmp       %[remain], #0                           @cmp cnt_num, 0\n"
-                       "sub       %[dr0], #32                            @sub - 8\n"
-                       "sub       %[dr1], #32                            @sub - 8\n"
-                       "sub       %[dr2], #32                            @sub - 8\n"
-                       "ble       4f                                 @ble exit1\n"
-                       "2:                              @mid loop\n"
-                          "vld1.f32  {d0-d1}, [%[dr0]]!                     @load d0-d1, dr0\n"
-                          "vld1.f32  {d2-d3}, [%[dr1]]!                    @load d2-d3, dr1\n"
-                          "vld1.f32  {d4-d5}, [%[dr2]]!                     @load d2-d3, dr1\n"
-                          "vmov.f32  s3,s2                                 @movs3, s2\n"
-                          "vmov.f32  s7,s6                                 @movs7, s6\n"
-                          "vmov.f32  s11,s10                               @movs11, s10\n"
-                          "vmax.f32  q0, q0, q1                            @max q0, q0, q1\n"
-                          "sub       %[dr0], #8                            @add w, 6\n"
-                          "sub       %[dr1], #8                            @add w, 6\n"
-                          "sub       %[dr2], #8                            @add w, 6\n"
-                          "vmax.f32  q0, q0, q2                            @max q0, q0, q2\n"
-                          "vpmax.f32 d0, d0, d1                            @pmax d0, d0,d1\n"
-                          "vpmax.f32 d0, d0, d0                            @pmax d0, d0, d0\n"
-                          "subs      %[remain], #1                       @subs cnt_num, #1\n"
-                          "vst1.f32  d0[0], [%[dr_out]]!                   @vst  d0[0], dr_out\n"
-                          "bne       2b                     @bne s3_max_loop_mid_1\n"
-                          "4:                                           @exit\n"
-                       : [dr0] "+r"(dr0),
-                         [dr1] "+r"(dr1),
-                         [dr2] "+r"(dr2),
-                         [dr_out] "+r"(dr_out),
-                         [remain] "+r" (cnt_remain),
-                         [cnt_num] "+r"(cnt_num)
-                       :
-                       : "cc",
-                         "memory",
-                         "q0",
-                         "q1",
-                         "q2",
-                         "q3",
-                         "q4",
-                         "q5",
-                         "q6",
-                         "q7",
-                         "q8",
-                         "q9",
-                         "q10",
-                         "q11");
-#endif
-          // dr0 -= 8;
-          // dr1 -= 8;
-          // dr2 -= 8;
-          if (right){
+              "cmp       %[remain], #0                           @cmp cnt_num, 0\n"
+              "sub       %[dr0], #32                            @sub - 8\n"
+              "sub       %[dr1], #32                            @sub - 8\n"
+              "sub       %[dr2], #32                            @sub - 8\n"
+              "ble       4f                                 @ble exit1\n"
+              "2:                              @mid loop\n"
+              "vld1.f32  {d0-d1}, [%[dr0]]!                     @load d0-d1, dr0\n"
+              "vld1.f32  {d2-d3}, [%[dr1]]!                    @load d2-d3, dr1\n"
+              "vld1.f32  {d4-d5}, [%[dr2]]!                     @load d2-d3, dr1\n"
+              "vmov.f32  s3,s2                                 @movs3, s2\n"
+              "vmov.f32  s7,s6                                 @movs7, s6\n"
+              "vmov.f32  s11,s10                               @movs11, s10\n"
+              "vmax.f32  q0, q0, q1                            @max q0, q0, q1\n"
+              "sub       %[dr0], #8                            @add w, 6\n"
+              "sub       %[dr1], #8                            @add w, 6\n"
+              "sub       %[dr2], #8                            @add w, 6\n"
+              "vmax.f32  q0, q0, q2                            @max q0, q0, q2\n"
+              "vpmax.f32 d0, d0, d1                            @pmax d0, d0,d1\n"
+              "vpmax.f32 d0, d0, d0                            @pmax d0, d0, d0\n"
+              "subs      %[remain], #1                       @subs cnt_num, #1\n"
+              "vst1.f32  d0[0], [%[dr_out]]!                   @vst  d0[0], dr_out\n"
+              "bne       2b                     @bne s3_max_loop_mid_1\n"
+              "4:                                           @exit\n"
+              : [dr0] "+r"(dr0),
+                [dr1] "+r"(dr1),
+                [dr2] "+r"(dr2),
+                [dr_out] "+r"(dr_out),
+                [remain] "+r"(cnt_remain),
+                [cnt_num] "+r"(cnt_num)
+              :
+              : "cc",
+                "memory",
+                "q0",
+                "q1",
+                "q2",
+                "q3",
+                "q4",
+                "q5",
+                "q6",
+                "q7",
+                "q8",
+                "q9",
+                "q10",
+                "q11");
+          if (right) {
             int wstart = (w_unroll_size * 4 + remain) * S;
             int wend = std::min(wstart + K, win);
-            float tmp = dr0[wstart];//std::numeric_limits<float>::min();
-            for(int i = wstart; i < wend; i++){
-              tmp = std::max(tmp,std::max(dr0[i],dr1[i]));
-              tmp = std::max(tmp,dr2[i]);
+            float tmp = dr0[wstart];  //std::numeric_limits<float>::min();
+            for (int i = wstart; i < wend; i++) {
+              tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
+              tmp = std::max(tmp, dr2[i]);
             }
             *(dr_out++) = tmp;
           }
+#endif
         }
 
         r0 = r2;
diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h
index 7d75bab5ddbc0aa08c7e1059379c49921dc2cabd..572919e3f083f736d8f49b3bae0dd2820fac35c4 100644
--- a/lite/backends/arm/math/pooling.h
+++ b/lite/backends/arm/math/pooling.h
@@ -77,54 +77,54 @@ void pooling1x1s2p0_max(const float* din,
                         int pad_right);
 
 void pooling2x2s2p0_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      int pad_bottom,
-                      int pad_right);
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right);
 
 void pooling2x2s2p0_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive,
-                      int pad_bottom,
-                      int pad_right);
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right);
 
 void pooling2x2s2p1_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      int pad_bottom,
-                      int pad_right);
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        int pad_bottom,
+                        int pad_right);
 
 void pooling2x2s2p1_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive,
-                      int pad_bottom,
-                      int pad_right);
+                        float* dout,
+                        int num,
+                        int chout,
+                        int hout,
+                        int wout,
+                        int chin,
+                        int hin,
+                        int win,
+                        bool exclusive,
+                        int pad_bottom,
+                        int pad_right);
 
 void pooling3x3s1p1_max(const float* din,
                         float* dout,
diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp
index c8ea65fce4e02ad553daba79a019d9055d1128f5..2bc4f91f1d8c76b243a0ffb4a083f8d6ab138553 100644
--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -70,8 +70,7 @@ class PoolingPE : public PE {
     param_.poolingArgs = args;
 
     // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
-    // &&
-    //            (k_width > 7 || k_height > 7);
+    // && (k_width > 7 || k_height > 7);
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
                (k_width > 255 || k_height > 255);
     // use_cpu_ = param_.type == AVERAGE;
diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc
index 7a383f297c49600d44f5ac83badb85fd2afb69b3..5cfca8f1b7d9a286d24dda5af5664aa381c8e0f1 100644
--- a/lite/kernels/arm/pool_compute.cc
+++ b/lite/kernels/arm/pool_compute.cc
@@ -109,60 +109,60 @@ void PoolCompute::Run() {
                kps_equal) {
       if (pooling_type == "max") {
         lite::arm::math::pooling2x2s2p0_max(din,
-                                          dout,
-                                          out_dims[0],
-                                          out_dims[1],
-                                          out_dims[2],
-                                          out_dims[3],
-                                          in_dims[1],
-                                          in_dims[2],
-                                          in_dims[3],
-                                          paddings[1],
-                                          paddings[3]);
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            paddings[1],
+                                            paddings[3]);
         return;
       } else if (pooling_type == "avg") {
         lite::arm::math::pooling2x2s2p0_avg(din,
-                                          dout,
-                                          out_dims[0],
-                                          out_dims[1],
-                                          out_dims[2],
-                                          out_dims[3],
-                                          in_dims[1],
-                                          in_dims[2],
-                                          in_dims[3],
-                                          exclusive,
-                                          paddings[1],
-                                          paddings[3]);
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            exclusive,
+                                            paddings[1],
+                                            paddings[3]);
         return;
       }
     } else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 1 &&
                kps_equal) {
       if (pooling_type == "max") {
         lite::arm::math::pooling2x2s2p1_max(din,
-                                          dout,
-                                          out_dims[0],
-                                          out_dims[1],
-                                          out_dims[2],
-                                          out_dims[3],
-                                          in_dims[1],
-                                          in_dims[2],
-                                          in_dims[3],
-                                          paddings[1],
-                                          paddings[3]);
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            paddings[1],
+                                            paddings[3]);
         return;
       } else if (pooling_type == "avg") {
         lite::arm::math::pooling2x2s2p1_avg(din,
-                                          dout,
-                                          out_dims[0],
-                                          out_dims[1],
-                                          out_dims[2],
-                                          out_dims[3],
-                                          in_dims[1],
-                                          in_dims[2],
-                                          in_dims[3],
-                                          exclusive,
-                                          paddings[1],
-                                          paddings[3]);
+                                            dout,
+                                            out_dims[0],
+                                            out_dims[1],
+                                            out_dims[2],
+                                            out_dims[3],
+                                            in_dims[1],
+                                            in_dims[2],
+                                            in_dims[3],
+                                            exclusive,
+                                            paddings[1],
+                                            paddings[3]);
         return;
       }
     } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc
index aba5815741734342a367d3316de14cdae6b1c896..51f67a1c6f0122c1140aeb762b448a928bd16692 100644
--- a/lite/kernels/npu/bridges/pool_op.cc
+++ b/lite/kernels/npu/bridges/pool_op.cc
@@ -64,20 +64,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     return FAILED;
   }
 
-  auto padding = op_info->GetAttr<std::vector<int>>("paddings");
-  bool pads_equal = (padding[0] == padding[1]) && (padding[2] == padding[3]);
-  if (!pads_equal) {
-    LOG(FATAL)
-        << "padding requires pad_left == pad_right, pad_top == pad_bottom";
-  }
-  auto npu_pad =
-      ge::AttrValue::LIST_INT{padding[0], padding[1], padding[2], padding[3]};
-  auto strides = op_info->GetAttr<std::vector<int>>("strides");
-  auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end());
-  int npu_ceil_mode = 0;
-  if (op_info->HasAttr("ceil_mode")) {
-    npu_ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
-
   // pad mode
   int pad_mode = 0;
   std::string padding_algorithm("");
diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc
deleted file mode 100644
index 7efc6b464c00c945c71c8c5689e18823cde10f97..0000000000000000000000000000000000000000
--- a/lite/kernels/xpu/bridges/pool_op_test.cc
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/pool_op.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/xpu/bridges/registry.h"
-#include "lite/kernels/xpu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace xpu {
-namespace bridges {
-
-void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto& in_dims = x->dims();
-  auto& out_dims = out->dims();
-
-  const float* src_ptr = x->data<const float>();
-  float* dst_ptr = out->mutable_data<float>();
-
-  std::vector<int> ksize = op_info->GetAttr<std::vector<int>>("ksize");
-  std::vector<int> strides = op_info->GetAttr<std::vector<int>>("strides");
-  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
-  bool exclusive = op_info->GetAttr<bool>("exclusive");
-  std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
-  bool global_pooling = op_info->GetAttr<bool>("global_pooling");
-
-  int in_n = in_dims[0];
-  int in_c = in_dims[1];
-  int in_h = in_dims[2];
-  int in_w = in_dims[3];
-  int size_in_n = in_c * in_h * in_w;
-  int size_in_c = in_h * in_w;
-
-  int out_h = out_dims[2];
-  int out_w = out_dims[3];
-  int size_out_n = in_c * out_h * out_w;
-  int size_out_c = out_h * out_w;
-
-  int window_h = ksize[0];
-  int window_w = ksize[1];
-  int stride_h = strides[0];
-  int stride_w = strides[1];
-  int pad_h = paddings[0];
-  int pad_w = paddings[2];
-
-  if (global_pooling == true) {
-    for (int n = 0; n < in_n; ++n) {
-      for (int c = 0; c < in_c; ++c) {
-        const float* src = src_ptr + n * size_in_n + c * size_in_c;
-        float res = src[0];
-        if (pooling_type == "max") {
-          for (int i = 1; i < size_in_c; ++i) {
-            float cur_val = src[i];
-            res = cur_val > res ? cur_val : res;
-          }
-        } else if (pooling_type == "avg") {
-          for (int i = 1; i < size_in_c; ++i) {
-            float cur_val = src[i];
-            res += cur_val;
-          }
-          res /= size_in_c;
-        }
-        dst_ptr[n * size_out_n + c] = res;
-      }
-    }
-  } else {
-    for (int n = 0; n < in_n; ++n) {
-      for (int c = 0; c < in_c; ++c) {
-        for (int h = 0; h < out_h; ++h) {
-          int sh = h * stride_h;
-          int eh = sh + window_h;
-          sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
-          eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
-          for (int w = 0; w < out_w; ++w) {
-            int sw = w * stride_w;
-            int ew = sw + window_w;
-            sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
-            ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
-            int pooling_size = (ew - sw) * (eh - sh);
-            if (pooling_size == 0) continue;
-            float res = 0.f;
-            for (int kh = sh; kh < eh; ++kh) {
-              for (int kw = sw; kw < ew; ++kw) {
-                int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
-                if (kh == sh && kw == sw) {
-                  res = src_ptr[src_idx];
-                } else {
-                  if (pooling_type == "max") {
-                    res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
-                  }
-                  if (pooling_type == "avg") {
-                    res += src_ptr[src_idx];
-                  }
-                }
-              }
-            }
-            if (pooling_type == "avg") {
-              if (exclusive) {
-                res /= pooling_size;
-              } else {
-                res /= window_h * window_w;
-              }
-            }
-            dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
-          }
-        }
-      }
-    }
-  }
-}
-
-void test_pool(int bs,
-               int ic,
-               int ih,
-               int iw,
-               std::string pooling_type,
-               bool ceil_mode,
-               bool global_pooling,
-               bool exclusive,
-               int ksize,
-               int stride,
-               int padding) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("pool2d");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("pooling_type", pooling_type);
-  opdesc.SetAttr("ksize", std::vector<int>({ksize, ksize}));
-  opdesc.SetAttr("global_pooling", global_pooling);
-  opdesc.SetAttr("exclusive", exclusive);
-  opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
-  opdesc.SetAttr("paddings",
-                 std::vector<int>({padding, padding, padding, padding}));
-  opdesc.SetAttr("ceil_mode", ceil_mode);
-
-  // create and convert op to XPU model, then run it on XPU
-  auto op = CreateOp<operators::PoolOpLite>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  pool_ref(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
-  }
-}
-
-TEST(XPUBridges, pool) {
-  for (auto pooling_type : {"max", "avg"}) {
-    for (auto bs : {1, 3}) {
-      for (auto ic : {2}) {
-        for (auto ih : {3}) {
-          for (auto iw : {4}) {
-            test_pool(bs, ic, ih, iw, pooling_type, true, true, true, 0, 1, 0);
-          }
-        }
-      }
-    }
-  }
-
-  for (auto pooling_type : {"max"}) {
-    for (auto ceil_mode : {true, false}) {
-      for (auto ksize : {2, 3}) {
-        for (auto stride : {1, 2}) {
-          for (auto padding : {0, 1}) {
-            for (auto bs : {1, 3}) {
-              for (auto ic : {2}) {
-                for (auto ih : {3}) {
-                  for (auto iw : {4}) {
-                    test_pool(bs,
-                              ic,
-                              ih,
-                              iw,
-                              pooling_type,
-                              ceil_mode,
-                              false,
-                              true,
-                              ksize,
-                              stride,
-                              padding);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  for (auto pooling_type : {"avg"}) {
-    for (auto ceil_mode : {true, false}) {
-      for (auto exclusive : {true, false}) {
-        for (auto ksize : {2, 3}) {
-          for (auto stride : {1, 2}) {
-            for (auto padding : {0, 1}) {
-              for (auto bs : {1, 3}) {
-                for (auto ic : {2}) {
-                  for (auto ih : {3}) {
-                    for (auto iw : {4}) {
-                      test_pool(bs,
-                                ic,
-                                ih,
-                                iw,
-                                pooling_type,
-                                ceil_mode,
-                                false,
-                                exclusive,
-                                ksize,
-                                stride,
-                                padding);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace bridges
-}  // namespace xpu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(pool2d);
-USE_XPU_BRIDGE(pool2d);
diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc
index 7048e15b82b0153c9112c20a498ec5725dee7929..5fb990928ec1ae723bc12b695af1be5e50da5079 100644
--- a/lite/operators/pool_op.cc
+++ b/lite/operators/pool_op.cc
@@ -41,39 +41,6 @@ bool PoolOpLite::CheckShape() const {
   return true;
 }
 
-inline void UpdatePadding(std::vector<int>* paddings,
-                          const bool global_pooling,
-                          const bool adaptive,
-                          const std::string padding_algorithm,
-                          const lite::DDim data_dims,
-                          const std::vector<int>& strides,
-                          const std::vector<int>& ksize) {
-  // when padding_algorithm is "VALID" or "SAME"
-  if (padding_algorithm == "SAME") {
-    for (int i = 0; i < strides.size(); ++i) {
-      int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
-      int pad_sum =
-          std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
-                   (int64_t)0);
-      int pad_0 = pad_sum / 2;
-      int pad_1 = pad_sum - pad_0;
-      *(paddings->begin() + i * 2) = pad_0;
-      *(paddings->begin() + i * 2 + 1) = pad_1;
-    }
-  } else if (padding_algorithm == "VALID") {
-    for (auto it = paddings->begin(); it != paddings->end(); it++) {
-      *it = 0;
-    }
-  }
-
-  // if global_pooling == true or adaptive == true, padding will be ignore
-  if (global_pooling || adaptive) {
-    for (auto it = paddings->begin(); it != paddings->end(); it++) {
-      *it = 0;
-    }
-  }
-}
-
 int PoolOutputSize(int input_size,
                    int filter_size,
                    int pad_left,