diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
index 35fb09f94c49cd91915128260b6426fe0fedf725..37479c22efe95b6506054cf3ded5855aa766c34c 100644
--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -76,15 +76,17 @@ void PoolCompute(const PoolParam<CPU> &param) {
       }
     }
 
-  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
+             strides[0] == strides[1] && paddings[0] == paddings[1] &&
+             paddings[1] == 0) {
 #if __ARM_NEON
 #if __aarch64__
     PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
 #else
     if (pooling_type == "max") {
-      math::Pool2x2Max(strides, paddings, in_x, out);
+      math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
     } else if (pooling_type == "avg") {
-      math::Pool2x2Avg(strides, paddings, in_x, out);
+      math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
     }
 #endif
 #else
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
index 0a2d96d4d065d7938e6872b4f073e080d7be8c3a..76af743818edacac6dd9e1878e8d8220ccff6d73 100644
--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
@@ -20,21 +20,15 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
+#define FLT_MAX __FLT_MAX__
 
-void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output) {
-#if __ARM_NEON
-
-#if __aarch64__
-#else
+void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output) {
   const int batch_size = input->dims()[0];
-
   const int input_height = input->dims()[2];
-
   const int input_width = input->dims()[3];
 
   const int output_channels = output->dims()[1];
-
   int output_height = output->dims()[2];
   const int output_width = output->dims()[3];
   const int ksize_height = 2;
@@ -47,72 +41,110 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int input_channel_stride = input_height * input_width;
   const int output_channel_stride = output_height * output_width;
 
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+
   const float *input_data = input->data<float>();
   float *output_data = output->mutable_data<float>();
 
-  int out_w_num = output_width >> 2;
-  const int in_h_num = output_height >> 1;
-  const int input_batch_stride = output_channels * input_channel_stride;
-  const int output_batch_stride = output_channels * output_channel_stride;
-  int remain = output_width - out_w_num << 2;
+  int w1 = input_width / 16;
+  int _w1 = input_width % 16;
+  int w2 = _w1 / 4;
+  int _w2 = _w1 % 4;
+
   for (int i = 0; i < batch_size; ++i) {
     for (int c = 0; c < output_channels; ++c) {
-      const float *input_data_chanel_row_next = input_data + input_width;
-      for (; output_height > 0; output_height--) {
-        if (out_w_num > 0) {
-          asm volatile(
-              "max_loop:                            \n\t"
-              "vld1.f32  {q0,q1},  [%[in_ptr1]]!         \n\t"
-              "vld1.f32  {q2,q3},  [%[in_ptr2]]!         \n\t"
-              "vmax.f32  q0,  q0,  q2                 \n\t"
-              "vmax.f32  q1,  q1,  q3                 \n\t"
-              "vpmax.f32  d4,  d0, d1                  \n\t"
-              "vpmax.f32  d5,  d2, d3                  \n\t"
-              "subs %[out_w_num],  #1                  \n\t"
-              "vst1.32  {q2},  [%[out_ptr]]!                 \n\t"
-              "bne  max_loop                            \n\t"
-              : [in_ptr1] "+r"(input_data),
-                [in_ptr2] "+r"(input_data_chanel_row_next),
-                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
-              :
-              : "memory", "q0", "q1", "q2", "q3");
+      for (int ph = 0; ph < input_height; ph += 2) {
+        const float *in_ptr1 = input_data + i * input_batch_stride +
+                               c * input_channel_stride + ph * input_width;
+        const float *in_ptr2 = in_ptr1 + input_width;
+        if (ph + 1 >= input_height) {
+          // Odd height: alias row 1 (max(x, x) == x), same as a -inf pad row.
+          in_ptr2 = in_ptr1;
         }
+        float *out_ptr = output_data + i * output_batch_stride +
+                         c * output_channel_stride + ph / 2 * output_width;
+        // asm updates counters and pointers in place ("+r"); copy w1/w2 per
+        // row so every row starts from the full block counts.
+        int w1_cnt = w1;
+        int w2_cnt = w2;
+        asm volatile(
+            "subs       %[w1], %[w1], #1        \n\t"
+            "blt        end_w1_%=               \n\t"
+            "loop_w1_%=:                        \n\t"
+
+            "pld        [%[in_ptr1], #64]       \n\t"
+            "pld        [%[in_ptr2], #64]       \n\t"
+
+            "vld1.f32   {q0, q1},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q2, q3},   [%[in_ptr2]]!   \n\t"
+            "vld1.f32   {q6, q7},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q8, q9},   [%[in_ptr2]]!   \n\t"
 
-        for (; remain > 0; remain--) {
-          float max_row1 = std::max(input_data[0], input_data[1]);
-          float max_row2 = std::max(input_data_chanel_row_next[0],
-                                    input_data_chanel_row_next[1]);
-          *output_data = std::max(max_row1, max_row2);
-          input_data += 2;
-          input_data_chanel_row_next += 2;
-          output_data++;
+            "vmax.f32   q0,     q0,   q2        \n\t"
+            "vmax.f32   q1,     q1,   q3        \n\t"
+
+            "vmax.f32   q6,     q6,   q8        \n\t"
+            "vmax.f32   q7,     q7,   q9        \n\t"
+
+            "vpmax.f32  d8,     d0,   d1        \n\t"
+            "vpmax.f32  d9,     d2,   d3        \n\t"
+
+            "vpmax.f32  d10,    d12,  d13       \n\t"
+            "vpmax.f32  d11,    d14,  d15       \n\t"
+
+            "vst1.32  {q4, q5},  [%[out_ptr]]!  \n\t"
+
+            "subs       %[w1], %[w1], #1        \n\t"
+            "bge        loop_w1_%=              \n\t"
+            "end_w1_%=:                         \n\t"
+
+            "subs       %[w2], %[w2], #1        \n\t"
+            "blt        end_w2_%=               \n\t"
+            "loop_w2_%=:                        \n\t"
+
+            "vld1.f32   {q0},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q1},   [%[in_ptr2]]!   \n\t"
+            "vmax.f32   q0,     q0,   q1        \n\t"
+            "vpmax.f32  d4,     d0,   d1        \n\t"
+            "vst1.32    {d4},   [%[out_ptr]]!   \n\t"
+
+            "subs       %[w2], %[w2], #1        \n\t"
+            "bge        loop_w2_%=              \n\t"
+            "end_w2_%=:                         \n\t"
+            : [w1] "+r"(w1_cnt), [w2] "+r"(w2_cnt), [in_ptr1] "+r"(in_ptr1),
+              [in_ptr2] "+r"(in_ptr2), [out_ptr] "+r"(out_ptr)
+            :
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+              "q8", "q9");
+
+        // Scalar tail: the asm left the pointers just past the last block.
+        if (_w2 != 0) {
+          if (_w2 == 1) {
+            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
+          } else if (_w2 == 2) {
+            float temp = (in_ptr1[0] > in_ptr2[0]) ? in_ptr1[0] : in_ptr2[0];
+            float temp1 = (in_ptr1[1] > in_ptr2[1]) ? in_ptr1[1] : in_ptr2[1];
+            *out_ptr = (temp > temp1) ? temp : temp1;
+          } else if (_w2 == 3) {
+            float temp = (in_ptr1[0] > in_ptr2[0]) ? in_ptr1[0] : in_ptr2[0];
+            float temp1 = (in_ptr1[1] > in_ptr2[1]) ? in_ptr1[1] : in_ptr2[1];
+            *out_ptr++ = (temp > temp1) ? temp : temp1;
+            *out_ptr = (in_ptr1[2] > in_ptr2[2]) ? in_ptr1[2] : in_ptr2[2];
+          }
         }
       }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
     }
-    input_data += input_batch_stride;
-    output_data += output_batch_stride;
   }
-#endif
-#else
-#endif
 }
 
-void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output) {
-#if __ARM_NEON
-
-#if __aarch64__
-#else
+void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output) {
   const int batch_size = input->dims()[0];
-
   const int input_height = input->dims()[2];
-
   const int input_width = input->dims()[3];
 
   const int output_channels = output->dims()[1];
-
   int output_height = output->dims()[2];
   const int output_width = output->dims()[3];
   const int ksize_height = 2;
@@ -125,59 +157,114 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int input_channel_stride = input_height * input_width;
   const int output_channel_stride = output_height * output_width;
 
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+
   const float *input_data = input->data<float>();
   float *output_data = output->mutable_data<float>();
 
-  int out_w_num = output_width >> 2;
-  const int input_batch_stride = output_channels * input_channel_stride;
-  const int output_batch_stride = output_channels * output_channel_stride;
-  float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f};
-  int remain = output_width - out_w_num << 2;
+  int w1 = input_width / 16;
+  int _w1 = input_width % 16;
+  int w2 = _w1 / 4;
+  int _w2 = _w1 % 4;
+
+  float quarter = 0.25f;
+  // All-zero stand-in for the missing second row when input_height is odd.
+  const vector<float> zero_row(input_width, 0.f);
   for (int i = 0; i < batch_size; ++i) {
     for (int c = 0; c < output_channels; ++c) {
-      const float *input_data_chanel_row_next = input_data + input_width;
-      for (; output_height > 0; output_height--) {
-        if (out_w_num > 0) {
-          asm volatile(
-              "avg_loop:                            \n\t"
-              "vld1.32  {q0,q1},  [%[in_ptr1]]!         \n\t"
-              "vld1.32  {q2,q3},  [%[in_ptr2]]!         \n\t"
-              "vadd.f32  q0,  q0,  q2                 \n\t"
-              "vadd.f32  q1,  q1,  q3                 \n\t"
-              "vpadd.f32  d4,  d0, d1                  \n\t"
-              "vpadd.f32  d5,  d2, d3                  \n\t"
-              "vld1.32  {q4}, [%[vqua]]!                  \n\t"
-              "vmul.f32  q2,  q2,  q4                          \n\t"
-              "subs %[out_w_num],  #1                  \n\t"
-              "vst1.32  {q2},  [%[out_ptr]]!                 \n\t"
-              "bne  avg_loop                            \n\t"
-              : [in_ptr1] "+r"(input_data),
-                [in_ptr2] "+r"(input_data_chanel_row_next),
-                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
-              : [vqua] "r"(vqua)
-              : "memory", "q0", "q1", "q2", "q3", "q4");
+      for (int ph = 0; ph < input_height; ph += 2) {
+        const float *in_ptr1 = input_data + i * input_batch_stride +
+                               c * input_channel_stride + ph * input_width;
+        const float *in_ptr2 = in_ptr1 + input_width;
+        if (ph + 1 >= input_height) {
+          in_ptr2 = zero_row.data();
         }
+        float *out_ptr = output_data + i * output_batch_stride +
+                         c * output_channel_stride + ph / 2 * output_width;
+        // asm updates counters and pointers in place ("+r"); copy w1/w2 per
+        // row so every row starts from the full block counts.
+        int w1_cnt = w1;
+        int w2_cnt = w2;
+        asm volatile(
+            // Load the 0.25 scale once, before either loop can be skipped.
+            "vmov.f32   d0[0],      %[quarter]      \n\t"
+            "subs       %[w1], %[w1], #1        \n\t"
+            "blt        end_w1_%=               \n\t"
+            "loop_w1_%=:                        \n\t"
+            "pld        [%[in_ptr1], #64]       \n\t"
+            "pld        [%[in_ptr2], #64]       \n\t"
+
+            "vld1.f32   {q1, q2},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q3, q4},   [%[in_ptr2]]!   \n\t"
+            "vld1.f32   {q7, q8},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q9, q10},  [%[in_ptr2]]!   \n\t"
+
+            "vadd.f32   q1,     q1,   q3        \n\t"
+            "vadd.f32   q2,     q2,   q4        \n\t"
 
-        for (; remain > 0; remain--) {
-          float max_row1 = std::max(input_data[0], input_data[1]);
-          float max_row2 = std::max(input_data_chanel_row_next[0],
-                                    input_data_chanel_row_next[1]);
-          *output_data = std::max(max_row1, max_row2);
-          input_data += 2;
-          input_data_chanel_row_next += 2;
-          output_data++;
+            "vadd.f32   q7,     q7,   q9        \n\t"
+            "vadd.f32   q8,     q8,   q10       \n\t"
+
+            "vpadd.f32  d10,    d2,   d3        \n\t"
+            "vpadd.f32  d11,    d4,   d5        \n\t"
+
+            "vpadd.f32  d12,    d14,  d15       \n\t"
+            "vpadd.f32  d13,    d16,  d17       \n\t"
+
+            "vmul.f32   q5,     q5,   d0[0]     \n\t"
+            "vmul.f32   q6,     q6,   d0[0]     \n\t"
+
+            "vst1.32  {q5, q6},  [%[out_ptr]]!  \n\t"
+
+            "subs       %[w1], %[w1], #1        \n\t"
+            "bge        loop_w1_%=              \n\t"
+            "end_w1_%=:                         \n\t"
+
+            "subs       %[w2], %[w2], #1        \n\t"
+            "blt        end_w2_%=               \n\t"
+            "loop_w2_%=:                        \n\t"
+
+            "vld1.f32   {q1},   [%[in_ptr1]]!   \n\t"
+            "vld1.f32   {q2},   [%[in_ptr2]]!   \n\t"
+            "vadd.f32   q1,     q1,   q2        \n\t"
+            "vpadd.f32  d4,     d2,   d3        \n\t"
+            "vmul.f32   d4,     d4,   d0[0]     \n\t"
+            "vst1.32    {d4},   [%[out_ptr]]!   \n\t"
+
+            "subs       %[w2], %[w2], #1        \n\t"
+            "bge        loop_w2_%=              \n\t"
+            "end_w2_%=:                         \n\t"
+            : [w1] "+r"(w1_cnt), [w2] "+r"(w2_cnt), [in_ptr1] "+r"(in_ptr1),
+              [in_ptr2] "+r"(in_ptr2), [out_ptr] "+r"(out_ptr)
+            : [quarter] "r"(quarter)
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+              "q8", "q9", "q10");
+
+        // Scalar tail: the asm left the pointers just past the last block.
+        if (_w2 != 0) {
+          if (_w2 == 1) {
+            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
+          } else if (_w2 == 2) {
+            float temp = 0;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            temp += *in_ptr1;
+            temp += *in_ptr2;
+            *out_ptr = quarter * temp;
+          } else if (_w2 == 3) {
+            float temp = 0;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            *out_ptr++ = quarter * temp;
+            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
+          }
         }
       }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
     }
-    input_data += input_batch_stride;
-    output_data += output_batch_stride;
   }
-
-#endif
-#else
-#endif
 }
 
 //}
diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h
index ae32a3912b677efb50d8558700741a225e3eb3f8..bd5e48482607cc868408b6371f47e0cb55caf499 100644
--- a/src/operators/math/pool_2x2.h
+++ b/src/operators/math/pool_2x2.h
@@ -26,11 +26,11 @@ namespace math {
 using framework::Tensor;
 using std::vector;
 
-void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output);
+void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output);
 
-void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
-                Tensor *out);
+void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *in_x, Tensor *out);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp
index 28547b71fca6caea2ff4341b3f832c0035436a72..05d3017f635a040a52d2cc377c8f384dbbd8086c 100644
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -558,15 +558,13 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
       const float *input_seg = input_data + c * input_channel_stride;
       float *output_seg = output_data + c * output_channel_stride;
       for (int ph = 0; ph < output_height; ph++) {
+        int hstart = ph * stride - padding;
+        int hend = min(hstart + 3, input_height);
+        hstart = max(hstart, 0);
         for (int pw = 0; pw < output_width; pw++) {
-          int hstart = ph * stride - padding;
           int wstart = pw * stride - padding;
-          int hend = min(hstart + 3, input_height + padding);
-          int wend = min(wstart + 3, input_width + padding);
-          hstart = max(hstart, 0);
+          int wend = min(wstart + 3, input_width);
           wstart = max(wstart, 0);
-          hend = min(hend, input_height);
-          wend = min(wend, input_width);
           const float *pos1 = input_seg + hstart * input_width + wstart;
           const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
           const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;