diff --git a/mace/kernels/neon/avg_pooling_neon_2x2.cc b/mace/kernels/neon/avg_pooling_neon_2x2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..586e3f4af990da52132c2f535512132657489dd9
--- /dev/null
+++ b/mace/kernels/neon/avg_pooling_neon_2x2.cc
@@ -0,0 +1,177 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <arm_neon.h>
+#include <float.h>
+#include <limits>
+
+#include "mace/core/common.h"
+
+namespace mace {
+namespace kernels {
+
+void PoolingAvgNeonK2x2S2x2(const float *input,
+                            const index_t *in_shape,
+                            float *output,
+                            const index_t *out_shape,
+                            const int *paddings) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int padding_top = paddings[0] / 2;
+  int padding_bottom = paddings[0] - padding_top;
+  int padding_left = paddings[1] / 2;
+  int padding_right = paddings[1] - padding_left;
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      // Derive the offsets from (b, c): the collapsed OpenMP loops must
+      // not share accumulating state across iterations.
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      float *outptr = output + output_offset;
+      const float *r0, *r1;
+
+      for (int h = 0; h < out_height; ++h) {
+        int w = 0;
+        int num_vectors = 0;
+        if (!((h == 0 && padding_top > 0) ||
+              (h == out_height - 1 && padding_bottom > 0))) {
+          r0 = input + input_offset + (h * 2 - padding_top) * in_width;
+          r1 = r0 + in_width;
+          if (padding_left > 0) {
+            *outptr = (r0[0] + r1[0]) * 0.25;
+            ++r0;
+            ++r1;
+            ++outptr;
+            ++w;
+          }
+          if (padding_right > 0) {
+            num_vectors = (out_width - w - 1) >> 2;
+          } else {
+            num_vectors = (out_width - w) >> 2;
+          }
+        }
+
+        w += num_vectors << 2;
+        float32x4_t factors = vld1q_f32(avg_factors);
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4_t r00 = vld1q_f32(r0);
+          float32x4_t r10 = vld1q_f32(r1);
+          float32x4_t r01 = vld1q_f32(r0 + 4);
+          float32x4_t r11 = vld1q_f32(r1 + 4);
+
+          float32x4_t sum0 = vaddq_f32(r00, r10);
+          float32x4_t sum1 = vaddq_f32(r01, r11);
+
+          // The pairwise add folds each 2x2 window sum into one lane.
+          float32x4_t sum_result = vpaddq_f32(sum0, sum1);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          r0 += 8;
+          r1 += 8;
+          outptr += 4;
+        }
+
+        for (; w < out_width; ++w) {
+          float sum = 0.0;
+          for (int kh = 0; kh < 2; ++kh) {
+            for (int kw = 0; kw < 2; ++kw) {
+              int inh = h * 2 - padding_top + kh;
+              int inw = w * 2 - padding_left + kw;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                sum += input[input_offset + inh * in_width + inw];
+              }
+            }
+          }
+
+          *outptr = sum * 0.25;
+          ++outptr;
+        }
+      }
+    }
+  }
+}
+
+// Assumes the input has already been padded.
+void PoolingAvgNeonK2x2S2x2Padded(const float *input,
+                                  const index_t *in_shape,
+                                  float *output,
+                                  const index_t *out_shape) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      const float *img0 = input + input_offset;
+      float *outptr = output + output_offset;
+
+      const float *r0 = img0;
+      const float *r1 = img0 + in_width;
+
+      for (int h = 0; h < out_height; ++h) {
+        int num_vectors = out_width >> 2;
+        int remain = out_width - (num_vectors << 2);
+
+        float32x4_t factors = vld1q_f32(avg_factors);
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4_t r00 = vld1q_f32(r0);
+          float32x4_t r10 = vld1q_f32(r1);
+          float32x4_t r01 = vld1q_f32(r0 + 4);
+          float32x4_t r11 = vld1q_f32(r1 + 4);
+
+          float32x4_t sum0 = vaddq_f32(r00, r10);
+          float32x4_t sum1 = vaddq_f32(r01, r11);
+
+          float32x4_t sum_result = vpaddq_f32(sum0, sum1);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          r0 += 8;
+          r1 += 8;
+          outptr += 4;
+        }
+
+        for (; remain > 0; --remain) {
+          *outptr = (r0[0] + r0[1] + r1[0] + r1[1]) * 0.25;
+
+          r0 += 2;
+          r1 += 2;
+          outptr++;
+        }
+        // Skip the second input row consumed by this output row.
+        r0 += in_width;
+        r1 += in_width;
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace mace
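For reference, the vpaddq_f32 inner loop above produces four outputs per iteration: the two vaddq_f32 calls form vertical column sums over the two input rows, and the pairwise add then folds adjacent columns, so each lane holds a complete 2x2 window sum before scaling by 0.25. A scalar sketch of the same per-row computation (AvgPool2x2S2RowRef is a hypothetical helper for illustration, not part of this patch):

// Scalar equivalent of the NEON loop of the 2x2/stride-2 kernel:
// each output is the mean of one 2x2 input window.
static inline void AvgPool2x2S2RowRef(const float *r0, const float *r1,
                                      float *out, int out_width) {
  for (int i = 0; i < out_width; ++i) {
    out[i] = (r0[2 * i] + r0[2 * i + 1] +
              r1[2 * i] + r1[2 * i + 1]) * 0.25f;
  }
}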
diff --git a/mace/kernels/neon/avg_pooling_neon_3x3.cc b/mace/kernels/neon/avg_pooling_neon_3x3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c977f59daebf4a1e2a152439e4d6d8508be7a60
--- /dev/null
+++ b/mace/kernels/neon/avg_pooling_neon_3x3.cc
@@ -0,0 +1,223 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <arm_neon.h>
+#include <float.h>
+#include <limits>
+
+#include "mace/core/common.h"
+
+namespace mace {
+namespace kernels {
+
+void PoolingAvgNeonK3x3S2x2(const float *input,
+                            const index_t *in_shape,
+                            float *output,
+                            const index_t *out_shape,
+                            const int *paddings) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int padding_top = paddings[0] / 2;
+  int padding_bottom = paddings[0] - padding_top;
+  int padding_left = paddings[1] / 2;
+  int padding_right = paddings[1] - padding_left;
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      // Derive the offsets from (b, c): the collapsed OpenMP loops must
+      // not share accumulating state across iterations.
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      float *outptr = output + output_offset;
+
+      for (int h = 0; h < out_height; ++h) {
+        int w = 0;
+        int num_vectors = 0;
+        const float *r0, *r1, *r2;
+        if (!((h == 0 && padding_top > 0) ||
+              (h == out_height - 1 && padding_bottom > 0))) {
+          r0 = input + input_offset + (h * 2 - padding_top) * in_width;
+          r1 = r0 + in_width;
+          r2 = r1 + in_width;
+
+          if (padding_left > 0) {
+            if (padding_left == 1) {
+              *outptr = (r0[0] + r0[1] + r1[0] + r1[1] + r2[0] + r2[1]) / 9.0;
+              ++r0;
+              ++r1;
+              ++r2;
+            } else {  // padding_left == 2
+              *outptr = (r0[0] + r1[0] + r2[0]) / 9.0;
+            }
+            ++outptr;
+            ++w;
+          }
+          if (padding_right > 0) {
+            num_vectors = (out_width - w - 1) >> 2;
+          } else {
+            num_vectors = (out_width - w) >> 2;
+          }
+        }
+
+        w += num_vectors << 2;
+        float32x4_t factors = vld1q_f32(avg_factors);
+        // Preload only when there is vector work: the row pointers are
+        // left unset for rows that fall entirely within the padding.
+        float32x4x2_t row0, row1, row2;
+        if (num_vectors > 0) {
+          row0 = vld2q_f32(r0);
+          row1 = vld2q_f32(r1);
+          row2 = vld2q_f32(r2);
+        }
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4x2_t row0_next = vld2q_f32(r0 + 8);
+          float32x4x2_t row1_next = vld2q_f32(r1 + 8);
+          float32x4x2_t row2_next = vld2q_f32(r2 + 8);
+
+          // vld2q_f32 splits each row into even/odd columns; vextq_f32
+          // shifts the even lanes by one to supply each window's third
+          // column.
+          float32x4_t sum0 = vaddq_f32(row0.val[0], row0.val[1]);
+          float32x4_t sum1 = vaddq_f32(row1.val[0], row1.val[1]);
+          float32x4_t sum2 = vaddq_f32(row2.val[0], row2.val[1]);
+
+          float32x4_t row02 = vextq_f32(row0.val[0], row0_next.val[0], 1);
+          float32x4_t row12 = vextq_f32(row1.val[0], row1_next.val[0], 1);
+          float32x4_t row22 = vextq_f32(row2.val[0], row2_next.val[0], 1);
+
+          sum0 = vaddq_f32(sum0, row02);
+          sum1 = vaddq_f32(sum1, row12);
+          sum2 = vaddq_f32(sum2, row22);
+
+          float32x4_t sum_result = vaddq_f32(vaddq_f32(sum0, sum1), sum2);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          row0 = row0_next;
+          row1 = row1_next;
+          row2 = row2_next;
+
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outptr += 4;
+        }
+
+        for (; w < out_width; ++w) {
+          float sum = 0.0;
+          for (int kh = 0; kh < 3; ++kh) {
+            for (int kw = 0; kw < 3; ++kw) {
+              int inh = h * 2 - padding_top + kh;
+              int inw = w * 2 - padding_left + kw;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                sum += input[input_offset + inh * in_width + inw];
+              }
+            }
+          }
+
+          *outptr = sum / 9.0;
+          ++outptr;
+        }
+      }
+    }
+  }
+}
+
+// Assumes the input has already been padded.
+void PoolingAvgNeonK3x3S2x2Padded(const float *input,
+                                  const index_t *in_shape,
+                                  float *output,
+                                  const index_t *out_shape) {
+  index_t batch = in_shape[0];
+  index_t channels = in_shape[1];
+  index_t in_height = in_shape[2];
+  index_t in_width = in_shape[3];
+
+  index_t out_height = out_shape[2];
+  index_t out_width = out_shape[3];
+
+  int in_image_size = in_height * in_width;
+  int out_image_size = out_height * out_width;
+  float avg_factors[4] = {1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0};
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      index_t input_offset = (b * channels + c) * in_image_size;
+      index_t output_offset = (b * channels + c) * out_image_size;
+      const float *img0 = input + input_offset;
+      float *outptr = output + output_offset;
+
+      const float *r0 = img0;
+      const float *r1 = r0 + in_width;
+      const float *r2 = r1 + in_width;
+
+      for (int h = 0; h < out_height; h++) {
+        int num_vectors = out_width >> 2;
+        int remain = out_width - (num_vectors << 2);
+
+        float32x4_t factors = vld1q_f32(avg_factors);
+        // Guard the preload so rows narrower than eight floats are not
+        // over-read by the vector loads.
+        float32x4x2_t row0, row1, row2;
+        if (num_vectors > 0) {
+          row0 = vld2q_f32(r0);
+          row1 = vld2q_f32(r1);
+          row2 = vld2q_f32(r2);
+        }
+        for (; num_vectors > 0; --num_vectors) {
+          float32x4x2_t row0_next = vld2q_f32(r0 + 8);
+          float32x4x2_t row1_next = vld2q_f32(r1 + 8);
+          float32x4x2_t row2_next = vld2q_f32(r2 + 8);
+
+          float32x4_t sum0 = vaddq_f32(row0.val[0], row0.val[1]);
+          float32x4_t sum1 = vaddq_f32(row1.val[0], row1.val[1]);
+          float32x4_t sum2 = vaddq_f32(row2.val[0], row2.val[1]);
+
+          float32x4_t row02 = vextq_f32(row0.val[0], row0_next.val[0], 1);
+          float32x4_t row12 = vextq_f32(row1.val[0], row1_next.val[0], 1);
+          float32x4_t row22 = vextq_f32(row2.val[0], row2_next.val[0], 1);
+
+          sum0 = vaddq_f32(sum0, row02);
+          sum1 = vaddq_f32(sum1, row12);
+          sum2 = vaddq_f32(sum2, row22);
+
+          float32x4_t sum_result = vaddq_f32(vaddq_f32(sum0, sum1), sum2);
+          float32x4_t avg_result = vmulq_f32(sum_result, factors);
+
+          vst1q_f32(outptr, avg_result);
+
+          row0 = row0_next;
+          row1 = row1_next;
+          row2 = row2_next;
+
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outptr += 4;
+        }
+
+        for (; remain > 0; remain--) {
+          *outptr = (r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2] +
+                     r2[0] + r2[1] + r2[2]) / 9.0;
+
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          outptr++;
+        }
+
+        // +1 steps over the trailing odd column; +in_width skips the
+        // second input row consumed by this output row.
+        r0 += 1 + in_width;
+        r1 += 1 + in_width;
+        r2 += 1 + in_width;
+      }
+    }
+  }
+}
+
+} // namespace kernels
+} // namespace mace
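The 3x3 kernels walk each row with vld2q_f32, which deinterleaves eight consecutive floats into even and odd lanes, and use vextq_f32 to shift the even lanes by one element, yielding the three column streams of four stride-2 windows per iteration. A minimal standalone sketch of that trick (Sum3x1S2Row is a hypothetical name; it assumes at least 16 readable floats at r):

#include <arm_neon.h>

// Sums the three columns of four stride-2 windows in one row:
// result[i] = r[2i] + r[2i + 1] + r[2i + 2] for i = 0..3.
// Note the loads touch r[0..15] even though only r[0..8] contribute.
static inline float32x4_t Sum3x1S2Row(const float *r) {
  float32x4x2_t v = vld2q_f32(r);           // val[0] = {r0, r2, r4, r6}
  float32x4x2_t v_next = vld2q_f32(r + 8);  // val[0] = {r8, r10, r12, r14}
  float32x4_t col2 = vextq_f32(v.val[0], v_next.val[0], 1);  // {r2, r4, r6, r8}
  return vaddq_f32(vaddq_f32(v.val[0], v.val[1]), col2);
}

Summing this quantity over three consecutive rows and multiplying by 1/9 gives the four window averages, which is exactly what the vector loops above compute.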
diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc
index 0e57f02e045dcd1aed45cb7893e8a4c33dba142f..06efdeaa5258455f5c9779487ef5fe562d2aede9 100644
--- a/mace/kernels/neon/pooling_neon.cc
+++ b/mace/kernels/neon/pooling_neon.cc
@@ -15,51 +15,95 @@ extern void PoolingMaxNeonK2x2S2x2(const float *input,
                                    const index_t *out_shape,
                                    const int *paddings);
 
+extern void PoolingAvgNeonK2x2S2x2(const float *input,
+                                   const index_t *in_shape,
+                                   float *output,
+                                   const index_t *out_shape,
+                                   const int *paddings);
+
 extern void PoolingMaxNeonK3x3S2x2(const float *input,
                                    const index_t *in_shape,
                                    float *output,
                                    const index_t *out_shape,
                                    const int *paddings);
 
+extern void PoolingAvgNeonK3x3S2x2(const float *input,
+                                   const index_t *in_shape,
+                                   float *output,
+                                   const index_t *out_shape,
+                                   const int *paddings);
+
 #ifdef __COPY_MAKE_PADDING
 extern void PoolingMaxNeonK2x2S2x2Padded(const float *input,
                                          const index_t *in_shape,
                                          float *output,
                                          const index_t *out_shape);
+
+extern void PoolingAvgNeonK2x2S2x2Padded(const float *input,
+                                         const index_t *in_shape,
+                                         float *output,
+                                         const index_t *out_shape);
+
 extern void PoolingMaxNeonK3x3S2x2Padded(const float *input,
                                          const index_t *in_shape,
                                          float *output,
                                          const index_t *out_shape);
+
+extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
+                                         const index_t *in_shape,
+                                         float *output,
+                                         const index_t *out_shape);
 #endif
 
-template <>
+template<>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
     const float *input,
     const index_t *input_shape,
     float *output,
     const index_t *output_shape) {
+#ifdef __COPY_MAKE_PADDING
+  Tensor padded_input;
+  ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
+  input = padded_input.data<float>();
+  input_shape = padded_input.shape().data();
+#endif
+
   if (kernels_[0] == 2 && kernels_[1] == 2 && strides_[0] == 2 &&
-      strides_[1] == 2 && pooling_type_ == MAX) {
+      strides_[1] == 2) {
+    // kernel_size: 2x2, strides: 2x2
+    if (pooling_type_ == MAX) {  // MAX_POOL_2x2s2x2
+#ifdef __COPY_MAKE_PADDING
+      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+#else
+      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
+                             paddings_);
+#endif
+    } else {  // AVG_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-    Tensor padded_input;
-    ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
-    input = padded_input.data<float>();
-    input_shape = padded_input.shape().data();
-    PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
 #else
-    PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, paddings_);
+      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
+                             paddings_);
 #endif
+    }
   } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
-             strides_[1] == 2 && pooling_type_ == MAX) {
+             strides_[1] == 2) {
+    // kernel_size: 3x3, strides: 2x2
+    if (pooling_type_ == MAX) {  // MAX_POOL_3x3s2x2
+#ifdef __COPY_MAKE_PADDING
+      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+#else
+      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
+                             paddings_);
+#endif
+    } else {  // AVG_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-    Tensor padded_input;
-    ConstructInputWithPadding(input, input_shape, paddings_, &padded_input);
-    input = padded_input.data<float>();
-    input_shape = padded_input.shape().data();
-    PoolingMaxNeonK3x3S2x2V2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
 #else
-    PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, paddings_);
+      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
+                             paddings_);
 #endif
+    }
   } else {  // not implemented yet
     PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
                                            paddings_, dilations_)(
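A detail shared by the non-padded kernels above: the total paddings are split so that any extra row or column goes to the bottom/right. A quick standalone check of that arithmetic (the totals here are illustrative values, not taken from the patch):

#include <cstdio>

int main() {
  int paddings[2] = {1, 2};  // illustrative {height, width} padding totals
  int padding_top = paddings[0] / 2;               // 0
  int padding_bottom = paddings[0] - padding_top;  // 1
  int padding_left = paddings[1] / 2;              // 1
  int padding_right = paddings[1] - padding_left;  // 1
  std::printf("top=%d bottom=%d left=%d right=%d\n",
              padding_top, padding_bottom, padding_left, padding_right);
  return 0;
}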
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index 3972743c76df9971ba88473551b80b4f96df6d9a..f5e9b599f20722766ace7ad98fb4b8011a239867 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -148,14 +148,14 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>(
-      "Input", {1, 1, 4, 5},
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>("Input", {1, 1, 2, 9},
+                               {0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                9, 10, 11, 12, 13, 14, 15, 16, 17});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -172,18 +172,50 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.AddIntArg("pooling_type", PoolingType::MAX);
   net.AddIntsArg("kernels", {3, 3});
   net.AddIntsArg("strides", {2, 2});
+  net.AddIntArg("padding", Padding::VALID);
+  net.AddIntsArg("dilations", {1, 1});
+
+  // Add input data
+  net.AddInputFromArray<float>("Input", {1, 1, 3, 9},
+                               {0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                9, 10, 11, 12, 13, 14, 15, 16, 17,
+                                18, 19, 20, 21, 22, 23, 24, 25, 26});
+  // Run
+  net.RunOp(DeviceType::NEON);
+
+  // Check
+  auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+}
+
+TEST_F(PoolingOpTest, AVG_k2x2s2x2) {
+  // Construct graph
+  auto& net = test_net();
+  OpDefBuilder("Pooling", "PoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add args
+  net.AddIntArg("pooling_type", PoolingType::AVG);
+  net.AddIntsArg("kernels", {2, 2});
+  net.AddIntsArg("strides", {2, 2});
   net.AddIntArg("padding", Padding::SAME);
   net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
-  net.AddInputFromArray<float>(
-      "Input", {1, 1, 4, 5},
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19});
+  net.AddInputFromArray<float>(
+      "Input", {1, 1, 2, 8},
+      {0, 1, 2, 3, 4, 5, 6, 7,
+       8, 9, 10, 11, 12, 13, 14, 15});
   // Run
   net.RunOp(DeviceType::NEON);
 
   // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 1, 4},
+                                      {4.5, 6.5, 8.5, 10.5});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
+
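The AVG_k2x2s2x2 expectation can be checked by hand: a 2x8 input under a 2x2/stride-2 window grid needs no SAME padding, so each output is the plain mean of one 2x2 block. A standalone sketch reproducing {4.5, 6.5, 8.5, 10.5}:

#include <cstdio>

int main() {
  float in[2][8];
  for (int r = 0; r < 2; ++r)
    for (int c = 0; c < 8; ++c)
      in[r][c] = static_cast<float>(r * 8 + c);  // 0..15, as in the test
  for (int w = 0; w < 4; ++w) {
    float avg = (in[0][2 * w] + in[0][2 * w + 1] +
                 in[1][2 * w] + in[1][2 * w + 1]) * 0.25f;
    std::printf("%g ", avg);  // prints 4.5 6.5 8.5 10.5
  }
  std::printf("\n");
  return 0;
}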