diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index c2b383013f220cbf8a309a94248ad9c4d24d4b04..d87999f1a1b7defc0caa6280580a7c5704f19587 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
 #define MACE_KERNELS_DEPTHWISE_CONV2D_H_
 
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
 #include "mace/core/common.h"
 #include "mace/core/future.h"
 #include "mace/core/public/mace.h"
@@ -64,7 +68,9 @@ void DepthwiseConv2dKernel(const T *input_ptr,
         inw >= input_width) {
       MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
                      inw >= padded_w_start && inw < padded_w_stop,
-                 "Out of range read from input: ", inh, ", ", inw);
+                 "Out of range read from input: ", padded_h_start,
+                 " <= ", inh, " < ", padded_h_stop, ", ",
+                 padded_w_start, " <= ", inw, " < ", padded_w_stop);
     } else {
       index_t input_offset =
           n * input_height * input_width * input_channels +
@@ -108,33 +114,120 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
                                      int h_stop,
                                      int w_start,
                                      int w_stop) {
+  if (multiplier == 1) {
+    constexpr int c_tile_size = 4;
+
+#pragma omp parallel for collapse(3)
+    for (int n = 0; n < batch; ++n) {
+      for (int h = h_start; h < h_stop; ++h) {
+        for (int w = w_start; w < w_stop; ++w) {
+          int c;
+          for (c = 0; c + c_tile_size <= channels; c += c_tile_size) {
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+            static_assert(c_tile_size == 4, "channels tile size must be 4");
+            float32x4_t sum = vdupq_n_f32(0);
+            if (bias_ptr != nullptr) {
+              sum = vld1q_f32(bias_ptr + c);
+            }
+#else
+            T sum[c_tile_size] = {0};
+            if (bias_ptr != nullptr) {
+              for (int ci = 0; ci < c_tile_size; ++ci) {
+                sum[ci] = bias_ptr[c + ci];
+              }
+            }
+#endif
+            const T *filter_base = filter_ptr + c;
+            for (int kh = 0; kh < kernel_h; ++kh) {
+              for (int kw = 0; kw < kernel_w; ++kw) {
+                int inh = padded_h_start + h * stride_h + dilation_h * kh;
+                int inw = padded_w_start
+ w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + c; +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + float32x4_t in = vld1q_f32(input_ptr + input_offset); + float32x4_t weights = vld1q_f32(filter_base); + sum = vfmaq_f32(sum, in, weights); +#else + for (int ci = 0; ci < c_tile_size; ++ci) { + sum[ci] += + input_ptr[input_offset + ci] * filter_base[ci]; // HWIM + } +#endif + filter_base += input_channels; + } + } + + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + vst1q_f32(output_ptr + offset, sum); +#else + for (int ci = 0; ci < c_tile_size; ++ci) { + output_ptr[offset + ci] = sum[ci]; + } +#endif + } + for (; c < channels; ++c) { + T bias_channel = bias_ptr ? bias_ptr[c] : 0; + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; + output_ptr[offset] = bias_channel; + T sum = 0; + const T *filter_base = filter_ptr + c; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int inh = padded_h_start + h * stride_h + dilation_h * kh; + int inw = padded_w_start + w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + c; + sum += input_ptr[input_offset] * filter_base[0]; // HWIM + filter_base += input_channels * multiplier; + } + } + output_ptr[offset] += sum; + } + } + } + } + } else { #pragma omp parallel for collapse(4) - for (int n = 0; n < batch; ++n) { - for (int h = h_start; h < h_stop; ++h) { - for (int w = w_start; w < w_stop; ++w) { - for (int c = 0; c < channels; ++c) { - const 
index_t inc = c / multiplier; - const index_t m = c % multiplier; - T bias_channel = bias_ptr ? bias_ptr[c] : 0; - index_t offset = n * height * width * channels + - h * width * channels + w * channels + c; - output_ptr[offset] = bias_channel; - T sum = 0; - const T *filter_base = filter_ptr + inc * multiplier + m; - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int inh = padded_h_start + h * stride_h + dilation_h * kh; - int inw = padded_w_start + w * stride_w + dilation_w * kw; - index_t input_offset = - n * input_height * input_width * input_channels + - inh * input_width * input_channels + inw * input_channels + - inc; - // TODO vectorize this - sum += input_ptr[input_offset] * filter_base[0]; // HWIM - filter_base += input_channels * multiplier; + for (int n = 0; n < batch; ++n) { + for (int h = h_start; h < h_stop; ++h) { + for (int w = w_start; w < w_stop; ++w) { + for (int c = 0; c < channels; ++c) { + const index_t inc = c / multiplier; + const index_t m = c % multiplier; + T bias_channel = bias_ptr ? 
bias_ptr[c] : 0; + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; + output_ptr[offset] = bias_channel; + T sum = 0; + const T *filter_base = filter_ptr + inc * multiplier + m; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int inh = padded_h_start + h * stride_h + dilation_h * kh; + int inw = padded_w_start + w * stride_w + dilation_w * kw; + MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 && + inw < input_width); + index_t input_offset = + n * input_height * input_width * input_channels + + inh * input_width * input_channels + inw * input_channels + + inc; + sum += input_ptr[input_offset] * filter_base[0]; // HWIM + filter_base += input_channels * multiplier; + } } + output_ptr[offset] += sum; } - output_ptr[offset] += sum; } } } @@ -230,10 +323,15 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); // The left-upper most offset of the padded input - int padded_h_start = 0 - paddings[0] / 2; - int padded_w_start = 0 - paddings[1] / 2; - index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2; - index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2; + int paddings_top = paddings[0] / 2; + int paddings_bottom = paddings[0] - paddings_top; + int paddings_left = paddings[1] / 2; + int paddings_right = paddings[1] - paddings_left; + + int padded_h_start = 0 - paddings_top; + int padded_w_start = 0 - paddings_left; + index_t padded_h_stop = input_height + paddings_bottom; + index_t padded_w_stop = input_width + paddings_right; Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); @@ -244,38 +342,59 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { const T *bias_ptr = bias == nullptr ? nullptr : bias->data(); T *output_ptr = output->mutable_data(); + int valid_h_start = + paddings_top == 0 ? 
0 : (paddings_top - 1) / stride_h + 1; + int valid_h_stop = paddings_bottom == 0 + ? height + : height - ((paddings_bottom - 1) / stride_h + 1); + int valid_w_start = + paddings_left == 0 ? 0 : (paddings_left - 1) / stride_w + 1; + int valid_w_stop = paddings_right == 0 + ? width + : width - ((paddings_right - 1) / stride_w + 1); + // Calculate border elements with out-of-boundary checking - DepthwiseConv2dKernel( - input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, - channels, input_height, input_width, input_channels, multiplier, - padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h, - kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0, 1, 0, width); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, height - 1, height, 0, width); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, 1, height - 1, 0, 1); - DepthwiseConv2dKernel(input_ptr, filter_ptr, bias_ptr, output_ptr, batch, - height, width, channels, input_height, input_width, - input_channels, multiplier, padded_h_start, - padded_h_stop, padded_w_start, padded_w_stop, - kernel_h, kernel_w, stride_h, stride_w, dilation_h, - dilation_w, 1, height - 1, width - 1, width); + if (valid_h_start > 0) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0, + 
valid_h_start, 0, width); + } + if (valid_h_stop < height) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + std::max(valid_h_start, valid_h_stop), height, 0, width); + } + if (valid_w_start > 0) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + valid_h_start, valid_h_stop, 0, valid_w_start); + } + if (valid_w_stop < width) { + DepthwiseConv2dKernel( + input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, + channels, input_height, input_width, input_channels, multiplier, + padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, + kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, + valid_h_start, valid_h_stop, std::max(valid_w_start, valid_w_stop), + width); + } // Calculate border elements without out-of-boundary checking DepthwiseConv2dNoOOBCheckKernel( input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width, channels, input_height, input_width, input_channels, multiplier, padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h, - kernel_w, stride_h, stride_w, dilation_h, dilation_w, 1, height - 1, 1, - width - 1); + kernel_w, stride_h, stride_w, dilation_h, dilation_w, valid_h_start, + valid_h_stop, valid_w_start, valid_w_stop); output_ptr = output->mutable_data(); DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc similarity index 82% rename from mace/ops/depthwise_conv_2d_benchmark.cc rename to 
mace/ops/depthwise_conv2d_benchmark.cc
index 2971130c3fa9dc92ede22e1fa41dc480d81bbeb8..2f58343ada3665017e197fa171e323348504a706 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -75,25 +75,27 @@ static void DepthwiseConv2d(int iters,
   }
 }
 
-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
-                                   DEVICE)                                  \
-  static void                                                               \
-      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
-          int iters) {                                                      \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::ItemsProcessed(tot);                                     \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
-                                  mace::Padding::P, OC);                    \
-  }                                                                         \
-  BENCHMARK(                                                                \
-      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
+                                   DEVICE)                                  \
+  static void                                                               \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
+          int iters) {                                                      \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
+    mace::testing::ItemsProcessed(tot);                                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
+                                  mace::Padding::P, OC);                    \
+  }                                                                         \
+  BENCHMARK(                                                                \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
 
 #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
 
+BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index 2ec149e6688edbf2d58bdabf4a36f566c1dc5ecf..c5ff2713d73795421e159c2ad9c7f20e9869d8dc 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -280,10 +280,10 @@ void TestNxNS12(const index_t height, const index_t width) {
     ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
   };
 
-  for (int kernel_size : {3}) {
+  for (int kernel_size : {2, 3, 4}) {
     for (int stride : {1, 2}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
-      //func(kernel_size, kernel_size, stride, stride, SAME);
+      func(kernel_size, kernel_size, stride, stride, SAME);
     }
   }
 }